{"id":37086138,"url":"https://github.com/johnbumgarner/newshound","last_synced_at":"2026-01-14T10:36:05.215Z","repository":{"id":45492871,"uuid":"414245280","full_name":"johnbumgarner/newshound","owner":"johnbumgarner","description":"This Python package can be used to systematically extract multiple data elements (e.g., title, keywords, text) from news sources around the world in over 50 languages. ","archived":false,"fork":false,"pushed_at":"2023-03-14T03:59:41.000Z","size":29,"stargazers_count":33,"open_issues_count":1,"forks_count":3,"subscribers_count":14,"default_branch":"master","last_synced_at":"2025-08-24T23:08:41.089Z","etag":null,"topics":["article-extracting","article-extractor","data-extraction","data-mining","data-science","datascience","news","news-aggregator","news-crawler","newspaper-crawler","python-newspaper","python3","text-mining","web-scraping","webscraping"],"latest_commit_sha":null,"homepage":"","language":null,"has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/johnbumgarner.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":"CONTRIBUTING.md","funding":"FUNDING.yml","license":null,"code_of_conduct":"CODE_OF_CONDUCT.md","threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":"SECURITY.md","support":null,"governance":null,"roadmap":null,"authors":null},"funding":{"custom":"https://www.buymeacoffee.com/johnbumgarner"}},"created_at":"2021-10-06T14:28:19.000Z","updated_at":"2025-06-25T02:25:02.000Z","dependencies_parsed_at":"2023-12-31T03:43:55.136Z","dependency_job_id":"bc675556-9c5e-4bbf-816a-70f3f4aca952","html_url":"https://github.com/johnbumgarner/newshound","commit_stats":{"total_commits":8,"total_committers":1,"mean_commits":8.0,"dds":0.0,"last_synced_commit":"9b534d3b6fb2613a18e4fccfee2863aac44b2d74"},"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/johnbumgarner/newshound","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/johnbumgarner%2Fnewshound","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/johnbumgarner%2Fnewshound/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/johnbumgarner%2Fnewshound/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/johnbumgarner%2Fnewshound/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/johnbumgarner","download_url":"https://codeload.github.com/johnbumgarner/newshound/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/johnbumgarner%2Fnewshound/sbom","scorecard":{"id":528252,"data":{"date":"2025-08-11","repo":{"name":"github.com/johnbumgarner/newshound","commit":"9b534d3b6fb2613a18e4fccfee2863aac44b2d74"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":2.2,"checks":[{"name":"Code-Review","score":0,"reason":"Found 0/8 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"SAST","score":0,"reason":"no SAST tool detected","details":["Warn: no pull requests merged into dev branch"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"Security-Policy","score":10,"reason":"security policy file detected","details":["Info: security policy file detected: SECURITY.md:1","Info: Found linked content: SECURITY.md:1","Info: Found disclosure, vulnerability, and/or timelines in security policy: SECURITY.md:1","Info: Found text in security policy: SECURITY.md:1"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":0,"reason":"license file not detected","details":["Warn: project does not have a license file"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"Vulnerabilities","score":0,"reason":"10 existing vulnerabilities detected","details":["Warn: Project is vulnerable to: PYSEC-2024-230 / GHSA-248v-346w-9cwc","Warn: Project is vulnerable to: PYSEC-2022-42986 / GHSA-43fp-rhv2-5gv8","Warn: Project is vulnerable to: PYSEC-2023-135 / GHSA-xqr8-7jwr-rhp7","Warn: Project is vulnerable to: PYSEC-2024-60 / GHSA-jjg7-2v4v-x38h","Warn: Project is vulnerable to: GHSA-55x5-fj6c-h6m8","Warn: Project is vulnerable to: PYSEC-2022-230 / GHSA-wrxv-2j5q-m38w","Warn: Project is vulnerable to: GHSA-fpfv-jqm9-f5jm","Warn: Project is vulnerable to: GHSA-9hjg-9r4m-mvj7","Warn: Project is vulnerable to: GHSA-9wx4-h78v-vm56","Warn: Project is vulnerable to: PYSEC-2023-74 / GHSA-j8r2-6x86-q33q"],"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}}]},"last_synced_at":"2025-08-20T05:01:28.633Z","repository_id":45492871,"created_at":"2025-08-20T05:01:28.633Z","updated_at":"2025-08-20T05:01:28.633Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":28417664,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-01-14T10:25:19.714Z","status":"ssl_error","status_checked_at":"2026-01-14T10:22:49.371Z","response_time":107,"last_error":"SSL_read: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["article-extracting","article-extractor","data-extraction","data-mining","data-science","datascience","news","news-aggregator","news-crawler","newspaper-crawler","python-newspaper","python3","text-mining","web-scraping","webscraping"],"created_at":"2026-01-14T10:36:04.599Z","updated_at":"2026-01-14T10:36:05.208Z","avatar_url":"https://github.com/johnbumgarner.png","language":null,"funding_links":["https://www.buymeacoffee.com/johnbumgarner"],"categories":[],"sub_categories":[],"readme":"# Currently under development.  BETA will be released soon.\n########### ########### ########### ########### ########### \n\n\n# NewsHound\n---\n\n![PyPI](https://img.shields.io/pypi/v/newshound) \u0026nbsp;\n\u003c!-- ![License: MIT](https://img.shields.io/github/license/johnbumgarner/newshound)\u0026nbsp; --\u003e\n![GitHub issues](https://img.shields.io/github/issues/johnbumgarner/newshound)\u0026nbsp;\n![GitHub pull requests](https://img.shields.io/github/issues-pr/johnbumgarner/newshound)\u0026nbsp;\n[![newshound](https://snyk.io/advisor/python/newshound/badge.svg)](https://snyk.io/advisor/python/newshound)\u0026nbsp;\n[![Downloads](https://static.pepy.tech/personalized-badge/newshound?period=total\u0026units=international_system\u0026left_color=grey\u0026right_color=brightgreen\u0026left_text=Total%20Downloads)](https://pepy.tech/project/newshound)\u0026nbsp;\n\n\n## Description\n\n\u003cp align=\"justify\"\u003e \n\t\u003cstrong\u003eNewsHound\u003c/strong\u003e is a \u003ci\u003ePython 3\u003c/i\u003e module that was designed to perform high quality news and article extraction for sources in multiple languages.\n\u003c/p\u003e\n\n\u003cp align=\"justify\"\u003e \n\tFor instance \u003cstrong\u003eNewsHound\u003c/strong\u003e cleanly parses article content from \u003ca href=\"https://www.bbc.com\"\u003ethe BBC\u003c/a\u003e in English, the \u003ca href=\"https://www.bhaskar.com\"\u003eDainik Bhaskar\u003c/a\u003e in Hindi, the \u003ca href=\"https://www.people.com.cn\"\u003ePeople's Daily\u003c/a\u003e in Chinese, the \u003ca href=\"https://www.manoramaonline.com\"\u003eMalayala Manorama\u003c/a\u003e in Malayalam and the \u003ca href=\" www.khaosod.co.th\"\u003eKhaosod\u003c/a\u003e in Thai.\n\u003c/p\u003e\n\n\u003cp align=\"justify\"\u003e \n\tThe builtin extraction architecture is designed to systematically parse specific data elements from the underlying navigation structure of either an online web page or an offline file containing HTML content.  \n\u003c/p\u003e\n\n\u003cp align=\"justify\"\u003e \nThese data elements are:\n\u003c/p\u003e\n\n\u003cul\u003e\n\t\u003cli\u003e Title/Headline\u003c/li\u003e\n\t\u003cli\u003e Description/Summary\u003c/li\u003e\n\t\u003cli\u003e Keywords \u003c/li\u003e\n\t\u003cli\u003e Name(s) of Author(s) \u003c/li\u003e\n\t\u003cli\u003e Main Text/Content \u003c/li\u003e\n\t\u003cli\u003e ISO Language \u003c/li\u003e\n\t\u003cli\u003e Language Name \u003c/li\u003e\n\t\u003cli\u003e Published Date \u003c/li\u003e\n\t\u003cli\u003e Modified Date \u003c/li\u003e\n\t\u003cli\u003e Canonical HREF \u003c/li\u003e\n\t\u003cli\u003e Top Image HREF \u003c/li\u003e\n\u003c/ul\u003e\n\n## Installation\n\n\u003cp align=\"justify\"\u003e \n  \u003cstrong\u003eNewsHound\u003c/strong\u003e requires \u003cstrong\u003ePython \u003e=3.6\u003c/strong\u003e.  This package can be installed using \u003ci\u003epip3\u003c/i\u003e.\n\u003c/p\u003e\n\n```python\npip3 install newshound\n```\n\n## Usage and Documentation\n\n\u003cp align=\"justify\"\u003e\n  For detailed information on \u003cstrong\u003eNewsHound\u003c/strong\u003e please refer to the \u003ca href=\"https://newshound.readthedocs.io/\"\u003edocumentation\u003c/a\u003e.\n\n  - \u003ca href=\"https://newshound.readthedocs.io/dependencies/\"\u003ePackage Dependencies\u003c/a\u003e\n\n\u003c/p\u003e\n\n\n## Predefined Extraction\n\n\u003cp align=\"justify\"\u003e\nThe maintainers of \u003cstrong\u003eNewsHound\u003c/strong\u003e have developed and tested multiple \u003ca href=\"https://github.com/johnbumgarner/newshound/blob/master/predefined_extraction_sources.md\"\u003epredefined extraction modules\u003c/a\u003e for various news sources around the world.  These specific extractors were developed to ensure consistent and accurate parsing from the news sources being queried. Additional sources will be added periodically to this predefined extraction list.  \n\u003c/p\u003e\n\n\n## Development\n\n\u003cp align=\"justify\"\u003e\nIf you would like to contribute to the \u003ci\u003eNewsHound\u003c/i\u003e project please read the \u003ca href=\"https://github.com/johnbumgarner/newshound/blob/master/CONTRIBUTING.md\"\u003econtributing guidelines\u003c/a\u003e.\n   \nItems currently under development:\n   - TDB after BETA release\n\u003c/p\u003e\n\n## Issues\n\n\u003cp align=\"justify\"\u003e\nThis repository is actively maintained.  Feel free to open any issues related to bugs, coding errors, broken links or enhancements. \n\nYou can also contact me at [John Bumgarner](mailto:newshoundproject@gmail.com?subject=[GitHub]%20newshound%20project%20request) with any issues or enhancement requests.\n\u003c/p\u003e\n\n## Sponsorship\n   \nIf you would like to contribute financially to the development and maintenance of the \u003ci\u003eNewsHound\u003c/i\u003e project please read the \u003ca href=\"https://github.com/johnbumgarner/newshound/blob/master/SPONSOR.md\"\u003esponsorship information\u003c/a\u003e.\n\n\u003c!-- ## License\n\n\u003cp align=\"justify\"\u003e\nThe MIT License (MIT).  Please see \u003ca href=\"https://github.com/johnbumgarner/newshound/blob/main/LICENSE\"\u003eLicense File\u003c/a\u003e for more information.\n\u003c/p\u003e --\u003e\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fjohnbumgarner%2Fnewshound","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fjohnbumgarner%2Fnewshound","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fjohnbumgarner%2Fnewshound/lists"}