{"id":21760690,"url":"https://github.com/khuyentran1401/extract-text-from-article","last_synced_at":"2025-04-13T12:50:47.516Z","repository":{"id":106592071,"uuid":"231174296","full_name":"khuyentran1401/Extract-text-from-article","owner":"khuyentran1401","description":null,"archived":false,"fork":false,"pushed_at":"2020-04-06T02:40:27.000Z","size":84,"stargazers_count":6,"open_issues_count":0,"forks_count":0,"subscribers_count":2,"default_branch":"master","last_synced_at":"2025-03-27T03:51:19.744Z","etag":null,"topics":["data-science","natural-language-processing","newspaper3k","nltk","python","text-preprocessing","web-scraping"],"latest_commit_sha":null,"homepage":null,"language":"Jupyter Notebook","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/khuyentran1401.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2020-01-01T04:14:52.000Z","updated_at":"2022-01-24T13:05:17.000Z","dependencies_parsed_at":null,"dependency_job_id":"37e6cdf1-b07c-43b6-940b-5db4174e2143","html_url":"https://github.com/khuyentran1401/Extract-text-from-article","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/khuyentran1401%2FExtract-text-from-article","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/khuyentran1401%2FExtract-text-from-article/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/khuyentran1401%2FExtract-text-from-article/releases","manifests_url":"https://repos.ecosyste.ms/api/v
1/hosts/GitHub/repositories/khuyentran1401%2FExtract-text-from-article/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/khuyentran1401","download_url":"https://codeload.github.com/khuyentran1401/Extract-text-from-article/tar.gz/refs/heads/master","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":248717252,"owners_count":21150388,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["data-science","natural-language-processing","newspaper3k","nltk","python","text-preprocessing","web-scraping"],"created_at":"2024-11-26T11:45:07.589Z","updated_at":"2025-04-13T12:50:47.507Z","avatar_url":"https://github.com/khuyentran1401.png","language":"Jupyter Notebook","funding_links":[],"categories":[],"sub_categories":[],"readme":"# About this project\nThis project extracts the text from an article using Python Article Library and uses NLTK (Natural Language Processing Toolkit) to preprocess the text and extract the most common words in the article\n\n# Tools\n* Newspaper3k: tool to scrape article\n* NLTK: tool to process text\n\n# Steps\n* Scrape articles with newspaper3k\n```javascript\nfrom newspaper import Article\n\nurl = 'https://mystudentvoices.com/it-took-me-2-years-to-get-1000-followers-life-lessons-ive-learned-throughout-the-journey-9bc44f2959f0'\narticle = Article(url)\n\narticle.download()\n```\n* Find the publish date\n```javascript\narticle.publish_date\n```\n* Extract image\n* Find the author\n* Find the keywords\n* Find the summary\n* Preprocessing with NLTK\n  * Tokenize text\n  * Lowercase and remove stopwords\n* 
Visualize the frequency of words with Matplotlib\n![image](https://github.com/khuyentran1401/Extract-text-from-article/blob/master/images/Screenshot%202020-04-05%2021.39.00.png)\n\n\n\n# Tutorial blog\nFind the Medium article for this repository [here](https://medium.com/@khuyentran1476/find-common-words-in-article-with-python-module-newspaper-and-nltk-8c7d6c75733)\n\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fkhuyentran1401%2Fextract-text-from-article","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fkhuyentran1401%2Fextract-text-from-article","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fkhuyentran1401%2Fextract-text-from-article/lists"}