{"id":18631762,"url":"https://github.com/esotericpig/nhkore","last_synced_at":"2026-03-06T02:12:43.488Z","repository":{"id":42129359,"uuid":"242808877","full_name":"esotericpig/nhkore","owner":"esotericpig","description":"🇯🇵📰🗻 NHK News Web (Easy) word frequency (core list) scraper for Japanese language learners.","archived":false,"fork":false,"pushed_at":"2025-09-19T22:53:00.000Z","size":391,"stargazers_count":15,"open_issues_count":0,"forks_count":2,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-12-04T12:30:11.056Z","etag":null,"topics":["cli","japanese","japanese-language","japanese-language-learners","japanese-study","news","nhk","nhk-easy-news","nhk-news-web","nokogiri","scraper","scraping","scraping-websites"],"latest_commit_sha":null,"homepage":"","language":"Ruby","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"lgpl-3.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/esotericpig.png","metadata":{"files":{"readme":"README.md","changelog":"CHANGELOG.md","contributing":null,"funding":null,"license":"LICENSE.txt","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2020-02-24T18:16:55.000Z","updated_at":"2025-10-08T06:38:48.000Z","dependencies_parsed_at":"2025-04-11T14:41:59.764Z","dependency_job_id":"6f25cc44-ab26-4c71-8e94-b57f6a46c426","html_url":"https://github.com/esotericpig/nhkore","commit_stats":{"total_commits":341,"total_committers":1,"mean_commits":341.0,"dds":0.0,"last_synced_commit":"7761b4acc32fad4e5319cd94dad8f99d64ceae19"},"previous_names":[],"tags_count":23,"template":false,"template_full_name":null,"purl":"pkg:github/esotericpig/nhkore","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/esotericpig%2Fnhkore","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/esotericpig%2Fnhkore/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/esotericpig%2Fnhkore/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/esotericpig%2Fnhkore/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/esotericpig","download_url":"https://codeload.github.com/esotericpig/nhkore/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/esotericpig%2Fnhkore/sbom","scorecard":{"id":382788,"data":{"date":"2025-08-11","repo":{"name":"github.com/esotericpig/nhkore","commit":"38ef88853ab0da8e72386a287be057381c43aff9"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3,"checks":[{"name":"Code-Review","score":0,"reason":"Found 0/30 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"SAST","score":0,"reason":"no SAST tool detected","details":["Warn: no pull requests merged into dev branch"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Maintained","score":4,"reason":"5 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 4","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE.txt:0","Info: FSF or OSI recognized license: GNU Lesser General Public License v3.0: LICENSE.txt:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'main'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"Signed-Releases","score":0,"reason":"Project has not signed or included provenance with any releases.","details":["Warn: release artifact v0.3.22 not signed: https://api.github.com/repos/esotericpig/nhkore/releases/215913914","Warn: release artifact v0.3.19 not signed: https://api.github.com/repos/esotericpig/nhkore/releases/215352864","Warn: release artifact v0.3.18 not signed: https://api.github.com/repos/esotericpig/nhkore/releases/214647920","Warn: release artifact v0.3.17 not signed: https://api.github.com/repos/esotericpig/nhkore/releases/173326434","Warn: release artifact v0.3.16 not signed: https://api.github.com/repos/esotericpig/nhkore/releases/170141257","Warn: release artifact v0.3.22 does not have provenance: https://api.github.com/repos/esotericpig/nhkore/releases/215913914","Warn: release artifact v0.3.19 does not have provenance: https://api.github.com/repos/esotericpig/nhkore/releases/215352864","Warn: release artifact v0.3.18 does not have provenance: https://api.github.com/repos/esotericpig/nhkore/releases/214647920","Warn: release artifact v0.3.17 does not have provenance: https://api.github.com/repos/esotericpig/nhkore/releases/173326434","Warn: release artifact v0.3.16 does not have provenance: https://api.github.com/repos/esotericpig/nhkore/releases/170141257"],"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Vulnerabilities","score":9,"reason":"1 existing vulnerabilities detected","details":["Warn: Project is vulnerable to: GHSA-353f-x4gh-cqq8"],"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}}]},"last_synced_at":"2025-08-18T15:52:31.170Z","repository_id":42129359,"created_at":"2025-08-18T15:52:31.170Z","updated_at":"2025-08-18T15:52:31.170Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":30078856,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-03-04T08:01:56.766Z","status":"ssl_error","status_checked_at":"2026-03-04T08:00:42.919Z","response_time":59,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["cli","japanese","japanese-language","japanese-language-learners","japanese-study","news","nhk","nhk-easy-news","nhk-news-web","nokogiri","scraper","scraping","scraping-websites"],"created_at":"2024-11-07T05:08:32.135Z","updated_at":"2026-03-06T02:12:43.477Z","avatar_url":"https://github.com/esotericpig.png","language":"Ruby","funding_links":[],"categories":[],"sub_categories":[],"readme":"# NHKore\n\n[![Gem Version](https://badge.fury.io/rb/nhkore.svg)](https://badge.fury.io/rb/nhkore)\n\n[![Source Code](https://img.shields.io/badge/source-github-%23211F1F.svg)](https://github.com/esotericpig/nhkore)\n[![Changelog](https://img.shields.io/badge/changelog-md-%23A0522D.svg)](CHANGELOG.md)\n[![License](https://img.shields.io/github/license/esotericpig/nhkore.svg)](LICENSE.txt)\n\nA CLI app that scrapes [NHK News Web Easy](https://www3.nhk.or.jp/news/easy/) to create a list of each word and its frequency (how many times it was used) for Japanese language learners.\n\nThis is similar to a [core word/vocabulary list](https://www.fluentin3months.com/core-japanese-words/), hence the name NHKore.\n\n[![asciinema Demo](https://asciinema.org/a/318958.svg)](https://asciinema.org/a/318958)\n\n## Contents\n\n- [For Non-Power Users](#for-non-power-users-)\n- [Installing](#installing-)\n- [Using](#using-)\n    - [The Basics](#the-basics-)\n    - [Unlimited Powah!](#unlimited-powah-)\n        - [Get Command](#get-command-)\n        - [Sift Command](#sift-command-)\n    - [Sakura Fields Forever](#sakura-fields-forever-)\n        - [Search Command](#search-command-)\n        - [News Command](#news-command-)\n- [Using the Library](#using-the-library-)\n- [Hacking](#hacking-)\n    - [Updating](#updating-)\n    - [Releasing](#releasing-)\n- [License](#license-)\n\n## For Non-Power Users [^](#contents)\n\nFor non-Power Users, you are probably just interested in the data.\n\n[Click here](https://esotericpig.github.io/showcase/nhkore-ez.html) for a big HTML file of the final result from all of the current articles scraped.\n\n[Click here](https://github.com/esotericpig/nhkore/releases/latest) to go to the latest release and download `nhkore-core.zip` from the `Assets`. It contains all of the links scraped, all of the data scraped per article, and a final CSV file.\n\nIf you'd like to try using the app, please download and install [Ruby](https://www.ruby-lang.org/en/downloads/) and then follow the instructions below. You'll need to be able to use the command line.\n\n## Installing [^](#contents)\n\nPick your poison...\n\nWith the RubyGems package manager:\n\n`$ gem install nhkore`\n\nManually:\n\n```\n$ git clone 'https://github.com/esotericpig/nhkore.git'\n$ cd nhkore\n$ bundle install\n$ bundle exec rake install:local\n```\n\nIf there are errors running `nhkore`, you may need to also [install Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) manually, which is used for scraping HTML.\n\n## Using [^](#contents)\n\n### The Basics [^](#contents)\n\nThe most useful thing to do is to simply scrape one article and then study the most frequent words before reading that article.\n\nFirst, scrape the article:\n\n`$ nhkore news easy -u 'https://www3.nhk.or.jp/news/easy/k10011862381000/k10011862381000.html'`\n\nIf your internet is slow, there are several global options to help alleviate your internet woes, which can be used with any sub command:\n\n```\n-m --max-retry=\u003cvalue\u003e       maximum number of times to retry URLs\n                             (-1 or integer \u003e= 0) (default: 3)\n-o --open-timeout=\u003cvalue\u003e    seconds for URL open timeouts\n                             (-1 or decimal \u003e= 0)\n-r --read-timeout=\u003cvalue\u003e    seconds for URL read timeouts\n                             (-1 or decimal \u003e= 0)\n-t --timeout=\u003cvalue\u003e         seconds for all URL timeouts: [open, read]\n                             (-1 or decimal \u003e= 0)\n```\n\nExample usage:\n\n`$ nhkore -t 300 -m 10 news easy -u 'https://www3.nhk.or.jp/news/easy/k10011862381000/k10011862381000.html'`\n\nSome older articles will fail to scrape and need additional options (this is very rare):\n\n```\n-D --no-dict             do not try to parse the dictionary files\n                         for the articles; useful in case of errors\n                         trying to load the dictionaries (or for offline testing)\n-L --lenient             leniently (not strict) scrape articles:\n                           body \u0026 title content without the proper\n                           HTML/CSS classes/IDs and no futsuurl;\n                         example URLs:\n                         - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html\n                         - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html\n-M --missingno           very rarely an article will not have kana or kanji\n                         for a Ruby tag; to not raise an error, this will\n                         use previously scraped data to fill it in;\n                         example URL:\n                         - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html\n-d --datetime=\u003cvalue\u003e    date time to use as a fallback in cases\n                         when an article doesn't have one;\n                         format: YYYY-mm-dd H:M; example: 2020-03-30 15:30\n```\n\nExample usage:\n\n`$ nhkore -t 300 -m 10 news -D -L -M -d '2011-03-07 06:30' easy -u 'https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html'`\n\nNow that the data from the article has been scraped, you can generate a CSV/HTML/JSON/YAML file of the words ordered by frequency:\n\n```\n$ nhkore sift easy -e csv\n$ nhkore sift easy -e html\n$ nhkore sift easy -e json\n$ nhkore sift easy -e yml\n```\n\nComplete demo:\n\n[![asciinema Demo - The Basics](https://asciinema.org/a/318958.svg)](https://asciinema.org/a/318958)\n\n### Unlimited Powah! [^](#contents)\n\nGenerate a core word list (e.g., CSV file) for 1 or more pre-scraped articles with ease.\n\nUnlimited powah at your finger tips!\n\n#### Get Command [^](#contents)\n\nThe `get` command will download and extract `nhkore-core.zip` from the [latest release](https://github.com/esotericpig/nhkore/releases/latest) for you.\n\nThis already has tons of articles scraped so that you don't have to re-scrape them. Then, for example, you can easily create a CSV file from all of `2019` or all of `December 2019`.\n\nExample usage:\n\n`$ nhkore get`\n\nBy default, it will extract the data to `./core/`. You can change this:\n\n`$ nhkore get -o 'my dir/'`\n\nComplete demo:\n\n[![asciinema Demo - Get](https://asciinema.org/a/318967.svg)](https://asciinema.org/a/318967)\n\n#### Sift Command [^](#contents)\n\nAfter obtaining the scraped data, you can `sift` all of the data (or select data) into one of these file formats:\n\n| Format | Typical Purpose |\n| --- | --- |\n| CSV | For uploading to a flashcard website (e.g., Memrise, Anki, Buffl) after changing the data appropriately. |\n| HTML | For comfortable viewing in a web browser or for sharing. |\n| YAML/JSON | For developers to automatically add translations or to manipulate the data in some other way programmatically. |\n\nThe data is sorted by frequency in descending order (i.e., most frequent words first).\n\nIf you wish to sort/arrange the data in some other way, CSV editors (e.g., LibreOffice, WPS Office, Microsoft Office) can do this easily and efficiently, or if you are code-savvy, you can programmatically manipulate the CSV/YAML/JSON/HTML file.\n\nThe defaults will sift all of the data into a CSV file, which may not be what you want:\n\n`$ nhkore sift easy`\n\nYou can filter the data by using different options:\n\n```\n-d --datetime=\u003cvalue\u003e    date time to filter on; examples:\n                         - '2020-7-1 13:10...2020-7-31 11:11'\n                         - '2020-12'   (2020, December 1st-31st)\n                         - '7-4...7-9' (July 4th-9th of Current Year)\n                         - '7-9'       (July 9th of Current Year)\n                         - '9'         (9th of Current Year \u0026 Month)\n-t --title=\u003cvalue\u003e       title to filter on, where search text only\n                         needs to be somewhere in the title\n-u --url=\u003cvalue\u003e         URL to filter on, where search text only\n                         needs to be somewhere in the URL\n```\n\nFilter examples:\n\n```\n# Filter by URL.\n$ nhkore sift easy -u 'k10011862381000'\n\n# Filter by title.\n$ nhkore sift easy -t 'マリオ'\n$ nhkore sift easy -t '植えられた桜'\n\n# Filter by date time.\n$ nhkore sift easy -d 2019\n$ nhkore sift easy -d '2019-12'\n$ nhkore sift easy -d '2019-7-4...9' # July 4th to 9th of 2019\n$ nhkore sift easy -d '2019-12-25 13:10'\n\n# Filter by date time \u0026 title.\n$ nhkore sift easy -d '2019-3-29' -t '桜'\n```\n\nYou can save the data to a different format using one of these options:\n\n```\n-e --ext=\u003cvalue\u003e    type of file (extension) to save;\n                    valid options: [csv, htm, html, json, yaml, yml];\n                    not needed if you specify a file extension with\n                    the '--out' option: '--out sift.html'\n                    (default: csv)\n-o --out=\u003cvalue\u003e    'directory/file' to save sifted data to;\n                    if you only specify a directory or a file, it will\n                    attach the appropriate default directory/file name\n                    (defaults:\n                     core/sift_nhk_news_web_easy{search.criteria}{file.ext},\n                     core/sift_nhk_news_web_regular{search.criteria}{file.ext})\n```\n\nFormat examples:\n\n```\n$ nhkore sift easy -e html\n$ nhkore sift easy -e yml\n$ nhkore sift easy -o 'mario.html'\n$ nhkore sift easy -o 'sakura.yml'\n```\n\nLastly, you can ignore certain columns from the output. Definitions can be quite long, and English translations are currently always blank (meant to be filled in manually/programmatically).\n\n```\n-D --no-defn    do not output the definitions for words\n                (which can be quite long)\n-E --no-eng     do not output the English translations for words\n```\n\nComplete demo:\n\n[![asciinema Demo - Sift](https://asciinema.org/a/318982.svg)](https://asciinema.org/a/318982)\n\n### Sakura Fields Forever [^](#contents)\n\nNo more waiting on a new release with pre-scraped files.\n\nScrape all of the latest articles for yourself, forever!\n\n#### Search Command [^](#contents)\n\nThe [news](#news-command-) command (for scraping articles) relies on having a file of article links.\n\nCurrently, the NHK website doesn't provide an historical record of all of its articles, and it's up to the user to find them.\n\nThe format of the file is simple, so you can edit it by hand (or programmatically) very easily:\n\n```YAML\n# core/links_nhk_news_web_easy.yml\n---\nlinks:\n  https://www3.nhk.or.jp/news/easy/k10012323711000/k10012323711000.html:\n    url: https://www3.nhk.or.jp/news/easy/k10012323711000/k10012323711000.html\n    scraped: false\n  https://www3.nhk.or.jp/news/easy/k10012321401000/k10012321401000.html:\n    url: https://www3.nhk.or.jp/news/easy/k10012321401000/k10012321401000.html\n    scraped: false\n```\n\nOnly the key (which is the URL) and the `url` field are required. The rest of the fields will be populated when you scrape the data.\n\n\u003e \u0026lt;rambling\u0026gt;  \n\u003e Originally, I was planning on using a different key so that's why the URL is duplicated. This also allows for a possible future breaking version (major version change) to alter the key. In addition, I was originally planning to allow filtering in this file, so that's why additional fields are populated after scraping the data.  \n\u003e \u0026lt;/rambling\u0026gt;  \n\nExample after running the `news` command:\n\n```YAML\n# core/links_nhk_news_web_easy.yml\n# - After being scraped\n---\nlinks:\n  https://www3.nhk.or.jp/news/easy/k10012323711000/k10012323711000.html:\n    url: https://www3.nhk.or.jp/news/easy/k10012323711000/k10012323711000.html\n    scraped: true\n    datetime: '2020-03-11T16:00:00+09:00'\n    title: 安倍総理大臣「今月２０日ごろまで大きなイベントをしないで」\n    futsuurl: https://www3.nhk.or.jp/news/html/20200310/k10012323711000.html\n    sha256: d1186ebbc2013564e52f21a2e8ecd56144ed5fe98c365f6edbd4eefb2db345eb\n  https://www3.nhk.or.jp/news/easy/k10012321401000/k10012321401000.html:\n    url: https://www3.nhk.or.jp/news/easy/k10012321401000/k10012321401000.html\n    scraped: true\n    datetime: '2020-03-11T11:30:00+09:00'\n    title: 島根県の会社　中国から技能実習生が来なくて困っている\n    futsuurl: https://www3.nhk.or.jp/news/html/20200309/k10012321401000.html\n    sha256: 2df91884fbbafdc69bc3126cb0cb7b63b2c24e85bc0de707643919e4581927a9\n```\n\nIf you don't wish to edit this file by hand (or programmatically), that's where the `search` command comes into play.\n\nCurrently, it only searches \u0026 scrapes `bing.com`, but other search engines and/or methods can easily be added in the future.\n\nExample usage:\n\n`$ nhkore search easy bing`\n\nThere are a few notable options:\n\n```\n-r --results=\u003cvalue\u003e    number of results per page to request from search\n                        (default: 100)\n   --show-count         show the number of links scraped and exit;\n                        useful for manually writing/updating scripts\n                        (but not for use in a variable);\n                        implies '--dry-run' option\n   --show-urls          show the URLs -- if any -- used when searching \u0026\n                        scraping and exit; you can download these for offline\n                        testing and/or slow internet (see '--in' option)\n```\n\nComplete demo:\n\n[![asciinema Demo - Search](https://asciinema.org/a/320457.svg)](https://asciinema.org/a/320457)\n\n#### News Command [^](#contents)\n\nIn [The Basics](#the-basics-), you learned how to scrape 1 article using the `-u/--url` option with the `news` command.\n\nAfter creating a file of links from the [search](#search-command-) command (or manually/programmatically), you can also scrape multiple articles from this file using the `news` command.\n\nThe defaults will scrape the 1st unscraped article from the `links` file:\n\n`$ nhkore news easy`\n\nYou can scrape the 1st **X** unscraped articles with the `-s/--scrape` option:\n\n```\n# Scrape the 1st 11 unscraped articles.\n$ nhkore news -s 11 easy\n```\n\nYou may wish to re-scrape articles that have already been scraped with the `-r/--redo` option:\n\n`$ nhkore news -r -s 11 easy`\n\nIf you only wish to scrape specific article links, then you should use the `-k/--like` option, which does a fuzzy search on the URLs. For example, `--like '00123'` will match these links:\n\n- http\u003cspan\u003es://w\u003c/span\u003eww3.nhk.or.jp/news/easy/k1**00123**23711000/k10012323711000.html\n- http\u003cspan\u003es://w\u003c/span\u003eww3.nhk.or.jp/news/easy/k1**00123**21401000/k10012321401000.html\n- http\u003cspan\u003es://w\u003c/span\u003eww3.nhk.or.jp/news/easy/k1**00123**21511000/k10012321511000.html\n- ...\n\n`$ nhkore news -k '00123' -s 11 easy`\n\nLastly, you can show the dictionary URL and contents for the 1st article if you're getting dictionary-related errors:\n\n```\n# This will exit after showing the 1st article's dictionary.\n$ nhkore news easy --show-dict\n```\n\nFor the rest of the options, please see [The Basics](#the-basics-).\n\nComplete demo:\n\n[![asciinema Demo - News](https://asciinema.org/a/322324.svg)](https://asciinema.org/a/322324)\n\nWhen I first scraped all of the articles in [nhkore-core.zip](https://github.com/esotericpig/nhkore/releases/latest), I had to use this [script](samples/looper.rb) because my internet isn't very good.\n\n## Using the Library [^](#contents)\n\n### Setup\n\nPick your poison...\n\nIn your *Gemspec* (*\u0026lt;project\u0026gt;.gemspec*):\n\n```Ruby\nspec.add_runtime_dependency 'nhkore', '~\u003e X.X'\n```\n\nIn your *Gemfile*:\n\n```Ruby\n# Pick one...\ngem 'nhkore', '~\u003e X.X'\ngem 'nhkore', :git =\u003e 'https://github.com/esotericpig/nhkore.git', :tag =\u003e 'vX.X.X'\n```\n\n### Require\n\nIn order to not require all of the CLI-related files, require this file instead:\n\n```Ruby\nrequire 'nhkore/lib'\n\n#require 'nhkore' # Slower\n```\n\n### Scraper\n\nAll scraper classes extend this class. You can either extend it or use it by itself. It's a simple wrapper around *open-uri*, *Nokogiri*, etc.\n\n`initialize` automatically opens (connects to) the URL.\n\n```Ruby\nrequire 'nhkore/scraper'\n\nclass MyScraper \u003c NHKore::Scraper\n  def initialize()\n    super('https://www3.nhk.or.jp/news/easy/')\n  end\nend\n\nm = MyScraper.new()\ns = NHKore::Scraper.new('https://www3.nhk.or.jp/news/easy/')\n\n# Read all content into a String.\nmstr = m.read()\nsstr = s.read()\n\n# Get a Nokogiri::HTML object.\nmdoc = m.html_doc()\nsdoc = s.html_doc()\n\n# Get a RSS object.\ns = NHKore::Scraper.new('https://www.bing.com/search?format=rss\u0026q=site%3Anhk.or.jp%2Fnews%2Feasy%2F\u0026count=100')\n\nrss = s.rss_doc()\n```\n\nThere are several useful options:\n\n```Ruby\nrequire 'nhkore/scraper'\n\ns = NHKore::Scraper.new('https://www3.nhk.or.jp/news/easy/',\n  open_timeout: 300, # Open timeout in seconds (default: nil)\n  read_timeout: 300, # Read timeout in seconds (default: nil)\n\n  # Maximum number of times to retry the URL\n  # - default: 3\n  # - Open/connect will fail a couple of times on a bad/slow internet connection.\n  max_retries: 10,\n\n  # Maximum number of redirects allowed.\n  # - default: 3\n  # - You can set this to nil or -1, but I recommend using a number\n  #   for safety (infinite-loop attack).\n  max_redirects: 1,\n\n  # How to check redirect URLs for safety.\n  # - default: :strict\n  # - nil      =\u003e do not check\n  # - :lenient =\u003e check the scheme only\n  #               (i.e., if https, redirect URL must be https)\n  # - :strict  =\u003e check the scheme and domain\n  #               (i.e., if https://bing.com, redirect URL must be https://bing.com)\n  redirect_rule: :lenient,\n\n  # Set the HTTP header field 'cookie' from the 'set-cookie' response.\n  # - default: false\n  # - Currently uses the 'http-cookie' Gem.\n  # - This is currently a time-consuming operation because it opens the URL twice.\n  # - Necessary for Search Engines or other sites that require cookies\n  #   in order to block bots.\n  eat_cookie: true,\n\n  # Set HTTP header fields.\n  # - default: nil\n  # - Necessary for Search Engines or other sites that try to block bots.\n  # - Simply pass in a Hash (not nil) to set the default ones.\n  header: {'user-agent' =\u003e 'Skynet'}, # Must use strings\n)\n\n# Open the URL yourself. This will be passed in directly to Nokogiri::HTML().\n# - In this way, you can use Faraday, HTTParty, RestClient, httprb/http, or\n#   some other Gem.\ns = NHKore::Scraper.new('https://www3.nhk.or.jp/news/easy/',\n  str_or_io: URI.open('https://www3.nhk.or.jp/news/easy/',redirect: false)\n)\n\n# Open and parse a file instead of a URL (for offline testing or slow internet).\ns = NHKore::Scraper.new('./my_article.html',is_file: true)\n\ndoc = s.html_doc()\n```\n\nHere are some other useful methods:\n\n```Ruby\nrequire 'nhkore/scraper'\n\ns = NHKore::Scraper.new('https://www3.nhk.or.jp/news/easy/')\n\ns.reopen() # Re-open the current URL.\n\n# Get a relative URL.\nurl = s.join_url('../../monkey.html')\nputs url # https://www3.nhk.or.jp/monkey.html\n\n# Open a new URL or file.\ns.open(url)\ns.open(url,URI.open(url,redirect: false))\n\ns.open('./my_article.html',is_file: true)\n\n# Open a file manually.\ns.open_file('./my_article.html')\n\n# Fetch the cookie \u0026 open a new URL manually.\ns.fetch_cookie(url)\ns.open_url(url)\n```\n\n### SearchScraper \u0026 BingScraper\n\n`SearchScraper` is used for scraping Search Engines for NHK News Web (Easy) links. It can also be used for search in general.\n\nBy default, it sets the default HTTP header fields and fetches \u0026 sets the cookie.\n\n```Ruby\nrequire 'nhkore/search_scraper'\n\nss = NHKore::SearchScraper.new('https://www.bing.com/search?q=nhk\u0026count=100')\n\ndoc = ss.html_doc()\n\ndoc.css('a').each() do |anchor|\n  link = anchor['href']\n\n  next if ss.ignore_link?(link,cleaned: false)\n\n  if link.include?('https://www3.nhk')\n    puts link\n  end\nend\n```\n\n`BingScraper` will search `bing.com` for you.\n\n```Ruby\nrequire 'nhkore/search_link'\nrequire 'nhkore/search_scraper'\n\nbs     = NHKore::BingScraper.new(:yasashii)\nslinks = NHKore::SearchLinks.new()\n\nnext_page = bs.scrape(slinks)\npage_num  = 1\n\nwhile !next_page.empty?()\n  puts \"Page #{page_num += 1}: #{next_page.count}\"\n\n  bs = NHKore::BingScraper.new(:yasashii,url: next_page.url)\n\n  next_page = bs.scrape(slinks,next_page)\nend\n\nslinks.links.values.each() do |link|\n  puts link.url\nend\n```\n\n### ArticleScraper \u0026 DictScraper\n\n`ArticleScraper` scrapes an NHK News Web Easy article. Regular articles aren't currently supported.\n\n```Ruby\nrequire 'nhkore/article_scraper'\nrequire 'time'\n\nas = NHKore::ArticleScraper.new(\n  'https://www3.nhk.or.jp/news/easy/k10011862381000/k10011862381000.html',\n\n  # If false, scrape the article leniently (for older articles which\n  # may not have certain tags, etc.).\n  # - default: true\n  strict: false,\n\n  # {Dict} to use as the dictionary for words (Easy articles).\n  # - default: :scrape\n  # - nil     =\u003e don't scrape/use it (necessary for Regular articles)\n  # - :scrape =\u003e auto-scrape it using {DictScraper}\n  # - {Dict}  =\u003e your own {Dict}\n  dict: nil,\n\n  # Date time to use as a fallback if the article doesn't have one\n  # (for older articles).\n  # - default: nil\n  datetime: Time.new(2020,2,2),\n\n  # Year to use as a fallback if the article doesn't have one\n  # (for older articles).\n  # - default: nil\n  year: 2020,\n)\n\narticle = as.scrape()\n\narticle.datetime\narticle.futsuurl\narticle.sha256\narticle.title\narticle.url\n\narticle.words.each() do |key,word|\n  word.defn\n  word.eng\n  word.freq\n  word.kana\n  word.kanji\n  word.key\nend\n\nputs article.to_s(mini: true)\nputs '---'\nputs article\n```\n\n`DictScraper` scrapes an Easy article's dictionary file (JSON).\n\n```Ruby\nrequire 'nhkore/dict_scraper'\n\nurl = 'https://www3.nhk.or.jp/news/easy/k10011862381000/k10011862381000.html'\nds  = NHKore::DictScraper.new(\n  url,\n\n  # Change the URL appropriately to the dictionary URL.\n  # - default: true\n  parse_url: true,\n)\n\nputs NHKore::DictScraper.parse_url(url)\nputs\n\ndict = ds.scrape()\n\ndict.entries.each() do |key,entry|\n  entry.id\n\n  entry.defns.each() do |defn|\n    defn.hyoukis.each() {|hyouki| }\n    defn.text\n    defn.words.each() {|word| }\n  end\n\n  puts entry.build_hyouki()\n  puts entry.build_defn()\n  puts '---'\nend\n\nputs\nputs dict\n```\n\n### Fileable\n\nAny class that includes the `Fileable` mixin will have the following methods:\n\n- Class.load_file(file,mode: 'rt:BOM|UTF-8',**kargs)\n- save_file(file,mode: 'wt',**kargs)\n\nAny *kargs* will be passed to `File.open()`.\n\n```Ruby\nrequire 'nhkore/news'\nrequire 'nhkore/search_link'\n\nyn = NHKore::YasashiiNews.load_file()\nsl = NHKore::SearchLinks.load_file(NHKore::SearchLinks::DEFAULT_YASASHII_FILE)\n\nyn.articles.each() {|key,article| }\nyn.sha256s.each()  {|sha256,url|  }\n\nsl.links.each() do |key,link|\n  link.datetime\n  link.futsuurl\n  link.scraped?\n  link.sha256\n  link.title\n  link.url\nend\n\n#yn.save_file()\n#sl.save_file(NHKore::SearchLinks::DEFAULT_YASASHII_FILE)\n```\n\n### Sifter\n\n`Sifter` will sift \u0026 sort the `News` data into a single file. The data is sorted by frequency in descending order (i.e., most frequent words first).\n\n```Ruby\nrequire 'nhkore/datetime_parser'\nrequire 'nhkore/news'\nrequire 'nhkore/sifter'\nrequire 'time'\n\nnews = NHKore::YasashiiNews.load_file()\n\nsifter = NHKore::Sifter.new(news)\n\nsifter.caption = 'Sakura Fields Forever!'\n\n# Filter the data.\nsifter.filter_by_datetime(NHKore::DatetimeParser.parse_range('2019-12-4...7'))\nsifter.filter_by_datetime([Time.new(2019,12,4),Time.new(2019,12,7)])\nsifter.filter_by_datetime(\n  from: Time.new(2019,12,4),to: Time.new(2019,12,7)\n)\nsifter.filter_by_title('桜')\nsifter.filter_by_url('k100')\n\n# Ignore certain columns from the output.\nsifter.ignore(:defn)\nsifter.ignore(:eng)\n\n# An array of the sifted words.\nwords = sifter.sift() # Filtered \u0026 Sorted array of Word\nrows  = sifter.build_rows(words) # Ignored array of array\n\n# Choose the file format.\n#sifter.put_csv!()\n#sifter.put_html!()\n#sifter.put_json!()\nsifter.put_yaml!()\n\n# Save to a file.\nfile = 'sakura.yml'\n\nif !File.exist?(file)\n  sifter.save_file(file)\nend\n```\n\n### Util \u0026 DatetimeParser\n\nThese provide a variety of useful methods/constants.\n\nHere are some of the most useful ones:\n\n```Ruby\nrequire 'nhkore/datetime_parser'\nrequire 'nhkore/util'\n\ninclude NHKore\n\nputs '======='\nputs '[ Net ]'\nputs '======='\nuri = URI('https://www.bing.com/search?q=nhk')\nUtil.replace_uri_query!(uri,q: 'banana')\n\nputs \"URI query:   #{uri}\" # https://www.bing.com/search?q=banana\n# nhk.or.jp\nputs \"Domain:      #{Util.domain(URI('https://www.nhk.or.jp/news/easy').host)}\"\n# Ben \u0026amp; Jerry\u0026#39;s\u003cbr\u003e\nputs \"Escape HTML: #{Util.escape_html(\"Ben \u0026 Jerry's\\n\")}\"\nputs\n\nputs '========'\nputs '[ Time ]'\nputs '========'\nputs \"JST now:   #{Util.jst_now()}\"\n# Drops in JST_OFFSET, does not change hour/min.\nputs \"JST time:  #{Util.jst_time(Time.now)}\"\nputs \"JST year:  #{Util::JST_YEAR}\"\nputs \"1999 sane? #{Util.sane_year?(1999)}\" # true\nputs \"1776 sane? #{Util.sane_year?(1776)}\" # false\nputs \"Guess 5:   #{DatetimeParser.guess_year(5)}\"  # 2005\nputs \"Guess 99:  #{DatetimeParser.guess_year(99)}\" # 1999\n# =\u003e [2020-12-01 00:00:00 +0900, 2020-12-31 23:59:59 +0900]\nputs \"Parse:     #{DatetimeParser.parse_range('2020-12')}\"\nputs\nputs \"JST timezone offset:        #{Util::JST_OFFSET}\"\nputs \"JST timezone offset hour:   #{Util::JST_OFFSET_HOUR}\"\nputs \"JST timezone offset minute: #{Util::JST_OFFSET_MIN}\"\nputs\n\nputs '============'\nputs '[ Japanese ]'\nputs '============'\n\nJPN = ['桜','ぶ','ブ']\n\ndef fmt_jpn()\n  fmt = []\n\n  JPN.each() do |x|\n    x = yield(x)\n    x = x ? \"\\u2B55\" : Util::JPN_SPACE unless x.is_a?(String)\n    fmt \u003c\u003c x\n  end\n\n  return \"[ #{fmt.join(' | ')} ]\"\nend\n\nputs \"          #{fmt_jpn{|x| x}}\"\nputs \"Hiragana? #{fmt_jpn{|x| Util.hiragana?(x)}}\"\nputs \"Kana?     #{fmt_jpn{|x| Util.kana?(x)}}\"\nputs \"Kanji?    #{fmt_jpn{|x| Util.kanji?(x)}}\"\nputs \"Reduce:   #{Util.reduce_jpn_space(\"'     '\")}\"\nputs\n\nputs '========='\nputs '[ Files ]'\nputs '========='\nputs \"Dir str?   #{Util.dir_str?('dir/')}\"          # true\nputs \"Dir str?   #{Util.dir_str?('dir')}\"           # false\nputs \"File str?  #{Util.filename_str?('file')}\"     # true\nputs \"File str?  #{Util.filename_str?('dir/file')}\" # false\n```\n\n## Hacking [^](#contents)\n\n```\n$ git clone 'https://github.com/esotericpig/nhkore.git'\n$ cd nhkore\n$ bundle install\n$ bundle exec rake -T\n```\n\nInstall Nokogiri:\n\n```\n$ bundle exec rake nokogiri_apt   # Ubuntu/Debian\n$ bundle exec rake nokogiri_dnf   # Fedora/CentOS/Red Hat\n$ bundle exec rake nokogiri_other # macOS, Windows, etc.\n```\n\n### Running\n\n`$ ruby -w lib/nhkore.rb`\n\n### Testing\n\n`$ bundle exec rake test`\n\n### Generating Doc\n\n`$ bundle exec rake doc`\n\n### Installing Locally\n\nYou can make some changes/fixes to the code and then install your local version:\n\n`$ bundle exec rake install:local`\n\n### Updating [^](#contents)\n\nThis will update *core/* for you:\n\n`$ bundle exec rake update_core`\n\n### Releasing [^](#contents)\n\n1. Update *CHANGELOG.md*, *version.rb*, \u0026 *Gemfile.lock*:\n    - With *Raketary*:\n        - `$ raketary bump -v`\n        - `$ raketary bump -p`\n    - `$ bundle update`\n    - `$ bundle outdated`\n2. Update packages:\n    - `$ bundle exec rake update_core`\n    - `$ bundle exec rake clobber build pkg_core`\n3. Commit \u0026 Push.\n4. Create a new tag \u0026 release:\n    - Note: make sure to add *pkg/nhkore-core.zip*\n    - `$ gh release create v0 pkg/*.gem pkg/*.zip`\n    - `$ git pull \u0026\u0026 git fetch`\n5. Release to *RubyGems*:\n    - `$ bundle exec rake release`\n\nReleasing new HTML file for website:\n\n1. `$ bundle exec rake update_showcase`\n\n## License [^](#contents)\n\n[GNU LGPL v3+](LICENSE.txt)\n\n\u003e NHKore (\u003chttps://github.com/esotericpig/nhkore\u003e)  \n\u003e Copyright (c) 2020-2025 Bradley Whited  \n\u003e \n\u003e NHKore is free software: you can redistribute it and/or modify  \n\u003e it under the terms of the GNU Lesser General Public License as published by  \n\u003e the Free Software Foundation, either version 3 of the License, or  \n\u003e (at your option) any later version.  \n\u003e \n\u003e NHKore is distributed in the hope that it will be useful,  \n\u003e but WITHOUT ANY WARRANTY; without even the implied warranty of  \n\u003e MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  \n\u003e GNU Lesser General Public License for more details.  \n\u003e \n\u003e You should have received a copy of the GNU Lesser General Public License  \n\u003e along with NHKore.  If not, see \u003chttps://www.gnu.org/licenses/\u003e.  \n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fesotericpig%2Fnhkore","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fesotericpig%2Fnhkore","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fesotericpig%2Fnhkore/lists"}