{"id":19084330,"url":"https://github.com/coghost/iparse","last_synced_at":"2025-10-12T00:13:34.603Z","repository":{"id":55641268,"uuid":"228325148","full_name":"coghost/iparse","owner":"coghost","description":"To extract HTML/json content identified by CSS selectors(with bs4) with yaml config support","archived":false,"fork":false,"pushed_at":"2021-01-28T08:01:17.000Z","size":80,"stargazers_count":4,"open_issues_count":0,"forks_count":1,"subscribers_count":1,"default_branch":"master","last_synced_at":"2025-09-15T06:44:49.204Z","etag":null,"topics":["crawler","parser","parser-library","python","xkcd","yaml"],"latest_commit_sha":null,"homepage":"","language":"HTML","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"gpl-3.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/coghost.png","metadata":{"files":{"readme":"README.md","changelog":"CHANGELOG.md","contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2019-12-16T07:11:49.000Z","updated_at":"2023-03-28T05:13:27.000Z","dependencies_parsed_at":"2022-08-15T05:20:14.609Z","dependency_job_id":null,"html_url":"https://github.com/coghost/iparse","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/coghost/iparse","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coghost%2Fiparse","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coghost%2Fiparse/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coghost%2Fiparse/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coghost%2Fiparse/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/coghost","download_url":"https://codeload.github.com/coghost/iparse/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coghost%2Fiparse/sbom","scorecard":{"id":298693,"data":{"date":"2025-08-11","repo":{"name":"github.com/coghost/iparse","commit":"a1de07a977dfba43a9262b630081fe785dab03b4"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3,"checks":[{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Code-Review","score":0,"reason":"Found 0/12 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: GNU General Public License v3.0: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 5 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}}]},"last_synced_at":"2025-08-17T20:10:11.811Z","repository_id":55641268,"created_at":"2025-08-17T20:10:11.812Z","updated_at":"2025-08-17T20:10:11.812Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":279009475,"owners_count":26084609,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-10-11T02:00:06.511Z","response_time":55,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["crawler","parser","parser-library","python","xkcd","yaml"],"created_at":"2024-11-09T02:50:54.638Z","updated_at":"2025-10-12T00:13:34.588Z","avatar_url":"https://github.com/coghost.png","language":"HTML","readme":"## iparse\n\n**iparse** is a Python package for parsing HTML to structured data in an easy way with as little code as possible.\n\nIt aims to make the process of parsing HTML quick and easy!\n\niparse highlights:\n\n- mainly code with **YAML**\n- only refine raw HTML info with **fewest python code**\n- lot's HTML layout changes, only **YAML** will be involved\n\n### Installation\n\n```sh\npip install iparse\n```\n\n### A Simple Example\n\nfor HTML page: i.e. [lovely xkcd python](https://xkcd.com/353/)\n\nto get the structured data all you need are\n\n- create a class inherit from `IParser`\n- write a YAML config file represents all locators\n\n#### create **xkcd_353.py**\n\n`xkcd_353.py` will go through the startup_dir, look for a file named as the snake_case of the ClassName without `suffix:Parser`, so `XkcdParser` will be `xkcd.yaml`\n\n```python\nfrom pathlib import Path\nfrom iparse._parse import IParser, RsvWords\n\nHOME_DIR = Path(__file__).parents[0]\n\n\nclass XkcdParser(IParser):\n    def __init__(self, file_name, is_test_mode=False, **kwargs):\n        kwargs['startup_dir'] = kwargs.get('startup_dir', HOME_DIR)\n        super().__init__(file_name, is_test_mode=is_test_mode, **kwargs)\n\n\nif __name__ == \"__main__\":\n    xkcd = XkcdParser(file_name=HOME_DIR / 'xkcd_python_353.htm')\n    xkcd.do_parse()\n    print(xkcd.data)\n```\n\n#### create a file named **xkcd.yaml**\n\n\u003e you can use any locator that is supported, but [css selector](http://www.java2s.com/Tutorials/HTML_CSS/CSS_Selector/index.htm) is recommended\n\n```yaml\npage:\n  # css_selector of title: head\u003etitle\n  title: head\u003etitle\n  # css_selector: div#footnote\n  footnote: div#footnote\n  # css_selector: div#licenseText\n  license: div#licenseText\n```\n\n#### the output parsed data\n\nthe parsed data `xkcd.data` is dict, but you can also use it with `xkcd.data_as_yaml/xkcd.data_as_json`\n\n\u003e yaml output\n\n```yaml\npage:\n  footnote: \"xkcd.com is best viewed with Netscape Navigator 4.0 or below on a Pentium\\\n    \\ 3\\xB11 emulated in Javascript on an Apple IIGSat a screen resolution of 1024x1.\\\n    \\ Please enable your ad blockers, disable high-heat drying, and remove your devicefrom\\\n    \\ Airplane Mode and set it to Boat Mode. For security reasons, please leave caps\\\n    \\ lock on while browsing.\"\n  license: '\n\n\n    This work is licensed under a\n\n    Creative Commons Attribution-NonCommercial 2.5 License.\n\n\n    This means you''re free to copy and share these comics (but not to sell them).\n    More details.\n\n    '\n  title: 'xkcd: Python'\n```\n\n\u003e json output\n\n```json\n{\n  \"page\": {\n    \"footnote\": \"xkcd.com is best viewed with Netscape Navigator 4.0 or below on a Pentium 3\\u00b11 emulated in Javascript on an Apple IIGSat a screen resolution of 1024x1. Please enable your ad blockers, disable high-heat drying, and remove your devicefrom Airplane Mode and set it to Boat Mode. For security reasons, please leave caps lock on while browsing.\",\n    \"license\": \"\\n\\nThis work is licensed under a\\nCreative Commons Attribution-NonCommercial 2.5 License.\\n\\nThis means you're free to copy and share these comics (but not to sell them). More details.\\n\",\n    \"title\": \"xkcd: Python\"\n  }\n}\n```\n\n### Details\n\n```yaml\n# all settings added to __raw, will be kept as it added\n__raw:\n  site_url: https://xkcd.com/\n\n\npage:\n  # if not _locator supplied will reuse parent soup\n  # page has no parent soup, so use default root soup\n  title: head\u003etitle\n  footnote: div#footnote\n  license:\n    _locator: div#licenseText\n    # strip blank with true, but also can specified a str\n    _striped: true\n\ntop_container:\n  # we set a _locator here, all sub-nodes will select within top_container\n  _locator: div#topContainer\n  top_left:\n    # _index:~ means None, so we can use whole list\n    _index: ~\n    _locator: div#topLeft\u003eul\u003eli\u003ea\n    # if non-reserved key set to ~, means use parent soup, and use its text\n    # this is a convenient way to get text\n    menu_text: ~\n    menu_url:\n      # when other attributes exist, no need to add _locator to use its parent soup\n      _attr: href\n      # if we need some extra work on _attr, goes with two ways\n      # 1. `_attr_refine: true` will auto generate =\u003e _refine_menu_url_href\n      # the rule of auto-generator is _refine_\u003ckey_name\u003e_\u003cattr_value\u003e\n      # 2. `_attr_refine: _a_valid_method_name`\n      _attr_refine: true\n  top_right:\n    _locator: div#topRight\n    masthead:\n      # two way to get more than one attributes on a element\n      # e.g. image.src/.alt\n      # way1: if all src/alt need refine, this will treat attrs as list\n      image_1:\n        _attr:\n          - src\n          - alt\n        _attr_refine: true\n        _locator: \u0026LOGO_IMG span\u003ea\u003eimg\n      # way2: not all src/alt need refine, this will treat attrs as dict\n      image_2:\n        _locator: *LOGO_IMG\n        src:\n          _attr: src\n          # only set _attr_refine to src\n          # 1. _attr_refine: true =\u003e _refine_src_src\n          # 2. _attr_refine: _refine_image_1_src to reuse exists method\n          _attr_refine: _refine_image_1_src\n        alt:\n          _attr: alt\n\n      slogan: span#slogan\n```\n\n### more\n\nplease check the `tests/` for more infomation.\n","funding_links":[],"categories":[],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcoghost%2Fiparse","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fcoghost%2Fiparse","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcoghost%2Fiparse/lists"}