{"id":18266029,"url":"https://github.com/zjhiphop/cnext","last_synced_at":"2026-01-22T10:35:51.001Z","repository":{"id":141467089,"uuid":"183179295","full_name":"zjhiphop/cnext","owner":"zjhiphop","description":"A Chinese content extractor for web page.","archived":false,"fork":false,"pushed_at":"2019-04-24T08:07:44.000Z","size":3715,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":2,"default_branch":"master","last_synced_at":"2025-04-09T01:47:12.371Z","etag":null,"topics":["extractor","machine-learning","web"],"latest_commit_sha":null,"homepage":null,"language":"HTML","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/zjhiphop.png","metadata":{"files":{"readme":"readme.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2019-04-24T08:03:48.000Z","updated_at":"2019-04-24T08:09:30.000Z","dependencies_parsed_at":null,"dependency_job_id":"da0d25a1-d792-4168-80f0-45804ca39b34","html_url":"https://github.com/zjhiphop/cnext","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/zjhiphop/cnext","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/zjhiphop%2Fcnext","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/zjhiphop%2Fcnext/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/zjhiphop%2Fcnext/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/zjhiphop%2Fcnext/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/zjhiphop","download_url":"https://codeload.github.com/zjhiphop/cnext/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/zjhiphop%2Fcnext/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":28661874,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-01-22T01:17:37.254Z","status":"online","status_checked_at":"2026-01-22T02:00:07.137Z","response_time":144,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["extractor","machine-learning","web"],"created_at":"2024-11-05T11:21:17.525Z","updated_at":"2026-01-22T10:35:50.986Z","avatar_url":"https://github.com/zjhiphop.png","language":"HTML","funding_links":[],"categories":[],"sub_categories":[],"readme":"Content Extractor\n================\n\n\u003e A text based extractor based on modern tech such as Machine Learning.\n\nRoad Map\n========\n1. Web content Extractor\n2. Email content Extractor\n3. IM content Extractor\n\nTODO\n====\n1. Chinese content extractor\n\u003e\n    (1) 预处理：将网页解析成DOM树，并剔除不可视节点.\n    (2) 获取待提取文本块：根据网页DOM树计算各个块的文本密度，并将文本密度大于\u003cbody\u003e块的文本块的上一级文本块作为待提取块.\n    (3) 获取标签路径集合：计算每条标签路径的TPR值，设定阈值，获取正文节点候选的路径集合.\n    (4) 提取正文：将 (3) 的候选路径集合与 (2) 获取的文本块中的路径集合求交集，将交集中路径节点的文本提取，输出为网页正文.","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fzjhiphop%2Fcnext","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fzjhiphop%2Fcnext","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fzjhiphop%2Fcnext/lists"}