{"id":16973587,"url":"https://github.com/carloocchiena/python_url_crawler","last_synced_at":"2026-02-12T22:02:06.856Z","repository":{"id":109540168,"uuid":"341337518","full_name":"carloocchiena/python_url_crawler","owner":"carloocchiena","description":"A script that starting from a webpage, iterate thru all its link, appending them in a list. Sort of proxy to get all pages in a website","archived":false,"fork":false,"pushed_at":"2022-11-02T20:19:45.000Z","size":8,"stargazers_count":2,"open_issues_count":0,"forks_count":2,"subscribers_count":2,"default_branch":"main","last_synced_at":"2025-05-30T14:38:18.950Z","etag":null,"topics":["beautifulsoup","crawler","python","python3"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/carloocchiena.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2021-02-22T21:07:01.000Z","updated_at":"2024-07-16T11:34:37.000Z","dependencies_parsed_at":"2023-03-22T03:48:39.267Z","dependency_job_id":null,"html_url":"https://github.com/carloocchiena/python_url_crawler","commit_stats":{"total_commits":9,"total_committers":1,"mean_commits":9.0,"dds":0.0,"last_synced_commit":"73bd76d1e36d05e0f394f2398e811f383d119765"},"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/carloocchiena/python_url_crawler","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/carloocchiena%2Fpython_url_crawler","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/carloocchiena%2Fpython_url_crawler/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/carloocchiena%2Fpython_url_crawler/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/carloocchiena%2Fpython_url_crawler/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/carloocchiena","download_url":"https://codeload.github.com/carloocchiena/python_url_crawler/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/carloocchiena%2Fpython_url_crawler/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":29382871,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-02-12T20:34:40.886Z","status":"ssl_error","status_checked_at":"2026-02-12T20:23:00.490Z","response_time":55,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.6:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["beautifulsoup","crawler","python","python3"],"created_at":"2024-10-14T01:02:34.076Z","updated_at":"2026-02-12T22:02:06.841Z","avatar_url":"https://github.com/carloocchiena.png","language":"Python","funding_links":[],"categories":[],"sub_categories":[],"readme":"# python_url_crawler\nA script that starting from a webpage, iterate thru all its link, appending them in a list. Sort of proxy to get all pages in a website.\n\nthe old_main is a raw version I made in 1 hours outta a stack overflow questions;\n\nmain.py is a quite better version I created from blank, with less code entropy. Seems working decently.\n\nConsider that the script aims to find only urls within the domain, but this could be easily configured tweaking the \"cleaner\" function\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcarloocchiena%2Fpython_url_crawler","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fcarloocchiena%2Fpython_url_crawler","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcarloocchiena%2Fpython_url_crawler/lists"}