{"id":19989816,"url":"https://github.com/wcygan/crawler","last_synced_at":"2026-05-08T23:03:04.612Z","repository":{"id":132525518,"uuid":"600932103","full_name":"wcygan/crawler","owner":"wcygan","description":"web crawler","archived":false,"fork":false,"pushed_at":"2023-09-15T04:44:43.000Z","size":127,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"master","last_synced_at":"2025-01-12T12:20:27.048Z","etag":null,"topics":["crawler","crawling","tokio","tokio-rs","web-crawler"],"latest_commit_sha":null,"homepage":"","language":"Rust","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/wcygan.png","metadata":{"files":{"readme":"readme.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2023-02-13T01:57:52.000Z","updated_at":"2023-03-10T02:45:14.000Z","dependencies_parsed_at":"2024-11-13T04:50:31.413Z","dependency_job_id":"49311f59-cae8-409c-b33e-4826b560d1b8","html_url":"https://github.com/wcygan/crawler","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/wcygan%2Fcrawler","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/wcygan%2Fcrawler/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/wcygan%2Fcrawler/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/wcygan%2Fcrawler/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/wcygan","download_url":"https://codeload.github.com/wcygan/crawler/tar.gz/refs/heads/master","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":241430318,"owners_count":19961635,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["crawler","crawling","tokio","tokio-rs","web-crawler"],"created_at":"2024-11-13T04:50:25.435Z","updated_at":"2026-05-08T23:02:59.593Z","avatar_url":"https://github.com/wcygan.png","language":"Rust","funding_links":[],"categories":[],"sub_categories":[],"readme":"# Crawler\n\nA web crawler written in Rust.\n\nThis crawler creates a web graph by exploring all URLs that it finds.\n\n## Design\n\nThe crawler is split into two parts:\n\n1. The connection pool\n2. The parser pool\n\nThe crawler will spin up as many connections \u0026 parsers as you specify. \n\nThe connection pool will handle all HTTP requests, while the parser pool will handle all HTML parsing.\n\nRequests to the same domain are rate limited to avoid being blocked by the server.\n\nThe URL mapping is written to an index which can be written to disk during shutdown.\n\n## Resources\n\n- [Tokio](https://crates.io/crates/tokio) - asynchronous runtime\n- [Tokio-utils](https://crates.io/crates/tokio-utils) - rate limiter, graceful shutdown\n- [Reqwest](https://crates.io/crates/reqwest/) - HTTP client\n- [Dashmap](https://crates.io/crates/dashmap/) - concurrent hash map","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fwcygan%2Fcrawler","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fwcygan%2Fcrawler","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fwcygan%2Fcrawler/lists"}