{"id":18458856,"url":"https://github.com/coverified/spider","last_synced_at":"2026-04-29T00:32:25.702Z","repository":{"id":103784847,"uuid":"391694064","full_name":"coverified/spider","owner":"coverified","description":"A microservice with web-crawler/spider capabilities which only follows and indexes urls of the provided host domain(s)","archived":false,"fork":false,"pushed_at":"2021-09-30T08:25:27.000Z","size":286,"stargazers_count":2,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-04-23T17:42:04.173Z","etag":null,"topics":["akka","crawler","graphql","hacktoberfest","microservice","spider"],"latest_commit_sha":null,"homepage":"","language":"Scala","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"bsd-3-clause","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/coverified.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2021-08-01T17:30:53.000Z","updated_at":"2023-05-25T02:31:51.000Z","dependencies_parsed_at":null,"dependency_job_id":"e9762c62-bfba-4206-ba9d-f005d505f331","html_url":"https://github.com/coverified/spider","commit_stats":null,"previous_names":[],"tags_count":3,"template":false,"template_full_name":null,"purl":"pkg:github/coverified/spider","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coverified%2Fspider","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coverified%2Fspider/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coverified%2Fspider/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coverified%2Fspider/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/coverified","download_url":"https://codeload.github.com/coverified/spider/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/coverified%2Fspider/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":32405901,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-04-28T19:38:08.556Z","status":"ssl_error","status_checked_at":"2026-04-28T19:37:55.688Z","response_time":56,"last_error":"SSL_read: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["akka","crawler","graphql","hacktoberfest","microservice","spider"],"created_at":"2024-11-06T08:20:29.641Z","updated_at":"2026-04-29T00:32:25.661Z","avatar_url":"https://github.com/coverified.png","language":"Scala","funding_links":[],"categories":[],"sub_categories":[],"readme":"# spider\nA microservice to crawl a set of sites by following links to pages of the relevant domains.\nOnly the relevant host urls of the provided host(s) are considered.\nNew URLs are entered into a GraphQL database.\n\n## Used Frameworks / Libraries\n_(not comprehensive, but the most important ones)_\n\n-   [akka](https://akka.io/)\n-   [Caliban Client](https://ghostdogpr.github.io/caliban/) to talk to GraphQL endpoint\n-   [Sentry](https://sentry.io/welcome/) (error reporting)\n\n\n## Configuration\nConfiguration is done using environment variables.\nThe following configuration parameters are available.\n\nEnvironment config values:\n- `API_URL` - GraphQL API URL (**required**)\n- `AUTH_SECRET` - GraphQL authentication secret (**required**)\n- `SCRAPE_PARALLELISM` - number of pages that crawler visits in parallel (default: 100)\n- `SCRAPE_INTERVAL` - time interval between page hits (default: 500ms)\n- `SCRAPE_TIMEOUT` - timeout of each page load attempt (default: 20.000ms)\n- `SHUTDOWN_TIMEOUT` - time after which spider exits, if no new URLs have been found (default: 15.000ms)\n- `MAX_RETRIES` - max number of retries after attempts to load a page failed (default: 0)\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcoverified%2Fspider","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fcoverified%2Fspider","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcoverified%2Fspider/lists"}