{"id":15460988,"url":"https://github.com/machawk1/use","last_synced_at":"2026-01-11T01:58:00.937Z","repository":{"id":68435972,"uuid":"320054327","full_name":"machawk1/use","owner":"machawk1","description":"Experiments in examining the use of web archives.","archived":false,"fork":false,"pushed_at":"2020-12-09T19:09:53.000Z","size":5025,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"master","last_synced_at":"2025-03-24T09:45:21.021Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":null,"language":null,"has_issues":false,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/machawk1.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2020-12-09T19:09:24.000Z","updated_at":"2020-12-10T10:38:15.000Z","dependencies_parsed_at":"2023-07-02T19:05:36.308Z","dependency_job_id":null,"html_url":"https://github.com/machawk1/use","commit_stats":{"total_commits":9,"total_committers":1,"mean_commits":9.0,"dds":0.0,"last_synced_commit":"280fe519733224d7051fc01f8af89a40e77fd6d8"},"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/machawk1%2Fuse","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/machawk1%2Fuse/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/machawk1%2Fuse/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/machawk1%2Fuse/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/machawk1","downloa
d_url":"https://codeload.github.com/machawk1/use/tar.gz/refs/heads/master","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":246634411,"owners_count":20809236,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":[],"created_at":"2024-10-01T23:40:27.650Z","updated_at":"2026-01-11T01:58:00.897Z","avatar_url":"https://github.com/machawk1.png","language":null,"readme":"# use\n\nThese are some experiments being conducted by Jess Ogden, Shawn Walker and Ed\nSummers to examine the use of web archives on the web. We are looking for the\ntraces of web archives in a web archive (CommonCrawl). If that sounds\nconfusingly meta then you are understanding correctly :)\n\nAt the moment `load.py` downloads all the WAT files for a given CommonCrawl\nsnapshot and looks for links to known web archives, and writes out a CSV of\ndata about those links including:\n\n* source_url: the url linking to the web archive\n* source_host: the host name of the source url \n* archive_url: the URL of a web archive resource\n* archive_service: the archive service, e.g. 
InternetArchive, ArchiveToday, etc\n* link_text: the text of the hypertext link\n* path: the CSS selector path to the link in the source page\n* link_count: the total number of hyperlinks on the page\n* warc: the CommonCrawl WARC file where the response is stored\n* offset: the byte offset into the WARC file where the response is\n* inflated_length: the inflated length of the response\n* deflated_length: the compressed length of the response\n\n## Install\n\n    git clone https://github.com/edsu/use.git\n    cd use\n    pip install -r requirements.txt\n\n## Run\n\nYou give `load.py` the snapshot ID of a [CommonCrawl] dataset. For\nexample:\n\n    ./load.py CC-MAIN-2020-45\n\nWait a looooong time. Look at load.log to see what's happening then when\nit's done you will have a CSV file:\n\n    CC-MAIN-2020-45.csv\n\n[CommonCrawl]: https://commoncrawl.org/\n[CommonCrawl dataset]: https://commoncrawl.org/the-data/get-started/\n","funding_links":[],"categories":[],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fmachawk1%2Fuse","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fmachawk1%2Fuse","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fmachawk1%2Fuse/lists"}