{"id":13589728,"url":"https://github.com/datacoon/metawarc","last_synced_at":"2026-01-18T13:26:48.400Z","repository":{"id":48348934,"uuid":"262877321","full_name":"datacoon/metawarc","owner":"datacoon","description":"metawarc: a command-line tool for metadata extraction from files from WARC (Web ARChive)","archived":false,"fork":false,"pushed_at":"2025-07-10T16:29:44.000Z","size":73,"stargazers_count":34,"open_issues_count":11,"forks_count":1,"subscribers_count":3,"default_branch":"master","last_synced_at":"2025-09-15T03:55:36.245Z","etag":null,"topics":["metadata","osint","osint-python","warc","warc-files","webarchiving"],"latest_commit_sha":null,"homepage":null,"language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/datacoon.png","metadata":{"files":{"readme":"README.rst","changelog":"HISTORY.rst","contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":"AUTHORS.rst","dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2020-05-10T21:18:32.000Z","updated_at":"2025-08-13T09:45:43.000Z","dependencies_parsed_at":"2024-01-14T10:32:27.973Z","dependency_job_id":"d59866fa-daef-47fe-98d1-cc87c2b0b66d","html_url":"https://github.com/datacoon/metawarc","commit_stats":{"total_commits":22,"total_committers":3,"mean_commits":7.333333333333333,"dds":0.2272727272727273,"last_synced_commit":"7aca3da045172d2fab88aa5f5fce9ff7d1ca2f50"},"previous_names":[],"tags_count":1,"template":false,"template_full_name":null,"purl":"pkg:github/datacoon/metawarc","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datacoon%2Fmetawarc","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datacoon%2Fmetawarc/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datacoon%2Fmetawarc/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datacoon%2Fmetawarc/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/datacoon","download_url":"https://codeload.github.com/datacoon/metawarc/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datacoon%2Fmetawarc/sbom","scorecard":{"id":324332,"data":{"date":"2025-08-11","repo":{"name":"github.com/datacoon/metawarc","commit":"94db57069906becc01d64c39ec2eae2b692ddf81"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":2.7,"checks":[{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Maintained","score":9,"reason":"11 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 9","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Code-Review","score":0,"reason":"Found 0/28 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: MIT License: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":0,"reason":"Project has not signed or included provenance with any releases.","details":["Warn: release artifact v1.1.1 not signed: https://api.github.com/repos/datacoon/metawarc/releases/81292322","Warn: release artifact v1.1.1 does not have provenance: https://api.github.com/repos/datacoon/metawarc/releases/81292322"],"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 4 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Vulnerabilities","score":1,"reason":"9 existing vulnerabilities detected","details":["Warn: Project is vulnerable to: PYSEC-2024-203","Warn: Project is vulnerable to: PYSEC-2024-25","Warn: Project is vulnerable to: GHSA-55x5-fj6c-h6m8","Warn: Project is vulnerable to: PYSEC-2014-9 / GHSA-57qw-cc2g-pv5p","Warn: Project is vulnerable to: PYSEC-2021-19 / GHSA-jq4v-f5q6-mjqq","Warn: Project is vulnerable to: GHSA-pgww-xf46-h92r","Warn: Project is vulnerable to: PYSEC-2022-230 / GHSA-wrxv-2j5q-m38w","Warn: Project is vulnerable to: PYSEC-2018-12 / GHSA-xp26-p53h-6h2p","Warn: Project is vulnerable to: PYSEC-2017-74"],"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}}]},"last_synced_at":"2025-08-18T02:06:41.105Z","repository_id":48348934,"created_at":"2025-08-18T02:06:41.105Z","updated_at":"2025-08-18T02:06:41.105Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":28536751,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-01-18T13:04:05.990Z","status":"ssl_error","status_checked_at":"2026-01-18T13:01:44.092Z","response_time":98,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["metadata","osint","osint-python","warc","warc-files","webarchiving"],"created_at":"2024-08-01T16:00:33.565Z","updated_at":"2026-01-18T13:26:48.386Z","avatar_url":"https://github.com/datacoon.png","language":"Python","readme":"metawarc: a command-line tool for metadata extraction from files from WARC (Web ARChive)\n########################################################################################\n\nmetawarc (pronounced *me-ta-warc*) is a command line WARC files processing tools.\nIts goal is to make CLI interaction with files inside WARC archives so easy as possible.\nIt provides a simple ``metawarc`` command that allows to extract metadata from images, documents and other files inside\nWARC archives.\n\n\n.. contents::\n\n.. section-numbering::\n\n\n\nMain features\n=============\n\n* Built-in WARC support\n* Metadata extraction for a lot of file formats\n* Low memory footprint\n* Documentation\n* Test coverage\n\n\nFile formats supported\n======================\n\n* MS Office OLE: .doc, .xls, .ppt\n* MS Office XML: .docx, .xlsx, .pptx\n* Adobe PDF: .pdf\n* Images: .png, .jpg, .tiff, .jpeg, .jp2\n\n\nInstallation\n============\n\n\nAny OS\n-------------\n\nA universal installation method (that works on Windows, Mac OS X, Linux, …,\nand always provides the latest version) is to use pip:\n\n\n.. code-block:: bash\n\n    # Make sure we have an up-to-date version of pip and setuptools:\n    $ pip install --upgrade pip setuptools\n\n    $ pip install --upgrade metawarc\n\n\n(If ``pip`` installation fails for some reason, you can try\n``easy_install metawarc`` as a fallback.)\n\n\nPython version\n--------------\n\nPython version 3.6 or greater is required.\n\nUsage\n=====\n\n\nSynopsis:\n\n.. code-block:: bash\n\n    $ metawarc [command] [flags]  inputfile\n\n\nSee also ``metawarc --help`` and ``metawarc [command] --help`` for help for each command.\n\n\nQuickstart\n==========\n\nIndex all WARC files in all subfolders\n\n.. code-block:: bash\n\n    $ metawarc index '*/*.warc.gz'\n\nView file extensions statistics\n\n.. code-block:: bash\n\n    $ metawarc stats -m exts\n\n\nList all PDF files\n\n.. code-block:: bash\n\n    $ metawarc list-files -e pdf\n\n\nDumps all records with size greater than 10M and file extension 'pdf' to 'bigpdf' directory\n\n.. code-block:: bash\n\n    $ metawarc dump -q \"content_length \u003e 10000000 and ext = 'pdf'\" -o bigpdf\n\n\n\n\nCommands\n========\n\nIndex command\n-------------\nGenerates 'warcindex.db' DuckDB database with WARC files meta and for each WARC file generated two Parquet files in 'data' directory, they inherit WARC file name and have suffix '_records' and \"_headers\".\nAll of them registered in 'warcindex.db' with tables as \"files\" and \"tables\". \n\nAnalyzes 'armstat.am.warc.gz' and writes 'warcindex.db' with records and headers metadata.\n\n.. code-block:: bash\n\n    $ metawarc index armstat.am.warc.gz\n\nAnalyzes all WARC files in all subfolders and writes 'warcindex.db' with records and headers metadata.\n\n.. code-block:: bash\n\n    $ metawarc index '*/*.warc.gz'\n\n\nIndex content command\n---------------------\nAnalyzes WARC files records and extracts relevant metadata / content for future reuse. Supported metadata types: ooxmldocs, oledocs, pdfs, images, links\nResults saved to Parquet file in 'data' directory with suffix of the related metdata. For example '_images' for images.\n\nCollects PDF files metadata from all WARC files\n\n.. code-block:: bash\n\n    $ metawarc index-content -t pdfs\n\nCollects all links for selected WARC file (should be listed in 'warcindex.db' after index command run)\n\n.. code-block:: bash\n\n    $ metawarc index-content -i armstat.am.warc.gz -t links\n\n\n\nStats command\n-------------\nReturns total length and count of records by each mime or file extension.\n\nProcesses data in 'metawarc.db' and prints total length and count for each mime\n\n.. code-block:: bash\n\n    $ metawarc stats -m mimes\n\nProcesses data in 'metawarc.db' and prints total length and count for each file extension\n\n.. code-block:: bash\n\n    $ metawarc stats -m exts\n\n\nDump metadata command\n---------------------\nDumps metadata from tables. Supported metadata types: pdfs, ooxmldocs, oledocs, images, links\n\nExports PDF files metadata and writes as 'pdfs_metadata.jsonl'\n\n.. code-block:: bash\n\n    $ metawarc dump-metadata -t pdfs -o pdfs_metadata.jsonl\n\n\nList files command\n------------------\nPrints list of records with id, offset, length and url using 'metawarc.db'. Accepts list of mime types or list of file extensions or query as WHERE clause\n\nPrints all records with mime type (content type) 'application/zip'\n\n.. code-block:: bash\n\n    $ metawarc list-files -m 'application/zip'\n\nPrints all records with file extensions 'xls' and 'xlsx'\n\n.. code-block:: bash\n\n    $ metawarc list-files -e xls,xlsx\n\nPrints all records with size greater than 10M and file extension 'pdf'\n\n.. code-block:: bash\n\n    $ metawarc list-files -q \"content_length \u003e 10000000 and ext = 'pdf'\"\n\n\nDump command\n------------\nDumps records payloads as files using 'metawarc.db' as WARC index. Accepts list of mime types or list of file extensions or query as WHERE clause.\nAdds CSV file 'records.csv' to the output directory with basic data about each dumped record.\n\nDumps all records with mime type (content type) 'application/zip' to 'allzip' directory\n\n.. code-block:: bash\n\n    $ metawarc dump -m 'application/zip' -o allzip\n\nDumps all records with file extensions 'xls' and 'xlsx' to 'sheets' directory\n\n.. code-block:: bash\n\n    $ metawarc dump -e xls,xlsx -o sheets\n\nDumps all records with size greater than 10M and file extension 'pdf' to 'bigpdf' directory\n\n.. code-block:: bash\n\n    $ metawarc dump -q \"content_length \u003e 10000000 and ext = 'pdf'\" -o bigpdf\n\n","funding_links":[],"categories":["[](#table-of-contents) Table of contents","Web Archiving"],"sub_categories":["[](#warc)Tools for working with WARC (WebARChive) files","Analysis \u0026 Data Processing"],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdatacoon%2Fmetawarc","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fdatacoon%2Fmetawarc","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdatacoon%2Fmetawarc/lists"}