{"id":20911422,"url":"https://github.com/brandonrobertz/autoscrape-py","last_synced_at":"2025-09-02T04:05:40.305Z","repository":{"id":41866954,"uuid":"133901266","full_name":"brandonrobertz/autoscrape-py","owner":"brandonrobertz","description":"An automated, programming-free web scraper for interactive sites","archived":false,"fork":false,"pushed_at":"2023-07-06T21:24:20.000Z","size":1172,"stargazers_count":111,"open_issues_count":1,"forks_count":17,"subscribers_count":10,"default_branch":"master","last_synced_at":"2025-08-16T14:35:41.971Z","etag":null,"topics":["data-journalism","scraper","selenium","webscraper"],"latest_commit_sha":null,"homepage":"","language":"HTML","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"agpl-3.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/brandonrobertz.png","metadata":{"files":{"readme":"README.rst","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2018-05-18T04:04:22.000Z","updated_at":"2025-05-17T04:37:02.000Z","dependencies_parsed_at":"2024-11-16T02:15:37.298Z","dependency_job_id":null,"html_url":"https://github.com/brandonrobertz/autoscrape-py","commit_stats":{"total_commits":422,"total_committers":2,"mean_commits":211.0,"dds":"0.0023696682464454666","last_synced_commit":"360f791c167c54336179120df4f52b30bf323bd0"},"previous_names":[],"tags_count":13,"template":false,"template_full_name":null,"purl":"pkg:github/brandonrobertz/autoscrape-py","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/brandonrobertz%2Fautoscrape-py","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/brandonrobertz%2Fautoscrape-py/tags","releases_url":"https://repos.ecosyste.
ms/api/v1/hosts/GitHub/repositories/brandonrobertz%2Fautoscrape-py/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/brandonrobertz%2Fautoscrape-py/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/brandonrobertz","download_url":"https://codeload.github.com/brandonrobertz/autoscrape-py/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/brandonrobertz%2Fautoscrape-py/sbom","scorecard":{"id":251312,"data":{"date":"2025-08-11","repo":{"name":"github.com/brandonrobertz/autoscrape-py","commit":"360f791c167c54336179120df4f52b30bf323bd0"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":1.6,"checks":[{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Code-Review","score":0,"reason":"Found 0/30 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are 
merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"SAST","score":0,"reason":"no SAST tool detected","details":["Warn: no pull requests merged into dev branch"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security 
policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: GNU Affero General Public License v3.0: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"Vulnerabilities","score":0,"reason":"21 existing vulnerabilities detected","details":["Warn: Project is vulnerable to: PYSEC-2021-858 / GHSA-q4xr-rc97-m4xx","Warn: Project is vulnerable to: PYSEC-2023-62 / GHSA-m2qf-hxjv-5gpq","Warn: Project is vulnerable to: PYSEC-2020-107 / GHSA-jjw5-xxj6-pcv5","Warn: Project is vulnerable to: PYSEC-2024-110 / GHSA-jw8x-6495-233v","Warn: Project is vulnerable to: PYSEC-2020-108","Warn: Project is 
vulnerable to: PYSEC-2023-102","Warn: Project is vulnerable to: PYSEC-2023-114","Warn: Project is vulnerable to: GHSA-55x5-fj6c-h6m8","Warn: Project is vulnerable to: PYSEC-2021-19 / GHSA-jq4v-f5q6-mjqq","Warn: Project is vulnerable to: PYSEC-2020-62 / GHSA-pgww-xf46-h92r","Warn: Project is vulnerable to: PYSEC-2022-230 / GHSA-wrxv-2j5q-m38w","Warn: Project is vulnerable to: PYSEC-2021-856 / GHSA-5545-2q6w-2gh6","Warn: Project is vulnerable to: GHSA-6p56-wp2h-9hxr","Warn: Project is vulnerable to: PYSEC-2019-108 / GHSA-9fq2-x9r6-wfmf","Warn: Project is vulnerable to: PYSEC-2021-857 / GHSA-f7c7-j99h-c22f","Warn: Project is vulnerable to: GHSA-fpfv-jqm9-f5jm","Warn: Project is vulnerable to: GHSA-9hjg-9r4m-mvj7","Warn: Project is vulnerable to: GHSA-9wx4-h78v-vm56","Warn: Project is vulnerable to: PYSEC-2023-74 / GHSA-j8r2-6x86-q33q","Warn: Project is vulnerable to: PYSEC-2022-43167","Warn: Project is vulnerable to: PYSEC-2023-206"],"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Pinned-Dependencies","score":0,"reason":"dependency not pinned by hash detected -- score normalized to 0","details":["Warn: containerImage not pinned by hash: Dockerfile:1","Warn: containerImage not pinned by hash: Dockerfile:37","Warn: containerImage not pinned by hash: Dockerfile:45","Warn: containerImage not pinned by hash: Dockerfile:70","Warn: containerImage not pinned by hash: Dockerfile:73","Warn: containerImage not pinned by hash: Dockerfile:77","Warn: pipCommand not pinned by hash: Dockerfile:40","Warn: pipCommand not pinned by hash: Dockerfile:43","Warn: npmCommand not pinned by hash: Dockerfile:55-63","Warn: npmCommand not pinned by hash: Dockerfile:68","Info:   0 out of   6 containerImage dependencies pinned","Info:   0 out of   2 pipCommand dependencies pinned","Info:   0 out of   2 npmCommand dependencies 
pinned"],"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}}]},"last_synced_at":"2025-08-17T08:31:16.377Z","repository_id":41866954,"created_at":"2025-08-17T08:31:16.378Z","updated_at":"2025-08-17T08:31:16.378Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":273227970,"owners_count":25067691,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-02T02:00:09.530Z","response_time":77,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["data-journalism","scraper","selenium","webscraper"],"created_at":"2024-11-18T14:21:35.456Z","updated_at":"2025-09-02T04:05:40.258Z","avatar_url":"https://github.com/brandonrobertz.png","language":"HTML","funding_links":[],"categories":[],"sub_categories":[],"readme":"AutoScrape\n==========\n\n.. image:: https://pypip.in/v/autoscrape/badge.svg\n        :target: https://pypi.python.org/pypi/autoscrape/\n\n.. image:: https://pypip.in/license/autoscrape/badge.svg\n        :target: https://pypi.python.org/pypi/autoscrape/\n\n\n.. 
figure:: https://github.com/brandonrobertz/autoscrape-py/blob/master/images/ai.png\n   :alt: Artificial Informer Labs\n\nA project of `Artificial Informer Labs \u003chttps://artificialinformer.com\u003e`__.\n\nAutoScrape is an automated scraper of structured data from interactive\nweb pages. You point this scraper at a site, give it a little information\nand structured data can then be extracted. No brittle, site-specific\nprogramming necessary.\n\nThis is an implementation of the web scraping framework described in the\npaper, `Robust Web Scraping in the Public Interest with AutoScrape \u003chttps://bxroberts.org/files/autoscrape.pdf\u003e`__ and presented at\n`Computation + Journalism Symposium 2019 \u003chttp://cplusj.org/\u003e`__. This is\nan experimental work in progress!\n\nCurrently there are a few ways to use AutoScrape:\n\n- via a full Web interface for scraping (see bottom of page, make sure to pull in the submodule!)\n- as a local CLI python script\n- as a simplified web scraping framework\n\nInstallation and running instructions are provided for both below.\n\nQuickstart\n----------\n\nTwo ways, easiest first.\n\n::\n\n    pip install autoscrape[all]\n    autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org\n\nThis will install all dependencies for all backends and various options.\n\nOr:\n\n::\n\n    git clone https://github.com/brandonrobertz/autoscrape-py\n    cd autoscrape-py/\n    pip install .[all]\n    autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org\n\nEither way, you can now use ``autoscrape`` from the command line.\n\nUsage Examples\n--------------\n\nHere are some straightforward use cases for AutoScrape and how you'd use\nthe CLI tool to execute them. These, of course, assume you have the\ndependencies installed.\n\nCrawler Backends\n~~~~~~~~~~~~~~~~\n\nThere are three backends available for driving AutoScrape: ``requests``,\n``selenium`` and ``warc``. 
The ``requests`` backend (the default) is based on the\nPython requests library and is only capable of crawling sites and submitting\nsimple HTTP forms. For any interaction with forms or JavaScript powered\nbuttons, you'll need to use the ``selenium`` backend.\n\nYou can control the backend with the ``--backend`` option:\n\n::\n\n    autoscrape \\\n      --backend requests \\\n      --output requests_crawled_site \\\n      'https://some.page/to-crawl'\n\nIn order to use backends other than requests, you need to install\nthe proper dependencies. `pip install autoscrape[all]` will\ninstall everything required for all backends/functionality, but\nyou can also install dependencies in isolation:\n\n::\n\n    Selenium backend:\n    pip install autoscrape[selenium-backend]\n\n    Crawl graph builder (for use in --save-graph)\n    pip install autoscrape[graph]\n\n    WARC backend:\n    pip install autoscrape[warc-backend]\n\nNote that for the Selenium backend, you need to install geckodriver or\nchromedriver, depending on whether you're using Firefox or Chrome, respectively.\nMore information is below in the External Dependencies section.\n\nCrawl\n~~~~~\n\nCrawl an entire website, saving all HTML and stylesheets (no\nscreenshots):\n\n::\n\n    autoscrape \\\n      --backend requests \\\n      --maxdepth -1 \\\n      --output crawled_site \\\n      'https://some.page/to-crawl'\n\nArchive Page (Screenshot \u0026 Code)\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nArchive a single webpage, both code and full-content screenshot (PNG),\nfor future reference:\n\n::\n\n    autoscrape \\\n      --backend selenium \\\n      --full-page-screenshots \\\n      --load-images --maxdepth 0 \\\n      --save-screenshots --driver Firefox \\\n      --output archived_webpage \\\n      'https://some.page/to-archive'\n\nSearch Forms and Crawl Result Pages\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nQuery a web form, identified by containing the text \"I'm a search form\",\nentering \"NAME\" into the first (0th) 
text input field and selecting January\n20th, 1992 in the second (1st) date field. Then click all buttons with\nthe text \"Next -\u003e\" to get all results pages:\n\n::\n\n    autoscrape \\\n      --backend selenium \\\n      --output search_query_data \\\n      --form-match \"I'm a search form\" \\\n      --input \"i:0:NAME,d:1:1992-01-20\" \\\n      --next-match \"Next -\u003e\" \\\n      'https://some.page/search?s=newquery'\n\nSetup for Standalone Local CLI\n------------------------------\n\nExternal Dependencies\n~~~~~~~~~~~~~~~~~~~~~\n\nIf you want to use the ``selenium`` backend for interactive crawling,\nyou need to have geckodriver installed. You can do that here:\n\n::\n\n    https://github.com/mozilla/geckodriver/releases\n\nOr through your package manager:\n\n::\n\n    apt install firefox-geckodriver\n\nYour ``geckodriver`` needs to be compatible with your current version of\nFirefox or you will get errors. If you install FF and the driver\nthrough your package manager, you *should* be okay, but it's\nnot guaranteed. We have specific versions of both pinned in the\n``Dockerfile``.\n\nIf you prefer to use Chrome, you will need the ChromeDriver (we've\ntested using v2.41). It can be found in your distribution's package\nmanager or here:\n\n::\n\n    https://sites.google.com/a/chromium.org/chromedriver/downloads\n\nInstalling the remaining Python dependencies can be done using pip.\n\nPip Install Method\n~~~~~~~~~~~~~~~~~~\n\nNext you need to set up your python virtual environment (Python 3.6\nrequired) and install the Python dependencies:\n\n::\n\n    pip install -r requirements.txt\n\nRunning Standalone Scraper\n--------------------------\n\nEnvironment Test Crawler\n~~~~~~~~~~~~~~~~~~~~~~~~\n\nYou can run a test to ensure your webdriver is set up correctly by\nrunning the ``test`` crawler:\n\n::\n\n    ./autoscrape --backend selenium --show-browser [SITE_URL]\n\nThe ``test`` crawler will just do a depth-first click-only crawl of an\nentire website. 
It will not interact with forms or POST data. Data will\nbe saved to ``./autoscrape-data/`` (the default output directory).\n\nManual Config-Based Scraper\n~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nAutoscrape has a manually controlled mode, similar to wget, except this\nuses interactive capabilities and can input data to search forms, follow\n\"next page\"-type buttons, etc. This functionality can be used either as\na standalone crawler/scraper or as a method to build a training set for\nthe automated scrapers.\n\nAutoscrape manual-mode full options:\n\n::\n\n    AUTOSCRAPE - Interactively crawl, find searchable forms,\n    input data to them and scrape data on the results, from an\n    initial BASEURL.\n\n    Usage:\n        autoscrape [options] BASEURL\n\n    General Options:\n        --backend BACKEND\n            The backend to use. Currently one of \"selenium\", \"requests\" or\n            \"warc\".  The requests browser is only capable of crawling, but\n            is approximately 2-3.5x faster. WARC is for emulating browsing\n            through Common Crawl archival data.\n            [default: selenium]\n\n        --loglevel LEVEL\n            Loglevel, note that DEBUG is extremely verbose.\n            [default: INFO]\n\n        --quiet\n            This will silence all logging to console.\n\n    Crawl-Specific Options:\n        --maxdepth DEPTH\n            Maximum depth to crawl a site (in search of form\n            if the option --form-match STRING is specified,\n            see below). Setting to 0 means don't crawl at all,\n            all operations are limited to the BASEURL page.\n            Setting to -1 means unlimited maximum crawl depth.\n            [default: 10]\n\n        --max-pages NUM\n            Maximum number of unique pages, in total, to fetch.\n            AutoScrape will stop crawling once this is hit.\n\n        --leave-host\n            By default, autoscrape will not leave the host given\n            in the BASEURL. 
This option lets the scraper leave\n            the host.\n\n        --only-links MATCH_STRING\n            A whitelist of links to follow. All others will\n            be ignored. Can be a string or a regex with\n            multiple strings to match separated by a pipe\n            (|) character.\n\n        --ignore-links MATCH_STRING\n            This option can be used to remove any links matching\n            MATCH_STRING (can be a regex or just a string match)\n            from consideration for clicking. Accepts the same\n            argument format as --only-links.\n\n        --link-priority SORT_STRING\n            A string to sort the links by. In this case, any link\n            containing \"SORT_STRING\" will be clicked before any other\n            links. In most cases you probably want to use the\n            whitelist, --only-links, option.\n\n        --ignore-extensions IGNORE_EXTENSIONS\n            Don't click on or download URLs pointing to files with\n            these extensions.\n\n        --result-page-links MATCH_STRINGS_LIST\n            If specified, AutoScrape will click on any links matching\n            this string when it arrives on a search result page.\n\n    Interactive Form Search Options:\n        --form-match SEARCH_STRING\n            The crawler will identify a form to search/scrape if it\n            contains the specified string. If matched, it will be\n            interactively scraped using the below instructions.\n\n        --input INPUT_DESCRIPTION\n            Interactive search descriptor. This describes how to\n            interact with a matched form. The inputs are\n            described in the following format:\n\n            \"c:0:True,i:0:atext,s:1:France,d:0:1991-01-20\"\n\n            A single-input type can be one of four types:\n            checkbox (\"c\"), input box (\"i\"), option select\n            (\"s\"), and date inputs (\"d\", with inputs in the\n            \"YYYY-MM-DD\" format). 
The type is separated by a\n            colon, and the input index position is next. (Each\n            input type has its own list, so a form with one\n            input, one checkbox, and one option select, will all\n            be at index 0.) The final command, separated by\n            another colon, describes what to do with the input.\n\n            Multiple inputs are separated by a comma, so you can\n            interact with multiple inputs before submitting the\n            form.\n\n            To illustrate this, the above command does the following:\n                - first input checkbox is checked (uncheck is False)\n                - first input box gets filled with the string \"atext\"\n                - second select input gets the \"France\" option chosen\n                - first date input gets set to Jan 20, 1991\n\n        --next-match NEXT_BTN_STRING\n            A string to match a \"next\" button with, after\n            searching a form.  The scraper will continue to\n            click \"next\" buttons after a search until no matches\n            are found, unless limited by the --formdepth option\n            (see below). [default: next page]\n\n        --formdepth DEPTH\n            How deep the scraper will iterate, by clicking\n            \"next\" buttons. Zero means infinite depth.\n            [default: 0]\n\n        --form-submit-natural-click\n            Some webpages make clicking a link element difficult\n            due to JavaScript onClick events. 
In cases where a\n            click does nothing, you can use this option to get\n            the scraper to emulate a mouse click over the link's\n            position on the page, activating any higher level JS\n            interactions.\n\n        --form-submit-wait SECONDS\n            How many seconds to force wait after a submit to a form.\n            This should be used in cases where the builtin\n            wait-for-page-load isn't working properly (JS-heavy\n            pages, etc). [default: 5]\n\n    Webdriver-Specific and General Options:\n        --load-images\n            By default, images on a page will not be fetched.\n            This speeds up scrapes on sites and lowers bandwidth\n            needs. This option fetches all images on a page.\n\n        --show-browser\n            By default, we hide the browser during operation.\n            This option displays a browser window, mostly\n            for debugging purposes.\n\n        --driver DRIVER\n            Which browser to use. Current support for \"Firefox\",\n            \"Chrome\", and \"remote\". [default: Firefox]\n\n        --browser-binary PATH_TO_BROWSER\n            Path to a specific browser binary. If left blank\n            selenium will pull the browser found on your path.\n\n        --remote-hub URI\n            If using \"remote\" driver, specify the hub URI to\n            connect to. Needs the proto, address, port, and path.\n            [default: http://localhost:4444/wd/hub]\n\n    WARC Options:\n        --warc-directory PATH_TO_WARCS\n            Path to the folder containing GZipped WARC files. These can be\n            downloaded from Common Crawl. 
Required when using the \"warc\"\n            backend.\n\n        --warc-index-file PATH_TO_LEVELDB\n            Path to the level DB database holding the URL-to-file\n            index: URL =\u003e (filename, record_number)\n            This will be generated from the WARCS in the --warc-directory\n            specified if it's not already. Required when using the \"warc\"\n            backend.\n\n    Data Saving Options:\n        --output DIRECTORY_OR_URL\n            If specified, this indicates where to save pages during a\n            crawl. This directory will be created if it does not\n            currently exist.  This directory will have several\n            sub-directories that contain the different types of pages\n            found (i.e., search_pages, data_pages, screenshots).\n            This can also accept a URL (i.e., http://localhost:5000/files)\n            and AutoScrape will POST to that endpoint with each\n            file scraped.\n            [default: autoscrape-data]\n\n        --keep-filename\n            By default, we hash the files in a scrape in order to\n            account for dynamic content under a single-page app\n            (SPA) website implementation. This option will force\n            the scraper to retain the original filename, from the\n            URL when saving scrape data.\n\n        --save-screenshots\n            This option makes the scraper save screenshots of each\n            page, interaction, and search. Screenshots will be\n            saved to the screenshots folder of the output dir.\n\n        --full-page-screenshots\n            By default, we only save the first displayed part of the\n            webpage. The remaining portion that you can only see\n            by scrolling down isn't captured. Setting this option\n            forces AutoScrape to scroll down and capture the entire\n            web content. 
This can fail in certain circumstances, like\n            in API output mode and should be used with care.\n\n        --save-graph\n            This option allows the scraper to build a directed graph\n            of the entire scrape and will save it to the \"graph\"\n            subdirectory under the output dir. The output file\n            is a timestamped networkx pickled graph.\n\n        --disable-style-saving\n            By default, AutoScrape saves the stylesheets associated\n            with a scraped page. To save storage, you can disable this\n            functionality by using this option.\n\nAutoScrape Web UI (Docker)\n--------------------------\n\nAutoScrape can be run as a containerized cluster environment, where\nscrapes can be triggered and stopped via API calls and data can be\nstreamed to this server.\n\nThis requires the `autoscrape-www \u003chttps://github.com/brandonrobertz/autoscrape-www\u003e`__ submodule to be pulled:\n\n::\n\n    git submodule init\n    git submodule update\n\nThis will pull the browser-based UI into the `www/` folder.\n\nYou need\n`docker-ce \u003chttps://docs.docker.com/install/#server\u003e`__ and\n`docker-compose \u003chttps://docs.docker.com/compose/install/\u003e`__. Once you\nhave these dependencies installed, simply run:\n\n::\n\n    docker-compose build --pull\n    docker-compose up\n\nThis will build the containers and launch an API server running on local\nport 5000. More information about the API calls can be found in\n``autoscrape-server.py``.\n\nIf you have make installed, you can simply run ``make start``.\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fbrandonrobertz%2Fautoscrape-py","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fbrandonrobertz%2Fautoscrape-py","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fbrandonrobertz%2Fautoscrape-py/lists"}