{"id":33062702,"url":"https://github.com/zipstack/unstract","last_synced_at":"2026-04-24T11:00:52.831Z","repository":{"id":230009420,"uuid":"761150311","full_name":"Zipstack/unstract","owner":"Zipstack","description":"LLM-Driven Extraction of Unstructured Data — Built for API Deployments \u0026 ETL Pipeline Workflows","archived":false,"fork":false,"pushed_at":"2026-04-21T07:30:07.000Z","size":95454,"stargazers_count":6545,"open_issues_count":72,"forks_count":623,"subscribers_count":44,"default_branch":"main","last_synced_at":"2026-04-21T07:37:10.929Z","etag":null,"topics":["api-deployments","data-extraction","document-processing","etl-pipelines","open-source-data-pipeline","unstructured-data-extraction"],"latest_commit_sha":null,"homepage":"https://unstract.com","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"agpl-3.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/Zipstack.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":"CONTRIBUTING.md","funding":null,"license":"LICENSE","code_of_conduct":"CODE_OF_CONDUCT.md","threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":"SECURITY.md","support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null}},"created_at":"2024-02-21T10:34:33.000Z","updated_at":"2026-04-21T06:30:31.000Z","dependencies_parsed_at":"2024-04-15T05:10:21.872Z","dependency_job_id":"c8105bdc-c3af-4bff-9364-46656fc09b89","html_url":"https://github.com/Zipstack/unstract","commit_stats":null,"previous_names":["zipstack/unstract"],"tags_count":531,"template":false,"template_full_name":null,"purl":"pkg:github/Zipstack/unstract","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Zipstack%2Funstract","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Zipstack%2Funstract/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Zipstack%2Funstract/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Zipstack%2Funstract/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/Zipstack","download_url":"https://codeload.github.com/Zipstack/unstract/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Zipstack%2Funstract/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":32220295,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-04-24T10:26:35.452Z","status":"ssl_error","status_checked_at":"2026-04-24T10:25:27.643Z","response_time":64,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["api-deployments","data-extraction","document-processing","etl-pipelines","open-source-data-pipeline","unstructured-data-extraction"],"created_at":"2025-11-14T06:00:37.646Z","updated_at":"2026-04-24T11:00:52.825Z","avatar_url":"https://github.com/Zipstack.png","language":"Python","readme":"\u003cdiv align=\"center\"\u003e\n  \u003cimg src=\"docs/assets/unstract_u_logo.png\" style=\"height: 120px\"\u003e\n  \u003ch1\u003eUnstract\u003c/h1\u003e\n  \u003ch2\u003eTurn Unstructured Documents into Structured Data\u003c/h2\u003e\n  \u003cp\u003e\n    \u003ca href=\"https://docs.unstract.com\"\u003eDocumentation\u003c/a\u003e |\n    \u003ca href=\"https://unstract.com/pricing/\"\u003eEnterprise\u003c/a\u003e\n  \u003c/p\u003e\n  \u003cp\u003e\n    \u003ca href=\"LICENSE\"\u003e\u003cimg src=\"https://img.shields.io/github/license/Zipstack/unstract\" alt=\"License\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://docs.unstract.com/unstract/unstract_platform/quick_start\"\u003e\u003cimg src=\"https://img.shields.io/badge/tutorials-docs-brightgreen\" alt=\"Tutorials\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://status.unstract.com\"\u003e\u003cimg src=\"https://img.shields.io/badge/uptime-status-brightgreen\" alt=\"Uptime Status\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://hub.docker.com/u/unstract\"\u003e\u003cimg src=\"https://img.shields.io/docker/pulls/unstract/backend\" alt=\"Docker Pulls\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://deepwiki.com/Zipstack/unstract\"\u003e\u003cimg src=\"https://deepwiki.com/badge.svg\" alt=\"Ask DeepWiki\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://cla-assistant.io/Zipstack/unstract\"\u003e\u003cimg src=\"https://cla-assistant.io/readme/badge/Zipstack/unstract\" alt=\"CLA assistant\"\u003e\u003c/a\u003e\n  \u003c/p\u003e\n  \u003cp\u003e\n    \u003cimg src=\"https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2FZipstack%2Funstract%2Frefs%2Fheads%2Fmain%2Fpyproject.toml\" alt=\"Python Version from PEP 621 TOML\"\u003e\n    \u003ca href=\"https://github.com/astral-sh/uv\"\u003e\u003cimg src=\"https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json\" alt=\"uv\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://vite.dev/\"\u003e\u003cimg src=\"https://img.shields.io/badge/Vite-6.x-646CFF?logo=vite\u0026logoColor=white\" alt=\"Vite\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://bun.sh/\"\u003e\u003cimg src=\"https://img.shields.io/badge/Bun-1.x-000000?logo=bun\u0026logoColor=white\" alt=\"Bun\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://biomejs.dev/\"\u003e\u003cimg src=\"https://img.shields.io/badge/Biome-2.x-60A5FA?logo=biome\u0026logoColor=white\" alt=\"Biome\"\u003e\u003c/a\u003e\n  \u003c/p\u003e\n  \u003cp\u003e\n    \u003ca href=\"https://results.pre-commit.ci/latest/github/Zipstack/unstract/main\"\u003e\u003cimg src=\"https://results.pre-commit.ci/badge/github/Zipstack/unstract/main.svg\" alt=\"pre-commit.ci status\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://sonarcloud.io/summary/new_code?id=Zipstack_unstract\"\u003e\u003cimg src=\"https://sonarcloud.io/api/project_badges/measure?project=Zipstack_unstract\u0026metric=alert_status\" alt=\"Quality Gate Status\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://sonarcloud.io/summary/new_code?id=Zipstack_unstract\"\u003e\u003cimg src=\"https://sonarcloud.io/api/project_badges/measure?project=Zipstack_unstract\u0026metric=code_smells\" alt=\"Code Smells\"\u003e\u003c/a\u003e\n    \u003ca href=\"https://sonarcloud.io/summary/new_code?id=Zipstack_unstract\"\u003e\u003cimg src=\"https://sonarcloud.io/api/project_badges/measure?project=Zipstack_unstract\u0026metric=duplicated_lines_density\" alt=\"Duplicated Lines (%)\"\u003e\u003c/a\u003e\n  \u003c/p\u003e\n\u003c/div\u003e\n\n## What is Unstract?\n\nUnstract uses LLMs to extract structured JSON from documents — PDFs, images, scans, you name it. Define what you want to extract using natural language prompts, and deploy as an API or ETL pipeline.\n\nBuilt for teams in **finance**, **insurance**, **healthcare**, **KYC/compliance**, and much more.\n\n## Current State vs. Unstract\n\n| Task | Without Unstract | With Unstract |\n|------|------------------|---------------|\n| Schema definition | Write regex, build templates per vendor | Write a prompt once, handles variations |\n| New document type | Days of development | Minutes in Prompt Studio |\n| LLM integration | Build your own pipeline | Plug in any provider (OpenAI, Anthropic, Bedrock, Ollama) |\n| Deployment | Custom infrastructure | `./run-platform.sh` or managed cloud |\n| Output | Unstructured text blobs | Clean JSON, ready for your database |\n\n\u003e ⭐ If Unstract helps you, star this repo!\n\u003e\n\u003e ![Star Unstract](docs/assets/github_star.gif)\n\n## ✨ Key Features\n\n**Prompt Studio** — Define document extraction schemas with natural language. [Docs →](https://docs.unstract.com/unstract/unstract_platform/features/prompt_studio/prompt_studio_intro/)\n\n![Prompt Studio](docs/assets/prompt_studio.gif)\n\n**API Deployment** — Send a document over REST API, get JSON back. [Docs →](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_intro/)\n\n![API Deployment](docs/assets/api_deployment.gif)\n\n**ETL Pipeline** — Pull documents from a folder, process them, load to your warehouse. [Docs →](https://docs.unstract.com/unstract/unstract_platform/etl_pipeline/unstract_etl_pipeline_intro/)\n\n**MCP Server** — Connect to AI agents (Claude, etc.) via Model Context Protocol. [Docs →](https://docs.unstract.com/unstract/unstract_platform/mcp/unstract_platform_mcp_server/)\n\n**n8n Node** — Drop into existing automation workflows. [Docs →](https://docs.unstract.com/unstract/unstract_platform/api_deployment/unstract_api_deployment_n8n_custom_node/)\n\n## 🚀 Quickstart (~5 mins)\n\n### System Requirements \u0026 Prerequisites\n\n- Linux or macOS (Intel or M-series)\n- Docker \u0026 Docker Compose\n- 8 GB RAM minimum\n- Git\n\n### Run Locally\n\n```bash\n# Clone and start\ngit clone https://github.com/Zipstack/unstract.git\ncd unstract\n./run-platform.sh\n```\n\nThat's it!\n\n- Visit [http://frontend.unstract.localhost](http://frontend.unstract.localhost) in your browser\n- Login with username: `unstract` password: `unstract`\n- Start extracting data!\n\n## 📦 Other Deployment Options\n\n### Docker Compose\n\n```bash\n# Pull and run entire Unstract platform with default env config.\n./run-platform.sh\n\n# Pull and run docker containers with a specific version tag.\n./run-platform.sh -v v0.1.0\n\n# Upgrade existing Unstract platform setup by pulling the latest available version.\n./run-platform.sh -u\n\n# Upgrade existing Unstract platform setup by pulling a specific version.\n./run-platform.sh -u -v v0.2.0\n\n# Build docker images locally as a specific version tag.\n./run-platform.sh -b -v v0.1.0\n\n# Build docker images locally from working branch as `current` version tag.\n./run-platform.sh -b -v current\n\n# Display the help information.\n./run-platform.sh -h\n\n# Only do setup of environment files.\n./run-platform.sh -e\n\n# Only do docker images pull with a specific version tag.\n./run-platform.sh -p -v v0.1.0\n\n# Only do docker images pull by building locally with a specific version tag.\n./run-platform.sh -p -b -v v0.1.0\n\n# Upgrade existing Unstract platform setup with docker images built locally from working branch as `current` version tag.\n./run-platform.sh -u -b -v current\n\n# Pull and run docker containers in detached mode.\n./run-platform.sh -d -v v0.1.0\n```\n\n## 🔐 Backup Encryption Key\n\n\u003e [!WARNING]\n\u003e This key encrypts adapter credentials — losing it makes existing adapters inaccessible!\n\nCopy the value of `ENCRYPTION_KEY` from `backend/.env` or `platform-service/.env` to a secure location.\n\n## 🏗️ Unstract Architecture\n\n```text\n┌────────────────────────────────────────────────────────────┐\n│                          Unstract                          │\n├─────────────┬─────────────┬─────────────┬──────────────────┤\n│  Frontend   │   Backend   │   Worker    │ Platform Service │\n│  (React)    │  (Django)   │  (Celery)   │   (FastAPI)      │\n├─────────────┴─────────────┴─────────────┴──────────────────┤\n│                      Cache (Redis)                         │\n├────────────────────────────────────────────────────────────┤\n│                  Message Queue (RabbitMQ)                  │\n├────────────────────────────────────────────────────────────┤\n│                   Database (PostgreSQL)                    │\n├────────────────────────────────────────────────────────────┤\n│  LLM Adapters    │  Vector DBs    │  Text Extractors       │\n│  (OpenAI, etc.)  │ (Qdrant, etc.) │  (LLMWhisperer)        │\n└────────────────────────────────────────────────────────────┘\n```\n\nAlso see [architecture](docs/ARCHITECTURE.md).\n\n## 📄 Document File Formats\n\n| Category | Formats |\n|----------|---------|\n| Documents | PDF, DOCX, DOC, ODT, TXT, CSV, JSON |\n| Spreadsheets | XLSX, XLS, ODS |\n| Presentations | PPTX, PPT, ODP |\n| Images | PNG, JPG, JPEG, TIFF, BMP, GIF, WEBP |\n\n## 🔌 Connectors \u0026 Adapters\n\n### LLM Providers\n\n| Provider | Status | Provider | Status |\n|----------|--------|----------|--------|\n| OpenAI | ✅ | Azure OpenAI | ✅ |\n| Anthropic Claude | ✅ | Google Gemini | ✅ |\n| AWS Bedrock | ✅ | Mistral AI | ✅ |\n| Ollama (local) | ✅ | Anyscale | ✅ |\n\n### Vector Databases\n\n| Provider | Status | Provider | Status |\n|----------|--------|----------|--------|\n| Qdrant | ✅ | Pinecone | ✅ |\n| Weaviate | ✅ | PostgreSQL | ✅ |\n| Milvus | ✅ | | |\n\n### Text Extractors\n\n| Provider | Status |\n|----------|--------|\n| LLMWhisperer | ✅ |\n| Unstructured.io | ✅ |\n| LlamaIndex Parse | ✅ |\n\n### ETL Sources \u0026 Destinations\n\n**Sources:** AWS S3, MinIO, Google Cloud Storage, Azure Blob, Google Drive, Dropbox, SFTP\n\n**Destinations:** Snowflake, Amazon Redshift, Google BigQuery, PostgreSQL, MySQL, MariaDB, SQL Server, Oracle\n\n[Full Connector List](https://docs.unstract.com/unstract/unstract_platform/setup_accounts/whats_needed)\n\n## 🛠️ Development\n\n### Change Default Credentials\n\nFollow [these steps](backend/README.md#authentication) to change the default username and password.\n\n### Local Development\n\n```bash\n# Install pre-commit hooks\n./dev-env-cli.sh -p\n\n# Run pre-commit checks\n./dev-env-cli.sh -r\n```\n\n[Local Development Guide](https://docs.unstract.com/unstract/unstract_platform/user_guides/run_platform)\n\n## 🏢 Use Cases by Industry\n\n[Finance \u0026 Banking →](https://unstract.com/finance-automation/) | [Insurance →](https://unstract.com/insurance-automation/) | [Healthcare →](https://unstract.com/healthcare-automation/) | [Income Tax →](https://unstract.com/ai-income-tax-forms-data-extraction/)\n\n## ☁️ Cloud \u0026 Enterprise\n\nFor teams that need managed infrastructure, advanced accuracy features, or compliance certifications.\n\n- ✅ **LLMChallenge** — dual-LLM verification\n- ✅ **SinglePass \u0026 Summarized Extraction** — reduce LLM token costs\n- ✅ **Human-in-the-Loop** — review interface with document highlighting\n- ✅ **SSO \u0026 Enterprise RBAC** — SAML/OIDC integration with granular role-based access control\n- ✅ **SOC 2, HIPAA, ISO 27001, GDPR Compliant** — third-party audited security certifications\n- ✅ **Priority Support with SLA** — dedicated support team with response time guarantees\n\n\u003ca href=\"https://unstract.com/schedule-a-demo/\"\u003e\u003cimg src=\"docs/assets/book-demo-button-blue.svg\" alt=\"Book a Demo\"\u003e\u003c/a\u003e\n\n## 📚 Cookbooks\n\n- [Unstract + PostgreSQL + DeepSeek](https://unstract.com/blog/open-source-document-data-extraction-with-unstract-deepseek/)\n- [Unstract + n8n](https://unstract.com/blog/unstract-n8n/)\n- [Unstract + Snowflake](https://unstract.com/blog/process-unstructured-data-with-unstract-snowflake/)\n- [Unstract + BigQuery](https://unstract.com/blog/process-unstructured-data-with-unstract-bigquery/)\n- [Unstract + Crew.AI](https://unstract.com/blog/agentic-document-extraction-processing-with-unstract-crew-ai/)\n- [Unstract + PydanticAI](https://unstract.com/blog/building-real-world-ai-agents-with-pydanticai-and-unstract/)\n- [Unstract MCP Server](https://unstract.com/blog/unstract-mcp-server/)\n\n## 🤝 Contributing\n\nWe welcome contributions! The easiest way to start:\n\n1. Pick an issue tagged [`good first issue`](https://github.com/Zipstack/unstract/labels/good%20first%20issue)\n2. Submit a PR\n\n[Report Bug →](https://github.com/Zipstack/unstract/issues/new?template=bug_report.md) | [Request Feature →](https://github.com/Zipstack/unstract/issues/new?template=feature_request.md)\n\n## 👋 Community\n\nJoin the LLM-powered document automation community:\n\n[![Blog](https://img.shields.io/badge/BLOG-FF6B6B?style=flat)](https://unstract.com/blog/) [![LinkedIn](https://img.shields.io/badge/FOLLOW%20US%20ON%20LINKEDIN-C8A2E8?style=flat)](https://www.linkedin.com/showcase/unstract/) [![Slack](https://img.shields.io/badge/SLACK-4CAF50?style=flat)](https://join-slack.unstract.com) [![X](https://img.shields.io/badge/FOLLOW%20US%20ON%20X-FFD700?style=flat)](https://twitter.com/GetUnstract)\n\n## 📊 A Note on Analytics\n\nUnstract integrates Posthog to track minimal usage analytics. Disable by setting `REACT_APP_ENABLE_POSTHOG=false` in the frontend's `.env` file.\n\n## 📜 License\n\nUnstract is released under the [AGPL-3.0 License](LICENSE).\n\n---\n\n\u003cdiv align=\"center\"\u003e\n  \u003cp\u003eBuilt with ❤️ by \u003ca href=\"https://zipstack.com\"\u003eZipstack\u003c/a\u003e\u003c/p\u003e\n  \u003cp\u003e\n    \u003ca href=\"https://unstract.com\"\u003eWebsite\u003c/a\u003e ·\n    \u003ca href=\"https://docs.unstract.com\"\u003eDocumentation\u003c/a\u003e ·\n    \u003ca href=\"https://unstract.com/pricing/\"\u003ePricing\u003c/a\u003e\n  \u003c/p\u003e\n\u003c/div\u003e\n","funding_links":[],"categories":[],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fzipstack%2Funstract","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fzipstack%2Funstract","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fzipstack%2Funstract/lists"}