{"id":25992952,"url":"https://github.com/elkronos/pdfscribe","last_synced_at":"2026-05-27T13:38:47.529Z","repository":{"id":280079876,"uuid":"940932712","full_name":"elkronos/PDFScribe","owner":"elkronos","description":null,"archived":false,"fork":false,"pushed_at":"2025-03-01T04:49:06.000Z","size":0,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-03-01T05:25:31.553Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":null,"language":"R","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/elkronos.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2025-03-01T04:48:09.000Z","updated_at":"2025-03-01T04:49:09.000Z","dependencies_parsed_at":"2025-03-01T05:35:35.419Z","dependency_job_id":null,"html_url":"https://github.com/elkronos/PDFScribe","commit_stats":null,"previous_names":["elkronos/pdfscribe"],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/elkronos/PDFScribe","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/elkronos%2FPDFScribe","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/elkronos%2FPDFScribe/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/elkronos%2FPDFScribe/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/elkronos%2FPDFScribe/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/elkronos","download_url":"https://codeload.github.com/elkronos/PDFScribe/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/elkronos%2FPDFScribe/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":33568857,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-05-26T15:22:16.424Z","status":"online","status_checked_at":"2026-05-27T02:00:06.184Z","response_time":53,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":[],"created_at":"2025-03-05T14:32:51.411Z","updated_at":"2026-05-27T13:38:47.502Z","avatar_url":"https://github.com/elkronos.png","language":"R","funding_links":[],"categories":[],"sub_categories":[],"readme":"# PDFScribe\n\nPDFScribe is an R-based package designed to process PDF documents. It extracts text (and images, if needed), samples content from the PDFs, and automatically builds structured requests for AI analysis. The package supports processing PDFs stored locally or on Amazon S3, leverages parallel processing to improve performance, and incorporates robust error handling and logging.\n\n## Features\n\n- **PDF Extraction:** Reads and validates PDF files using extraction tools.\n- **Content Sampling:** Samples pages using reservoir sampling and extracts key “anchor” text.\n- **AI Prompt Generation:** Automatically constructs structured prompts for AI analysis.\n- **Local \u0026 S3 Integration:** Processes PDFs from local directories and S3.\n- **Parallel Processing:** Utilizes multiple cores for concurrent PDF processing.\n- **Robust Logging \u0026 Error Handling:** Provides detailed logs and retry mechanisms for API calls and file operations.\n- **Comprehensive Testing:** Includes a suite of UAT tests using the `testthat` framework.\n\n## Installation\n\n1. **Clone the Repository:**\n\n   ```bash\n   git clone https://github.com/yourusername/PDFScribe.git\n   cd PDFScribe\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Felkronos%2Fpdfscribe","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Felkronos%2Fpdfscribe","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Felkronos%2Fpdfscribe/lists"}