{"id":25347357,"url":"https://github.com/eli64s/pdflex","last_synced_at":"2025-10-07T00:56:33.444Z","repository":{"id":150159150,"uuid":"321937699","full_name":"eli64s/pdflex","owner":"eli64s","description":"CLI for merging PDF contexts.","archived":false,"fork":false,"pushed_at":"2025-03-20T16:21:50.000Z","size":476,"stargazers_count":3,"open_issues_count":0,"forks_count":1,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-09-22T18:58:45.526Z","etag":null,"topics":["pdf-automation","pdf-converter","pdf-data-extraction","pdf-document","pdf-document-parser","pdf-document-processor","pdf-extractor","pdf-generator","pdf-library","pdf-manipulation","pdf-parser","pdf-processor","pdf-python","pdf-regex","pdf-search","pdf-text-extraction","pdf-tools","python-pdf","python-pdf-tools"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/eli64s.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":"CONTRIBUTING.md","funding":null,"license":"LICENSE","code_of_conduct":"CODE_OF_CONDUCT.md","threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2020-12-16T09:49:47.000Z","updated_at":"2025-03-21T20:17:43.000Z","dependencies_parsed_at":null,"dependency_job_id":"a3d91019-0125-4d7a-91ed-8637ce326446","html_url":"https://github.com/eli64s/pdflex","commit_stats":null,"previous_names":["eli64s/pypdf","eli64s/pdflex"],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/eli64s/pdflex","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/eli64s%2Fpdflex","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/eli64s%2Fpdflex/tags","releases_
url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/eli64s%2Fpdflex/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/eli64s%2Fpdflex/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/eli64s","download_url":"https://codeload.github.com/eli64s/pdflex/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/eli64s%2Fpdflex/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":278703576,"owners_count":26031205,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-10-06T02:00:05.630Z","response_time":65,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["pdf-automation","pdf-converter","pdf-data-extraction","pdf-document","pdf-document-parser","pdf-document-processor","pdf-extractor","pdf-generator","pdf-library","pdf-manipulation","pdf-parser","pdf-processor","pdf-python","pdf-regex","pdf-search","pdf-text-extraction","pdf-tools","python-pdf","python-pdf-tools"],"created_at":"2025-02-14T14:36:46.585Z","updated_at":"2025-10-07T00:56:33.406Z","avatar_url":"https://github.com/eli64s.png","language":"Python","readme":"\u003cdiv id=\"top\" align=\"left\"\u003e\n\n\u003c!-- HEADER --\u003e\n\u003cpicture\u003e\n  \u003csource media=\"(prefers-color-scheme: dark)\" 
srcset=\"https://raw.githubusercontent.com/eli64s/pdflex/656aa96e7c4b65ca72077d170e4dcdbdd9bbbc45/docs/assets/logo-dark.svg\"\u003e\n  \u003csource media=\"(prefers-color-scheme: light)\" srcset=\"https://raw.githubusercontent.com/eli64s/pdflex/656aa96e7c4b65ca72077d170e4dcdbdd9bbbc45/docs/assets/logo-light.svg\"\u003e\n  \u003cimg alt=\"pdflex Logo\" src=\"https://raw.githubusercontent.com/eli64s/pdflex/656aa96e7c4b65ca72077d170e4dcdbdd9bbbc45/docs/assets/logo-light.svg\" width=\"100%\" style=\"max-width: 100%;\"\u003e\n\u003c/picture\u003e\n\n\u003c!-- BADGES --\u003e\n\u003cdiv align=\"left\"\u003e\n  \u003cp align=\"left\" style=\"margin-bottom: 20px;\"\u003e\n    \u003ca href=\"https://github.com/eli64s/pdflex/actions\"\u003e\n      \u003cimg src=\"https://img.shields.io/github/actions/workflow/status/eli64s/pdflex/ci.yml?label=CI\u0026style=flat\u0026logo=githubactions\u0026logoColor=white\u0026labelColor=2A2A2A\u0026color=FF1493\" alt=\"GitHub Actions\" /\u003e\n    \u003c/a\u003e\n    \u003ca href=\"https://app.codecov.io/gh/eli64s/pdflex\"\u003e\n      \u003cimg src=\"https://img.shields.io/codecov/c/github/eli64s/pdflex?label=Coverage\u0026style=flat\u0026logo=codecov\u0026logoColor=white\u0026labelColor=2A2A2A\u0026color=00F5FF\" alt=\"Coverage\" /\u003e\n    \u003c/a\u003e\n    \u003ca href=\"https://pypi.org/project/pdflex/\"\u003e\n      \u003cimg src=\"https://img.shields.io/pypi/v/pdflex?label=PyPI\u0026style=flat\u0026logo=pypi\u0026logoColor=white\u0026labelColor=2A2A2A\u0026color=3d8be1\" alt=\"PyPI Version\" /\u003e\n    \u003c/a\u003e\n    \u003ca href=\"https://github.com/eli64s/pdflex\"\u003e\n      \u003cimg src=\"https://img.shields.io/pypi/pyversions/pdflex?label=Python\u0026style=flat\u0026logo=python\u0026logoColor=white\u0026labelColor=2A2A2A\u0026color=9b26d4\" alt=\"Python Version\" /\u003e\n    \u003c/a\u003e\n    \u003ca href=\"https://opensource.org/license/mit/\"\u003e\n      \u003cimg 
src=\"https://img.shields.io/github/license/eli64s/pdflex?label=License\u0026style=flat\u0026logo=opensourceinitiative\u0026logoColor=white\u0026labelColor=2A2A2A\u0026color=4B0082\" alt=\"MIT License\"\u003e\n    \u003c/a\u003e\n  \u003c/p\u003e\n\u003c/div\u003e\n\n\u003cdiv align=\"left\"\u003e\n  \u003cimg src=\"https://raw.githubusercontent.com/eli64s/pdflex/d545ac98f5ad59ece892e638a7d3bdee593d8e88/docs/assets/line.svg\" alt=\"thematic-break\" width=\"100%\" height=\"2px\" style=\"margin: 20px 0;\"\u003e\n\u003c/div\u003e\n\n\u003c/div\u003e\n\n## What is `PDFlex?`\n\nPDFlex is a powerful PDF processing toolkit for Python. It provides robust tools for PDF validation, text extraction, merging (with custom separator pages), searching, and more—all built to streamline your PDF automation workflows.\n\n## Features\n\n- **PDF Validation:** Quickly verify if a file is a valid PDF.\n- **Text Extraction:** Extract text from PDFs using either PyMuPDF or PyPDF.\n- **Directory Processing:** Process entire directories of PDFs for text extraction.\n- **PDF Merging:** Merge multiple PDF files into one, automatically inserting a custom separator page between documents.\n  - The separator page displays the title (derived from the filename) with underscores and hyphens removed.\n  - Supports both portrait and landscape separator pages (ideal for lecture slides).\n- **PDF Searching:** Recursively search for PDFs in a directory based on filename patterns (e.g., numeric float prefixes).\n\n\n\u003c!-- ## Documentation\n\nFull documentation is available at [https://pdflex.readthedocs.io/](https://pdflex.readthedocs.io/)\n\n- [User Guide](https://pdflex.readthedocs.io/en/latest/user_guide.html)\n- [API Reference](https://pdflex.readthedocs.io/en/latest/api.html)\n- [Examples](https://pdflex.readthedocs.io/en/latest/examples.html) --\u003e\n\n---\n\n## Quick Start\n\n## Installation\n\nPDFlex is available on PyPI. 
To install using pip:\n\n```bash\npip install -U pdflex\n```\n\nAlternatively, install in an isolated environment with pipx:\n\n```bash\npipx install pdflex\n```\n\nFor the fastest installation using uv:\n\n```bash\nuv tool install pdflex\n```\n\n---\n\n## Usage\n\n### Command-Line Interface (CLI)\n\nPDFlex provides a convenient CLI for merging and searching PDFs. The CLI supports two primary commands: `merge` and `search`.\n\n#### Merge Command\n\nMerge multiple PDF files into a single document while automatically inserting a separator page before each document.\n\n**Usage:**\n\n```bash\npdflex merge /path/to/file1.pdf /path/to/file2.pdf -o merged_output.pdf\n```\n\nAdd the `--landscape` flag to create separator pages in landscape orientation:\n\n```bash\npdflex merge /path/to/file1.pdf /path/to/file2.pdf -o merged_output.pdf --landscape\n```\n\n#### Search and Merge Command\n\nSearch for PDF files in a directory based on filename filters (or search for lecture slides with numeric float prefixes) and merge them into one PDF.\n\n**Usage:**\n\n- **General Search:**\n\n  ```bash\n  pdflex search /path/to/search -o merged_output.pdf --prefix \"Chapter\" --suffix \".pdf\"\n  ```\n\n- **Lecture Slides Merge:**\n  (Merges all PDFs whose filenames start with a numeric float prefix like `1.2_`, `3.2_`, etc., in sorted order. Separator pages will be in landscape orientation.)\n\n  ```bash\n  pdflex search /path/to/algorithms-and-computation -o merged_lectures.pdf --lecture\n  ```\n\n### Python API Usage\n\nYou can also use PDFlex directly from your Python code. 
Below are examples for some common tasks.\n\n#### Merging PDFs with Separator Pages\n\n```python\nfrom pathlib import Path\nfrom pdflex.merge import merge_pdfs\n\n# List of PDF file paths to merge\npdf_files = [\n    \"/path/to/document1.pdf\",\n    \"/path/to/document2.pdf\"\n]\n\n# Merge files, using landscape separator pages (ideal for lecture slides)\nmerge_pdfs(pdf_files, output_path=\"merged_output.pdf\", landscape=True)\n```\n\n#### Searching for PDFs by Filename\n\n```python\nfrom pdflex.search import search_pdfs, search_numeric_prefixed_pdfs\n\n# General search: Find PDFs that start with a prefix and/or end with a suffix\npdf_list = search_pdfs(\"/path/to/search\", prefix=\"Chapter\", suffix=\".pdf\")\nprint(\"Found PDFs:\", pdf_list)\n\n# Lecture slides: Find PDFs with numeric float prefixes (e.g., \"1.2_Intro.pdf\")\nlecture_slides = search_numeric_prefixed_pdfs(\"/path/to/algorithms-and-computation\")\nprint(\"Found lecture slides:\", lecture_slides)\n```\n\n\u003c!--\n#### Extracting Text from a PDF\n\n```python\nfrom pdflex import extract_text_from_pdf\n\n# Extract text from a PDF using the auto-detection method (tries PyMuPDF then falls back to PyPDF)\noutput_txt = extract_text_from_pdf(\"invoice.pdf\", method=\"auto\")\nprint(f\"Extracted text saved to: {output_txt}\")\n```\n\n#### Processing an Entire Directory\n\n```python\nfrom pdflex import process_directory\n\n# Process all PDFs in a directory and extract their text to corresponding .txt files.\nprocess_directory(\"/path/to/pdf_directory\", output_dir=\"/path/to/text_outputs\")\n```\n\n---\n\n## API Reference\n\nFor detailed API documentation, please refer to the [API Reference](https://pdflex.readthedocs.io/en/latest/api.html).\n\n### Exceptions\n\n- **PDFlexError:** Raised for any error during PDF processing (e.g., invalid PDF, extraction failure).\n\n### Modules Overview\n\n- **`pdflex.merge`**\n  Contains functions to merge PDFs, insert separator pages (with customizable orientation and 
title cleaning), and write the final merged document.\n\n- **`pdflex.search`**\n  Provides functions to recursively search for PDFs in a directory based on filename patterns, including numeric float prefixes for lecture slides.\n\n- **`pdflex.extract`** (and similar)\n  Functions for extracting text using PyMuPDF or PyPDF, validating PDF files, and processing directories of PDFs.\n\n- **`pdflex.cli`**\n  Command-line interface that exposes the `merge` and `search` commands, complete with rich console output.\n--\u003e\n\n---\n\n## Contributing\n\nContributions are welcome! Whether it's bug reports, feature requests, or code contributions, please feel free to:\n\n1. Open an [issue][github-issues]\n2. Submit a [pull request][github-pulls]\n3. Improve documentation.\n4. Share your ideas!\n\n---\n\n## Acknowledgments\n\nThis project is built upon several awesome PDF open-source projects:\n\n- [pypdf](https://github.com/py-pdf/pypdf)\n- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)\n- [pdfplumber](https://github.com/jsvine/pdfplumber)\n- [reportlab](https://www.reportlab.com/opensource/)\n\n---\n\n## License\n\nPDFlex is released under the [MIT][mit-license] license. 
\u003cbr /\u003e\nCopyright (c) 2020 to present [PDFlex][pdflex] and contributors.\n\n\u003cdiv align=\"left\"\u003e\n  \u003ca href=\"#top\"\u003e\n    \u003cimg src=\"https://raw.githubusercontent.com/eli64s/pdflex/607d295f58914fc81a5b71fd994af90901b6433c/docs/assets/button.svg\" width=\"100px\" height=\"100px\" alt=\"Return to Top\"\u003e\n  \u003c/a\u003e\n\u003c/div\u003e\n\n\u003cdiv align=\"left\"\u003e\n  \u003cimg src=\"https://raw.githubusercontent.com/eli64s/pdflex/d545ac98f5ad59ece892e638a7d3bdee593d8e88/docs/assets/line.svg\" alt=\"thematic-break\" width=\"100%\" height=\"2px\" style=\"margin: 20px 0;\"\u003e\n\u003c/div\u003e\n\n\u003c!-- REFERENCE LINKS --\u003e\n\n\u003c!-- PROJECT RESOURCES --\u003e\n[pypi]: https://pypi.org/project/pdflex/\n[pdflex]: https://github.com/eli64s/pdflex\n[github-issues]: https://github.com/eli64s/pdflex/issues\n[github-pulls]: https://github.com/eli64s/pdflex/pulls\n[mit-license]: https://github.com/eli64s/pdflex/blob/main/LICENSE\n[examples]: https://github.com/eli64s/pdflex/tree/main/docs/examples\n\n\u003c!-- DEV TOOLS --\u003e\n[python]: https://www.python.org/\n[pip]: https://pip.pypa.io/en/stable/\n[pipx]: https://pipx.pypa.io/stable/\n[uv]: https://docs.astral.sh/uv/\n[mkdocs]: https://www.mkdocs.org/\n[mkdocs.yml]: https://www.mkdocs.org/user-guide/configuration/\n","funding_links":[],"categories":[],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Feli64s%2Fpdflex","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Feli64s%2Fpdflex","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Feli64s%2Fpdflex/lists"}