{"id":21482845,"url":"https://github.com/ibmstreams/streamsx.document","last_synced_at":"2025-10-11T05:11:51.870Z","repository":{"id":18461251,"uuid":"21655600","full_name":"IBMStreams/streamsx.document","owner":"IBMStreams","description":"(Incubation) This toolkit allows extract text and metadata from documents in a binary formats such as PDF, Word, Office, etc","archived":false,"fork":false,"pushed_at":"2020-07-10T10:18:21.000Z","size":62459,"stargazers_count":6,"open_issues_count":2,"forks_count":1,"subscribers_count":5,"default_branch":"develop","last_synced_at":"2025-07-28T00:04:34.612Z","etag":null,"topics":["extractor","ibm-streams","stream-processing"],"latest_commit_sha":null,"homepage":"http://ibmstreams.github.io/streamsx.document","language":"Java","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"other","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/IBMStreams.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE.md","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2014-07-09T14:37:49.000Z","updated_at":"2024-08-14T02:45:59.000Z","dependencies_parsed_at":"2022-09-15T20:50:26.670Z","dependency_job_id":null,"html_url":"https://github.com/IBMStreams/streamsx.document","commit_stats":null,"previous_names":[],"tags_count":1,"template":false,"template_full_name":null,"purl":"pkg:github/IBMStreams/streamsx.document","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/IBMStreams%2Fstreamsx.document","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/IBMStreams%2Fstreamsx.document/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/IBMStreams%2Fstreamsx.document/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/IBMStreams%2Fstreamsx.document/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/IBMStreams","download_url":"https://codeload.github.com/IBMStreams/streamsx.document/tar.gz/refs/heads/develop","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/IBMStreams%2Fstreamsx.document/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":279006349,"owners_count":26084084,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-10-11T02:00:06.511Z","response_time":55,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["extractor","ibm-streams","stream-processing"],"created_at":"2024-11-23T12:38:15.509Z","updated_at":"2025-10-11T05:11:51.836Z","avatar_url":"https://github.com/IBMStreams.png","language":"Java","funding_links":[],"categories":[],"sub_categories":[],"readme":"streamsx.document\n=================\n\nThis toolkit allows extract text and metadata from documents in a binary formats\nsuch as PDF, Word, Office, etc. For this purpose the toolkit implements a DocumentSource operator.\n\nThe DocumentSource operator utilized multiple third party and open source document extraction technologies, \nand can be enhanced with additional commercial /proprietary extractors. The operator automatically determines \nthe document MIME type and delegated the extraction request to appropriate extractor plugin.\n\nOut of the box the toolkit provides the following extractors:\n *  Apache Tika – The primary extractor for binary documents such as Office documents (Word, Powerpoint, Excel), HTML files, etc.\n *  PDFBox – For handling Acrobat PDF files\n *  TrueZIP – ZIP, JAR, TAR, GZ, GZIP files and other archive files\n *  JUnrar – RAR files\n *  Plain Text – Text files of various encodings (ASCII, UTF-8, UTF-16, local encodings)\n\nThe toolkit's home page is available at:\nhttp://ibmstreams.github.io/streamsx.document/\n\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fibmstreams%2Fstreamsx.document","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fibmstreams%2Fstreamsx.document","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fibmstreams%2Fstreamsx.document/lists"}