{"id":13534414,"url":"https://github.com/piskvorky/smart_open","last_synced_at":"2025-12-11T21:04:11.110Z","repository":{"id":25287559,"uuid":"28713483","full_name":"piskvorky/smart_open","owner":"piskvorky","description":"Utils for streaming large files (S3, HDFS, gzip, bz2...)","archived":false,"fork":false,"pushed_at":"2025-11-08T21:38:09.000Z","size":1793,"stargazers_count":3411,"open_issues_count":22,"forks_count":386,"subscribers_count":43,"default_branch":"develop","last_synced_at":"2025-11-27T20:16:10.061Z","etag":null,"topics":["boto","bz2","file","gzip-stream","hacktoberfest","hdfs","python","s3","streaming","streaming-data","webhdfs"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/piskvorky.png","metadata":{"files":{"readme":"README.rst","changelog":"CHANGELOG.md","contributing":"CONTRIBUTING.md","funding":".github/FUNDING.yml","license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null},"funding":{"github":["piskvorky"],"patreon":null,"open_collective":null,"ko_fi":null,"tidelift":null,"community_bridge":null,"liberapay":null,"issuehunt":null,"otechie":null,"custom":null}},"created_at":"2015-01-02T13:05:52.000Z","updated_at":"2025-11-26T14:10:04.000Z","dependencies_parsed_at":"2023-02-17T07:15:47.133Z","dependency_job_id":"9595c262-b206-4ca4-a99a-0b44d2b35485","html_url":"https://github.com/piskvorky/smart_open","commit_stats":{"total_commits":922,"total_committers":131,"mean_commits":7.038167938931298,"dds":0.7169197396963123,"last_synced_commit":"283e93abfe3c8a621ba50d395918290affb14c3
1"},"previous_names":["piskvorky/smart_open","rare-technologies/smart_open"],"tags_count":68,"template":false,"template_full_name":null,"purl":"pkg:github/piskvorky/smart_open","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/piskvorky%2Fsmart_open","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/piskvorky%2Fsmart_open/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/piskvorky%2Fsmart_open/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/piskvorky%2Fsmart_open/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/piskvorky","download_url":"https://codeload.github.com/piskvorky/smart_open/tar.gz/refs/heads/develop","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/piskvorky%2Fsmart_open/sbom","scorecard":{"id":117040,"data":{"date":"2025-08-11","repo":{"name":"github.com/piskvorky/smart_open","commit":"414e2c45d0cbf49c38c677ce4b468e537386a4b0"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":4.5,"checks":[{"name":"Maintained","score":10,"reason":"16 commit(s) and 7 issue activity found in the last 90 days -- score normalized to 10","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Code-Review","score":2,"reason":"Found 6/29 approved changesets -- score normalized to 2","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Dangerous-Workflow","score":10,"reason":"no dangerous workflow patterns detected","details":null,"documentation":{"short":"Determines if the 
project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Token-Permissions","score":0,"reason":"detected GitHub workflow tokens with excessive permissions","details":["Warn: jobLevel 'contents' permission set to 'write': .github/workflows/release.yml:11","Warn: no topLevel permission defined: .github/workflows/python-package.yml:1","Warn: no topLevel permission defined: .github/workflows/release.yml:1"],"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Pinned-Dependencies","score":0,"reason":"dependency not pinned by hash detected -- score normalized to 0","details":["Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:140: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:144: update your workflow using 
https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:192: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:196: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:14: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:19: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:55: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:59: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:100: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/python-package.yml:104: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/python-package.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/release.yml:17: update your 
workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/release.yml/develop?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/release.yml:22: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/release.yml/develop?enable=pin","Warn: third-party GitHubAction not pinned by hash: .github/workflows/release.yml:35: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/release.yml/develop?enable=pin","Warn: third-party GitHubAction not pinned by hash: .github/workflows/release.yml:41: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/release.yml/develop?enable=pin","Warn: third-party GitHubAction not pinned by hash: .github/workflows/release.yml:43: update your workflow using https://app.stepsecurity.io/secureworkflow/piskvorky/smart_open/release.yml/develop?enable=pin","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:150","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:202","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:25","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:65","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:71","Warn: pipCommand not pinned by hash: .github/workflows/python-package.yml:110","Warn: pipCommand not pinned by hash: .github/workflows/release.yml:28","Info:   0 out of  12 GitHub-owned GitHubAction dependencies pinned","Info:   0 out of   3 third-party GitHubAction dependencies pinned","Info:   0 out of   7 pipCommand dependencies pinned"],"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"License","score":10,"reason":"license file 
detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: MIT License: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"Packaging","score":10,"reason":"packaging workflow detected","details":["Info: Project packages its releases by way of GitHub Actions.: .github/workflows/release.yml:8"],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Signed-Releases","score":0,"reason":"Project has not signed or included provenance with any releases.","details":["Warn: release artifact v7.3.0.post1 not signed: https://api.github.com/repos/piskvorky/smart_open/releases/229677990","Warn: release artifact v7.3.0 not signed: https://api.github.com/repos/piskvorky/smart_open/releases/229134731","Warn: release artifact v7.3.0.post1 does not have provenance: https://api.github.com/repos/piskvorky/smart_open/releases/229677990","Warn: 
release artifact v7.3.0 does not have provenance: https://api.github.com/repos/piskvorky/smart_open/releases/229134731"],"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'develop'","Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 14 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code 
analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}}]},"last_synced_at":"2025-08-16T01:35:59.375Z","repository_id":25287559,"created_at":"2025-08-16T01:35:59.375Z","updated_at":"2025-08-16T01:35:59.375Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":27670173,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-12-11T02:00:11.302Z","response_time":56,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["boto","bz2","file","gzip-stream","hacktoberfest","hdfs","python","s3","streaming","streaming-data","webhdfs"],"created_at":"2024-08-01T07:01:32.508Z","updated_at":"2025-12-11T21:04:11.095Z","avatar_url":"https://github.com/piskvorky.png","language":"Python","readme":"======================================================\nsmart_open — utils for streaming large files in Python\n======================================================\n\n\n|License|_ |CI|_ |Coveralls|_ |Downloads|_\n\n.. |License| image:: https://img.shields.io/pypi/l/smart_open.svg\n.. |CI| image:: https://github.com/piskvorky/smart_open/actions/workflows/python-package.yml/badge.svg?branch=develop\u0026event=push\n.. |Coveralls| image:: https://coveralls.io/repos/github/RaRe-Technologies/smart_open/badge.svg?branch=develop\n.. |Downloads| image:: https://pepy.tech/badge/smart-open/month\n.. 
_License: https://github.com/piskvorky/smart_open/blob/master/LICENSE\n.. _CI: https://github.com/piskvorky/smart_open/actions/workflows/python-package.yml\n.. _Coveralls: https://coveralls.io/github/RaRe-Technologies/smart_open?branch=HEAD\n.. _Downloads: https://pypi.org/project/smart-open/\n\n\nWhat?\n=====\n\n``smart_open`` is a Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, Azure Blob Storage, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats.\n\n``smart_open`` is a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top.\n\n**Python 2.7 is no longer supported. If you need Python 2.7, please use** `smart_open 1.10.1 \u003chttps://github.com/piskvorky/smart_open/releases/tag/1.10.0\u003e`_, **the last version to support Python 2.**\n\nWhy?\n====\n\nWorking with large remote files, for example using Amazon's `boto3 \u003chttps://boto3.amazonaws.com/v1/documentation/api/latest/index.html\u003e`_ Python library, is a pain.\n``boto3``'s ``Object.upload_fileobj()`` and ``Object.download_fileobj()`` methods require gotcha-prone boilerplate to use successfully, such as constructing file-like object wrappers.\n``smart_open`` shields you from that. It builds on boto3 and other remote storage libraries, but offers a **clean unified Pythonic API**. The result is less code for you to write and fewer bugs to make.\n\n\nHow?\n=====\n\n``smart_open`` is well-tested, well-documented, and has a simple Pythonic API:\n\n\n.. _doctools_before_examples:\n\n.. code-block:: python\n\n  \u003e\u003e\u003e from smart_open import open\n  \u003e\u003e\u003e\n  \u003e\u003e\u003e # stream lines from an S3 object\n  \u003e\u003e\u003e for line in open('s3://commoncrawl/robots.txt'):\n  ...    
print(repr(line))\n  ...    break\n  'User-Agent: *\\n'\n\n  \u003e\u003e\u003e # stream from/to compressed files, with transparent (de)compression:\n  \u003e\u003e\u003e for line in open('tests/test_data/1984.txt.gz', encoding='utf-8'):\n  ...    print(repr(line))\n  'It was a bright cold day in April, and the clocks were striking thirteen.\\n'\n  'Winston Smith, his chin nuzzled into his breast in an effort to escape the vile\\n'\n  'wind, slipped quickly through the glass doors of Victory Mansions, though not\\n'\n  'quickly enough to prevent a swirl of gritty dust from entering along with him.\\n'\n\n  \u003e\u003e\u003e # can use context managers too:\n  \u003e\u003e\u003e with open('tests/test_data/1984.txt.gz') as fin:\n  ...    with open('tests/test_data/1984.txt.bz2', 'w') as fout:\n  ...        for line in fin:\n  ...           fout.write(line)\n  74\n  80\n  78\n  79\n\n  \u003e\u003e\u003e # can use any IOBase operations, like seek\n  \u003e\u003e\u003e with open('s3://commoncrawl/robots.txt', 'rb') as fin:\n  ...     for line in fin:\n  ...         print(repr(line.decode('utf-8')))\n  ...         break\n  ...     offset = fin.seek(0)  # seek to the beginning\n  ...     print(fin.read(4))\n  'User-Agent: *\\n'\n  b'User'\n\n  \u003e\u003e\u003e # stream from HTTP\n  \u003e\u003e\u003e for line in open('http://example.com/index.html'):\n  ...     print(repr(line[:15]))\n  ...     break\n  '\u003c!doctype html\u003e'\n\n.. 
_doctools_after_examples:\n\nOther examples of URIs that ``smart_open`` accepts::\n\n    s3://bucket/key\n    s3://access_key_id:secret_access_key@bucket/key\n    s3://access_key_id:secret_access_key@server:port@bucket/key\n    gs://bucket/blob\n    azure://bucket/blob\n    hdfs:///path/file\n    hdfs://path/file\n    webhdfs://host:port/path/file\n    ./local/path/file\n    ~/local/path/file\n    local/path/file\n    ./local/path/file.gz\n    file:///home/user/file\n    file:///home/user/file.bz2\n    [ssh|scp|sftp]://username@host//path/file\n    [ssh|scp|sftp]://username@host/path/file\n    [ssh|scp|sftp]://username:password@host/path/file\n\n\nDocumentation\n=============\n\nThe API reference can be viewed at `help.txt \u003chttps://github.com/piskvorky/smart_open/blob/master/help.txt\u003e`__\n\nInstallation\n------------\n\n``smart_open`` supports a wide range of storage solutions. For all options, see the API reference.\nEach individual solution has its own dependencies.\nBy default, ``smart_open`` does not install any dependencies, in order to keep the installation size small.\nYou can install one or more of these dependencies explicitly using optional dependencies:\n\n    pip install smart_open[s3,gcs,azure,http,webhdfs,ssh,zst]\n\nOr, if you don't mind installing a large number of third party libraries, you can install all dependencies using::\n\n    pip install smart_open[all]\n\nBe warned that this option increases the installation size significantly, e.g. over 100MB.\n\nIf you're upgrading from ``smart_open`` versions 2.x and below, please check out the `Migration Guide \u003cMIGRATING_FROM_OLDER_VERSIONS.rst\u003e`_.\n\nBuilt-in help\n-------------\n\nTo view the API reference, use the ``help`` python builtin:\n\n.. 
code-block:: python\n\n    help('smart_open')\n\nor view `help.txt \u003chttps://github.com/piskvorky/smart_open/blob/master/help.txt\u003e`__ in your browser.\n\nMore examples\n-------------\n\nFor the sake of simplicity, the examples below assume you have all the dependencies installed, i.e. you have done::\n\n    pip install smart_open[all]\n\n.. code-block:: python\n\n    import os, boto3, botocore\n    from smart_open import open\n\n    # stream content *into* S3 (write mode) using a custom client\n    # this client is thread-safe ref https://github.com/boto/boto3/blob/1.38.41/docs/source/guide/clients.rst?plain=1#L111\n    config = botocore.client.Config(\n        max_pool_connections=64,\n        tcp_keepalive=True,\n        retries={\"max_attempts\": 6, \"mode\": \"adaptive\"},\n    )\n    client = boto3.Session(\n        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],\n        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],\n    ).client(\"s3\", config=config)\n    with open('s3://smart-open-py37-benchmark-results/test.txt', 'wb', transport_params={'client': client}) as fout:\n        bytes_written = fout.write(b'hello world!')\n        print(bytes_written)\n\n    # perform a single-part upload to S3 (saves billable API requests, and allows seek() before upload)\n    with open('s3://smart-open-py37-benchmark-results/test.txt', 'wb', transport_params={'multipart_upload': False}) as fout:\n        bytes_written = fout.write(b'hello world!')\n        print(bytes_written)\n    # now with tempfile.TemporaryFile instead of the default io.BytesIO (to reduce memory footprint)\n    import tempfile\n    with tempfile.TemporaryFile() as tmp, open('s3://smart-open-py37-benchmark-results/test.txt', 'wb', transport_params={'multipart_upload': False, 'writebuffer': tmp}) as fout:\n        bytes_written = fout.write(b'hello world!')\n        print(bytes_written)\n\n    # stream from HDFS\n    for line in open('hdfs://user/hadoop/my_file.txt', 
encoding='utf8'):\n        print(line)\n\n    # stream from WebHDFS\n    for line in open('webhdfs://host:port/user/hadoop/my_file.txt'):\n        print(line)\n\n    # stream content *into* HDFS (write mode):\n    with open('hdfs://host:port/user/hadoop/my_file.txt', 'wb') as fout:\n        fout.write(b'hello world')\n\n    # stream content *into* WebHDFS (write mode):\n    with open('webhdfs://host:port/user/hadoop/my_file.txt', 'wb') as fout:\n        fout.write(b'hello world')\n\n    # stream from a completely custom s3 server, like s3proxy:\n    for line in open('s3u://user:secret@host:port@mybucket/mykey.txt'):\n        print(line)\n\n    # Stream to Digital Ocean Spaces bucket providing credentials from boto3 profile\n    session = boto3.Session(profile_name='digitalocean')\n    client = session.client('s3', endpoint_url='https://ams3.digitaloceanspaces.com')\n    transport_params = {'client': client}\n    with open('s3://bucket/key.txt', 'wb', transport_params=transport_params) as fout:\n        fout.write(b'here we stand')\n\n    # stream from GCS\n    for line in open('gs://my_bucket/my_file.txt'):\n        print(line)\n\n    # stream content *into* GCS (write mode):\n    with open('gs://my_bucket/my_file.txt', 'wb') as fout:\n        fout.write(b'hello world')\n\n    # stream from Azure Blob Storage\n    connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']\n    transport_params = {\n        'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str),\n    }\n    for line in open('azure://mycontainer/myfile.txt', transport_params=transport_params):\n        print(line)\n\n    # stream content *into* Azure Blob Storage (write mode):\n    connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']\n    transport_params = {\n        'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str),\n    }\n    with open('azure://mycontainer/my_file.txt', 'wb', transport_params=transport_params) as fout:\n        
fout.write(b'hello world')\n\nCompression Handling\n--------------------\n\nThe top-level `compression` parameter controls compression/decompression behavior when reading and writing.\nThe supported values for this parameter are:\n\n- ``infer_from_extension`` (default behavior)\n- ``disable``\n- ``.gz``\n- ``.bz2``\n- ``.zst``\n\nBy default, ``smart_open`` determines the compression algorithm to use based on the file extension.\n\n.. code-block:: python\n\n    \u003e\u003e\u003e from smart_open import open\n    \u003e\u003e\u003e with open('tests/test_data/1984.txt.gz') as fin:\n    ...     print(fin.read(32))\n    It was a bright cold day in Apri\n\nYou can override this behavior to either disable compression, or explicitly specify the algorithm to use.\nTo disable compression:\n\n.. code-block:: python\n\n    \u003e\u003e\u003e from smart_open import open\n    \u003e\u003e\u003e with open('tests/test_data/1984.txt.gz', 'rb', compression='disable') as fin:\n    ...     print(fin.read(32))\n    b'\\x1f\\x8b\\x08\\x08\\x85F\\x94\\\\\\x00\\x031984.txt\\x005\\x8f=r\\xc3@\\x08\\x85{\\x9d\\xe2\\x1d@'\n\n\nTo specify the algorithm explicitly (e.g. for non-standard file extensions):\n\n.. code-block:: python\n\n    \u003e\u003e\u003e from smart_open import open\n    \u003e\u003e\u003e with open('tests/test_data/1984.txt.gzip', compression='.gz') as fin:\n    ...     print(fin.read(32))\n    It was a bright cold day in Apri\n\nYou can also easily add support for other file extensions and compression formats.\nFor example, to open xz-compressed files:\n\n.. code-block:: python\n\n    \u003e\u003e\u003e import lzma, os\n    \u003e\u003e\u003e from smart_open import open, register_compressor\n\n    \u003e\u003e\u003e def _handle_xz(file_obj, mode):\n    ...      return lzma.LZMAFile(filename=file_obj, mode=mode)\n\n    \u003e\u003e\u003e register_compressor('.xz', _handle_xz)\n\n    \u003e\u003e\u003e with open('tests/test_data/1984.txt.xz') as fin:\n    ...     
print(fin.read(32))\n    It was a bright cold day in Apri\n\nThis is just an example: ``lzma`` is in the standard library and is registered by default.\n\nTransport-specific Options\n--------------------------\n\n``smart_open`` supports a wide range of transport options out of the box, including:\n\n- S3\n- HTTP, HTTPS (read-only)\n- SSH, SCP and SFTP\n- WebHDFS\n- GCS\n- Azure Blob Storage\n\nEach option involves setting up its own set of parameters.\nFor example, for accessing S3, you often need to set up authentication, like API keys or a profile name.\n``smart_open``'s ``open`` function accepts a keyword argument ``transport_params`` which accepts additional parameters for the transport layer.\nHere are some examples of using this parameter:\n\n.. code-block:: python\n\n  \u003e\u003e\u003e import boto3\n  \u003e\u003e\u003e fin = open('s3://commoncrawl/robots.txt', transport_params=dict(client=boto3.client('s3')))\n  \u003e\u003e\u003e fin = open('s3://commoncrawl/robots.txt', transport_params=dict(buffer_size=1024))\n\nFor the full list of keyword arguments supported by each transport option, see the documentation:\n\n.. code-block:: python\n\n  help('smart_open.open')\n\nS3 Credentials\n--------------\n\n``smart_open`` uses the ``boto3`` library to talk to S3.\n``boto3`` has several `mechanisms \u003chttps://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html\u003e`__ for determining the credentials to use.\nBy default, ``smart_open`` will defer to ``boto3`` and let the latter take care of the credentials.\nThere are several ways to override this behavior.\n\nThe first is to pass a ``boto3.Client`` object as a transport parameter to the ``open`` function.\nYou can customize the credentials when constructing the session for the client.\n``smart_open`` will then use the session when talking to S3.\n\n.. 
code-block:: python\n\n    session = boto3.Session(\n        aws_access_key_id=ACCESS_KEY,\n        aws_secret_access_key=SECRET_KEY,\n        aws_session_token=SESSION_TOKEN,\n    )\n    client = session.client('s3', endpoint_url=..., config=...)\n    fin = open('s3://bucket/key', transport_params={'client': client})\n\nYour second option is to specify the credentials within the S3 URL itself:\n\n.. code-block:: python\n\n    fin = open('s3://aws_access_key_id:aws_secret_access_key@bucket/key', ...)\n\n*Important*: The two methods above are **mutually exclusive**. If you pass an AWS client *and* the URL contains credentials, ``smart_open`` will ignore the latter.\n\n*Important*: ``smart_open`` ignores configuration files from the older ``boto`` library.\nPort your old ``boto`` settings to ``boto3`` in order to use them with ``smart_open``.\n\nS3 Advanced Usage\n-----------------\n\nAdditional keyword arguments can be propagated to the boto3 methods that are used by ``smart_open`` under the hood using the ``client_kwargs`` transport parameter.\n\nFor instance, to upload a blob with Metadata, ACL, StorageClass, these keyword arguments can be passed to ``create_multipart_upload`` (`docs \u003chttps://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.create_multipart_upload\u003e`__).\n\n.. code-block:: python\n\n    kwargs = {'Metadata': {'version': 2}, 'ACL': 'authenticated-read', 'StorageClass': 'STANDARD_IA'}\n    fout = open('s3://bucket/key', 'wb', transport_params={'client_kwargs': {'S3.Client.create_multipart_upload': kwargs}})\n\nIterating Over an S3 Bucket's Contents\n--------------------------------------\n\nSince going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3.iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing):\n\n.. 
code-block:: python\n\n  \u003e\u003e\u003e from smart_open import s3\n  \u003e\u003e\u003e # we use workers=1 for reproducibility; you should use as many workers as you have cores\n  \u003e\u003e\u003e bucket = 'silo-open-data'\n  \u003e\u003e\u003e prefix = 'Official/annual/monthly_rain/'\n  \u003e\u003e\u003e for key, content in s3.iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3):\n  ...     print(key, round(len(content) / 2**20))\n  Official/annual/monthly_rain/2010.monthly_rain.nc 13\n  Official/annual/monthly_rain/2011.monthly_rain.nc 13\n  Official/annual/monthly_rain/2012.monthly_rain.nc 13\n\nGCS Credentials\n---------------\n``smart_open`` uses the ``google-cloud-storage`` library to talk to GCS.\n``google-cloud-storage`` uses the ``google-cloud`` package under the hood to handle authentication.\nThere are several `options \u003chttps://googleapis.dev/python/google-api-core/latest/auth.html\u003e`__ to provide\ncredentials.\nBy default, ``smart_open`` will defer to ``google-cloud-storage`` and let it take care of the credentials.\n\nTo override this behavior, pass a ``google.cloud.storage.Client`` object as a transport parameter to the ``open`` function.\nYou can `customize the credentials \u003chttps://googleapis.dev/python/storage/latest/client.html\u003e`__\nwhen constructing the client. ``smart_open`` will then use the client when talking to GCS. To follow allow with\nthe example below, `refer to Google's guide \u003chttps://cloud.google.com/storage/docs/reference/libraries#setting_up_authentication\u003e`__\nto setting up GCS authentication with a service account.\n\n.. 
code-block:: python\n\n    import os\n    from google.cloud.storage import Client\n    service_account_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']\n    client = Client.from_service_account_json(service_account_path)\n    fin = open('gs://gcp-public-data-landsat/index.csv.gz', transport_params=dict(client=client))\n\nIf you need more credential options, you can create an explicit ``google.auth.credentials.Credentials`` object\nand pass it to the Client. To create an API token for use in the example below, refer to the\n`GCS authentication guide \u003chttps://cloud.google.com/storage/docs/authentication#apiauth\u003e`__.\n\n.. code-block:: python\n\n\timport os\n\tfrom google.auth.credentials import Credentials\n\tfrom google.cloud.storage import Client\n\ttoken = os.environ['GOOGLE_API_TOKEN']\n\tcredentials = Credentials(token=token)\n\tclient = Client(credentials=credentials)\n\tfin = open('gs://gcp-public-data-landsat/index.csv.gz', transport_params={'client': client})\n\nGCS Advanced Usage\n------------------\n\nAdditional keyword arguments can be propagated to the GCS open method (`docs \u003chttps://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#google_cloud_storage_blob_Blob_open\u003e`__), which is used by ``smart_open`` under the hood, using the ``blob_open_kwargs`` transport parameter.\n\nAdditionally keyword arguments can be propagated to the GCS ``get_blob`` method (`docs \u003chttps://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.bucket.Bucket#google_cloud_storage_bucket_Bucket_get_blob\u003e`__) when in a read-mode, using the ``get_blob_kwargs`` transport parameter.\n\nAdditional blob properties (`docs \u003chttps://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#properties\u003e`__) can be set before an upload, as long as they are not read-only, using the ``blob_properties`` transport parameter.\n\n.. 
code-block:: python\n\n    open_kwargs = {'predefined_acl': 'authenticated-read'}\n    properties = {'metadata': {'version': 2}, 'storage_class': 'COLDLINE'}\n    fout = open('gs://bucket/key', 'wb', transport_params={'blob_open_kwargs': open_kwargs, 'blob_properties': properties})\n\nAzure Credentials\n-----------------\n\n``smart_open`` uses the ``azure-storage-blob`` library to talk to Azure Blob Storage.\nBy default, ``smart_open`` will defer to ``azure-storage-blob`` and let it take care of the credentials.\n\nAzure Blob Storage does not have any way of inferring credentials; therefore, passing an ``azure.storage.blob.BlobServiceClient``\nobject as a transport parameter to the ``open`` function is required.\nYou can `customize the credentials \u003chttps://docs.microsoft.com/en-us/azure/storage/common/storage-samples-python#authentication\u003e`__\nwhen constructing the client. ``smart_open`` will then use the client when talking to Azure Blob Storage. To follow along with\nthe example below, `refer to Azure's guide \u003chttps://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python#copy-your-credentials-from-the-azure-portal\u003e`__\nto setting up authentication.\n\n.. 
code-block:: python\n\n    import os\n    from azure.storage.blob import BlobServiceClient\n    azure_storage_connection_string = os.environ['AZURE_STORAGE_CONNECTION_STRING']\n    client = BlobServiceClient.from_connection_string(azure_storage_connection_string)\n    fin = open('azure://my_container/my_blob.txt', transport_params={'client': client})\n\nIf you need more credential options, refer to the\n`Azure Storage authentication guide \u003chttps://docs.microsoft.com/en-us/azure/storage/common/storage-samples-python#authentication\u003e`__.\n\nAzure Advanced Usage\n--------------------\n\nAdditional keyword arguments can be propagated to the ``commit_block_list`` method (`docs \u003chttps://azuresdkdocs.blob.core.windows.net/$web/python/azure-storage-blob/12.14.1/azure.storage.blob.html#azure.storage.blob.BlobClient.commit_block_list\u003e`__), which is used by ``smart_open`` under the hood for uploads, using the ``blob_kwargs`` transport parameter.\n\n.. code-block:: python\n\n    kwargs = {'metadata': {'version': 2}}\n    fout = open('azure://container/key', 'wb', transport_params={'blob_kwargs': kwargs})\n\nDrop-in replacement of ``pathlib.Path.open``\n--------------------------------------------\n\n``smart_open.open`` can also be used with ``Path`` objects.\nThe built-in `Path.open()` is not able to read text from compressed files, so use ``patch_pathlib`` to replace it with `smart_open.open()` instead.\nThis can be helpful when e.g. working with compressed files.\n\n.. code-block:: python\n\n    \u003e\u003e\u003e from pathlib import Path\n    \u003e\u003e\u003e from smart_open.smart_open_lib import patch_pathlib\n    \u003e\u003e\u003e\n    \u003e\u003e\u003e _ = patch_pathlib()  # replace `Path.open` with `smart_open.open`\n    \u003e\u003e\u003e\n    \u003e\u003e\u003e path = Path(\"tests/test_data/crime-and-punishment.txt.gz\")\n    \u003e\u003e\u003e\n    \u003e\u003e\u003e with path.open(\"r\") as infile:\n    ...     
print(infile.readline()[:41])\n    В начале июля, в чрезвычайно жаркое время\n\nHow do I ...?\n=============\n\nSee `this document \u003chowto.md\u003e`__.\n\nExtending ``smart_open``\n========================\n\nSee `this document \u003cextending.md\u003e`__.\n\nTesting ``smart_open``\n======================\n\n``smart_open`` comes with a comprehensive suite of unit tests.\nBefore you can run the test suite, install the test dependencies::\n\n    pip install -e .[test]\n\nNow, you can run the unit tests::\n\n    pytest tests\n\nThe tests are also run automatically with `GitHub Actions \u003chttps://github.com/piskvorky/smart_open/actions/workflows/python-package.yml\u003e`_ on every commit push \u0026 pull request.\n\nComments, bug reports\n=====================\n\n``smart_open`` lives on `Github \u003chttps://github.com/piskvorky/smart_open\u003e`_. You can file\nissues or pull requests there. Suggestions, pull requests and improvements welcome!\n\n----------------\n\n``smart_open`` is open source software released under the `MIT license \u003chttps://github.com/piskvorky/smart_open/blob/master/LICENSE\u003e`_.\nCopyright (c) 2015-now `Radim Řehůřek \u003chttps://radimrehurek.com\u003e`_.\n","funding_links":["https://github.com/sponsors/piskvorky"],"categories":["Python","HarmonyOS","hacktoberfest","Data Loading \u0026 Extraction"],"sub_categories":["Windows Manager"],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fpiskvorky%2Fsmart_open","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fpiskvorky%2Fsmart_open","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fpiskvorky%2Fsmart_open/lists"}