{"id":13699646,"url":"https://github.com/stestagg/pytubes","last_synced_at":"2025-04-06T07:10:12.305Z","repository":{"id":57458108,"uuid":"123112399","full_name":"stestagg/pytubes","owner":"stestagg","description":"A module for getting data into python from large data sources","archived":false,"fork":false,"pushed_at":"2024-03-13T11:14:34.000Z","size":2175,"stargazers_count":175,"open_issues_count":5,"forks_count":20,"subscribers_count":8,"default_branch":"master","last_synced_at":"2025-03-30T06:04:20.003Z","etag":null,"topics":["cpp","cpp11","cython","data","numpy","python"],"latest_commit_sha":null,"homepage":"","language":"C++","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/stestagg.png","metadata":{"files":{"readme":"README.rst","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2018-02-27T10:23:09.000Z","updated_at":"2025-03-16T11:59:28.000Z","dependencies_parsed_at":"2024-10-15T01:21:07.057Z","dependency_job_id":"9328931a-8bec-417f-93e1-5b3582deb3b5","html_url":"https://github.com/stestagg/pytubes","commit_stats":{"total_commits":109,"total_committers":3,"mean_commits":"36.333333333333336","dds":0.08256880733944949,"last_synced_commit":"003b2a5045417317667559ce39ddd0a005208e70"},"previous_names":[],"tags_count":2,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/stestagg%2Fpytubes","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/stestagg%2Fpytubes/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/stestagg%2Fpytubes/releases","manifests_url":"https://re
pos.ecosyste.ms/api/v1/hosts/GitHub/repositories/stestagg%2Fpytubes/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/stestagg","download_url":"https://codeload.github.com/stestagg/pytubes/tar.gz/refs/heads/master","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":247445668,"owners_count":20939958,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["cpp","cpp11","cython","data","numpy","python"],"created_at":"2024-08-02T20:00:40.247Z","updated_at":"2025-04-06T07:10:12.266Z","avatar_url":"https://github.com/stestagg.png","language":"C++","readme":"pytubes\n=======\n\nSource: https://github.com/stestagg/pytubes\n\nPytubes is a library that optimizes loading datasets into memory.\n\nAt its core is a set of specialized C++ classes that can be chained together to load and manipulate data using a standard iterator pattern. 
Around this there is a cython extension module that makes defining and configuring a tube simple and straight-forward.\n\n \n\n\nSimple Example\n--------------\n\n\u003e\u003e\u003e from tubes import Each\n\u003e\u003e\u003e import glob\n\u003e\u003e\u003e tube = (Each(glob.glob(\"*.json\"))   # Iterate over some filenames\n        .read_files()                   # Read each file, chunk by chunk\n        .split()                        # Split the file, line-by-line\n        .json()                         # parse json\n        .get('country_code', 'null'))   # extract field named 'country_code'\n\u003e\u003e\u003e set(tube)                           # collect results in a set\n{'A1', 'AD', 'AE', 'AF', 'AG', 'AL', 'AM', 'AO', 'AP', ...}\n\nMore Complex Example\n--------------------\n\n\u003e\u003e\u003e from tubes import Each\n\u003e\u003e\u003e import glob\n\n\u003e\u003e\u003e x = (Each(glob.glob('*.jsonz'))\n        .map_files()\n        .gunzip()\n        .split(b'\\n')\n        .json()\n        .enumerate()\n        .skip_unless(lambda x: x.slot(1).get('country_code', '\"\"').to(str).equals('GB'))\n        .multi(lambda x: (\n            x.slot(0),\n            x.slot(1).get('timestamp', 'null'),\n            x.slot(1).get('country_code', 'null'),\n            x.slot(1).get('url', 'null'),\n            x.slot(1).get('file', '{}').get('filename', 'null'),\n            x.slot(1).get('file', '{}').get('project'),\n            x.slot(1).get('details', '{}').get('installer', '{}').get('name', 'null'),\n            x.slot(1).get('details', '{}').get('python', 'null'),\n            x.slot(1).get('details', '{}').get('system', 'null'),\n            x.slot(1).get('details', '{}').get('system', '{}').get('name', 'null'),\n            x.slot(1).get('details', '{}').get('cpu', 'null'),\n            x.slot(1).get('details', '{}').get('distro', '{}').get('libc', '{}').get('lib', 'null'),\n            x.slot(1).get('details', '{}').get('distro', '{}').get('libc', 
'{}').get('version', 'null'),\n        ))\n    )\n\u003e\u003e\u003e print(list(x)[-3])\n(15,612,767, '2017-12-14 09:33:31 UTC', 'GB', '/packages/29/9b/25ef61e948321296f029f53c9f67cc2b54e224db509eb67ce17e0df6044a/certifi-2017.11.5-py2.py3-none-any.whl', 'certifi-2017.11.5-py2.py3-none-any.whl', 'certifi', 'pip', '2.7.5', {'name': 'Linux', 'release': '2.6.32-696.10.3.el6.x86_64'}, 'Linux', 'x86_64', 'glibc', '2.17')\n","funding_links":[],"categories":["C++"],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fstestagg%2Fpytubes","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fstestagg%2Fpytubes","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fstestagg%2Fpytubes/lists"}