{"id":34033659,"url":"https://github.com/giganticode/codeprep","last_synced_at":"2026-04-02T01:04:06.201Z","repository":{"id":52675787,"uuid":"179685171","full_name":"giganticode/codeprep","owner":"giganticode","description":"A toolkit for pre-processing large source code corpora","archived":false,"fork":false,"pushed_at":"2022-09-30T18:57:29.000Z","size":1631,"stargazers_count":45,"open_issues_count":8,"forks_count":11,"subscribers_count":3,"default_branch":"master","last_synced_at":"2026-01-05T22:07:22.061Z","etag":null,"topics":["language-modeling","mining-software-repositories","natural-language-processing","source-code-analysis","word-segmentation"],"latest_commit_sha":null,"homepage":null,"language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/giganticode.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSES/Apache-2.0.txt","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2019-04-05T13:20:24.000Z","updated_at":"2025-12-27T06:56:31.000Z","dependencies_parsed_at":"2022-08-20T14:20:23.590Z","dependency_job_id":null,"html_url":"https://github.com/giganticode/codeprep","commit_stats":null,"previous_names":["giganticode/dataprep"],"tags_count":14,"template":false,"template_full_name":null,"purl":"pkg:github/giganticode/codeprep","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/giganticode%2Fcodeprep","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/giganticode%2Fcodeprep/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/giganticode%2Fcodeprep/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/giganticode%2Fcodeprep/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/giganticode","download_url":"https://codeload.github.com/giganticode/codeprep/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/giganticode%2Fcodeprep/sbom","scorecard":{"id":426523,"data":{"date":"2025-08-11","repo":{"name":"github.com/giganticode/codeprep","commit":"0f41307f7a9ad545e5ec0cc9552a0144328f2422"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":1.9,"checks":[{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Code-Review","score":0,"reason":"Found 1/29 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and 
uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSES/Apache-2.0.txt:0","Info: FSF or OSI recognized license: Apache License 2.0: LICENSES/Apache-2.0.txt:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection 
settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"Vulnerabilities","score":1,"reason":"9 existing vulnerabilities detected","details":["Warn: Project is vulnerable to: PYSEC-2022-288 / GHSA-6hrg-qmvc-2xh8","Warn: Project is vulnerable to: PYSEC-2021-356 / GHSA-2ww3-fxvq-293j","Warn: Project is vulnerable to: PYSEC-2024-167 / GHSA-cgvx-9447-vcch","Warn: Project is vulnerable to: PYSEC-2021-859 / GHSA-f8m6-h2c7-8h9x","Warn: Project is vulnerable to: PYSEC-2022-5 / GHSA-rqjh-jp2r-59cj","Warn: Project is vulnerable to: PYSEC-2021-140 / GHSA-9w8r-397f-prfh","Warn: Project is vulnerable to: PYSEC-2023-117 / GHSA-mrwq-x4v8-fh7p","Warn: Project is vulnerable to: PYSEC-2021-141 / GHSA-pq64-v7f5-gqh8","Warn: Project is vulnerable to: GHSA-g7vv-2v7x-gj9p"],"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 5 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}}]},"last_synced_at":"2025-08-19T02:21:21.911Z","repository_id":52675787,"created_at":"2025-08-19T02:21:21.911Z","updated_at":"2025-08-19T02:21:21.911Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":31293631,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-04-01T21:15:39.731Z","status":"ssl_error","status_checked_at":"2026-04-01T21:15:34.046Z","response_time":53,"last_error":"SSL_read: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["language-modeling","mining-software-repositories","natural-language-processing","source-code-analysis","word-segmentation"],"created_at":"2025-12-13T19:17:13.171Z","updated_at":"2026-04-02T01:04:06.175Z","avatar_url":"https://github.com/giganticode.png","language":"Python","funding_links":[],"categories":[],"sub_categories":[],"readme":"\u003c!--\nSPDX-FileCopyrightText: 2020 Hlib Babii \u003chlibbabii@gmail.com\u003e\n\nSPDX-License-Identifier: Apache-2.0\n--\u003e\n\n# Codeprep\n\n[![Build Status](https://travis-ci.org/giganticode/codeprep.svg?branch=master)](https://travis-ci.org/giganticode/codeprep)\n[![Maintainability](https://api.codeclimate.com/v1/badges/64c9b107bc09fdb1b3b1/maintainability)](https://codeclimate.com/github/giganticode/codeprep/maintainability)\n[![Test Coverage](https://api.codeclimate.com/v1/badges/64c9b107bc09fdb1b3b1/test_coverage)](https://codeclimate.com/github/giganticode/codeprep/test_coverage)\n[![PyPI version fury.io](https://badge.fury.io/py/codeprep.svg)](https://pypi.python.org/pypi/codeprep/)\n\n**This is a tool for preprocessing source code corpora 
Supported modeling choices are:
* Splitting algorithm (no identifier splitting, camel-case splitting, snake-case splitting, BPE (byte-pair encoding), number-splitting, ronin: http://joss.theoj.org/papers/10.21105/joss.00653);
* Number of merges if using BPE;
* Ignoring/preserving string literals;
* Ignoring/preserving comments;
* Preserving case/lowercasing;
* Preserving/ignoring newlines and tabs;
* Applying/not applying stemming after basic splitting.

# Getting started

Make sure you have Python >= 3.6 installed on your system and that pip, setuptools, and wheel are up to date.
```bash
python --version
python -m pip install --upgrade pip setuptools wheel
```

Install the **codeprep** library:
```bash
pip install codeprep
```

In order to run the **ronin** algorithm, you will additionally have to install the Spiral module (https://github.com/casics/spiral/):
```bash
pip install git+https://github.com/casics/spiral.git
```

The tool can be used **as a Python library** as well as a standalone module runnable with a **CLI**. You can pass either the text itself or the path to a dataset to be preprocessed. When using the Python API, import methods from the `codeprep.api.text` module for the former option and from `codeprep.api.corpus` for the latter. Below you can see the general patterns of usage.

Python API
```python
>>> import codeprep.api.text as cp
>>> cp.<command>('Some code to be split')
```

```python
>>> import codeprep.api.corpus as cp
>>> cp.<command>('/path/to/the/dataset')
```

CLI
```bash
codeprep <command> "Some code to be split"
```

```bash
codeprep <command> --path /path/to/the/dataset
```

Hereafter we will demonstrate the usage as a Python library. The CLI is analogous to the Python API; you can find the documentation on how to use it [here](codeprep/cli/spec.py).

## Usage examples

### Basic splitting
Tokenization + CamelCase- and snake_case-splitting:

```python
>>> import codeprep.api.text as cp
>>> input_code = '''void test_WordUeberraschungPrinter() {
...     if (eps >= 0.345e+4) { // FIXME
...         printWord("     ...     Überraschung");
...     }
... }'''
>>> cp.basic(input_code)
['void', '<w>', 'test', '_', 'Word', 'Ueberraschung', 'Printer', '</w>', '(', ')', '{', '\n',
'\t', 'if', '(', 'eps', '>', '=', '0', '.', '<w>', '345', 'e', '</w>', '+', '4', ')', '{', '/', '/', 'FIXME', '\n',
'\t', '\t', '<w>', 'print', 'Word', '</w>', '(', '"', '\t', '.', '.', '.', '\t', 'Überraschung', '"', ')', ';', '\n',
'\t', '}', '\n',
'}']
```
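In the output above, identifiers that were split are delimited by `<w>` and `</w>` boundary tokens. As a minimal illustrative sketch (the `merge_subtokens` helper below is not part of codeprep's API), full identifiers can be reassembled from such a sequence like this:

```python
from typing import List

def merge_subtokens(tokens: List[str]) -> List[str]:
    """Reassemble identifiers that were split between <w> and </w> markers."""
    merged, buffer, inside = [], [], False
    for token in tokens:
        if token == '<w>':        # start of a split identifier
            inside, buffer = True, []
        elif token == '</w>':     # end of a split identifier
            merged.append(''.join(buffer))
            inside = False
        elif inside:
            buffer.append(token)
        else:
            merged.append(token)
    return merged

# merge_subtokens(['<w>', 'print', 'Word', '</w>', '(', ')']) == ['printWord', '(', ')']
```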
### Tokenize but don't split identifiers

```python
>>> import codeprep.api.text as cp
>>> input_code = '''void test_WordUeberraschungPrinter() {
...     if (eps >= 0.345e+4) { // FIXME
...         printWord("     ...     Überraschung");
...     }
... }'''
>>> cp.nosplit(input_code)
['void', 'test_WordUeberraschungPrinter', '(', ')', '{', '\n',
'\t', 'if', '(', 'eps', '>', '=', '0', '.', '345e', '+', '4', ')', '{', '/', '/', 'FIXME', '\n',
'\t', '\t', 'printWord', '(', '"', '\t', '.', '.', '.', '\t', 'Überraschung', '"', ')', ';', '\n',
'\t', '}', '\n',
'}']
```

### BPE (Byte-Pair Encoding)

The following code does **camelCase-** and **snake_case-** splitting and applies **BPE with 10k merges** on top:

```python
>>> import codeprep.api.text as cp
>>> input_code = '''void test_WordUeberraschungPrinter() {
...     if (eps >= 0.345e+4) { // FIXME
...         printWord("     ...     Überraschung");
...     }
... }'''
>>> cp.bpe(input_code, bpe_codes_id='10k')
['v', 'oid</t>', 'test_', 'Word', 'U', 'eb', 'err', 'as', 'ch', 'un', 'g', 'Print', 'er</t>', '(</t>', ')</t>', '{</t>', '\n',
'\t', 'i', 'f</t>', '(</t>', 'e', 'ps</t>', '></t>', '=</t>', '0</t>', '.</t>', '34', '5', 'e</t>', '+</t>', '4</t>', ')</t>', '{</t>', '/</t>', '/</t>', 'FIX', 'M', 'E</t>', '\n',
'\t', '\t', 'print', 'Word</t>', '(</t>', '"</t>', '\t', '.</t>', '.</t>', '.</t>', '\t', 'Ü', 'b', 'err', 'as', 'ch', 'un', 'g</t>', '"</t>', ')</t>', ';</t>', '\n',
'\t', '}</t>', '\n',
'}</t>']
```

By default, **codeprep** does BPE using BPE codes learned on [the GitHub Java Corpus](http://groups.inf.ed.ac.uk/cup/javaGithub/). The argument `bpe_codes_id='10k'` tells **codeprep** to use 10,000 BPE merges. Other possible values are `1k` and `5k` (1,000 and 5,000 merges respectively). Please refer to the section [Learning custom BPE codes](#Learning-custom-BPE-codes) to train custom BPE codes.

**For other commands and options like `chars`, `--split-numbers`, `--ronin`, `--stem`, please refer to the [docs](codeprep/cli/spec.py).**

## Calculate vocabulary
Set the `calc_vocab` param to `True` when calling a preprocessing method to calculate the vocabulary of the preprocessed corpus, e.g.:
```python
>>> import codeprep.api.corpus as cp
>>> cp.basic('/path/to/train/on', calc_vocab=True)
...
Vocab is available at /path/to/vocab
```
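Conceptually, the vocabulary is the set of distinct tokens in the preprocessed corpus together with their frequencies. The following sketch only illustrates that idea on a single snippet; the real `calc_vocab` option runs over a whole corpus on disk and writes the result to a file:

```python
from collections import Counter

import codeprep.api.text as cp

# Counting distinct tokens in one preprocessed snippet illustrates
# what corpus-level vocabulary calculation does at scale.
tokens = cp.basic('void printWord() { printWord(); }')
vocab = Counter(tokens)
print(vocab.most_common(5))
```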
## Learning custom BPE codes
If you don't want to use pre-trained BPE codes, it's possible to train custom ones. For example, to train 10,000 merges on the corpus located at the path `/path/to/train/on`, the following command should be run (CLI only):

```bash
codeprep learn-bpe 10000 -p /path/to/train/on --id custom-bpe-codes
```

Now it is possible to do BPE splitting by running the bpe command with any number of merges from 0 to 10,000 (for example, with 3,500 merges):

```bash
codeprep bpe custom-bpe-codes-3500 -p /path/to/preprocess
```

Before the BPE codes are trained, the [basic preprocessing](#basic-splitting) is done, which can also be tuned with the arguments described in the section [Tweaking preprocessing](#tweaking-preprocessing).


## Additional options
### Tweaking preprocessing
You can pass the following parameters with a `True` value (the default value for all of them is `False`) to tweak the way the input is preprocessed:

 * `no_str` - replace string literals with `<string>` placeholders.
 * `no_com` - replace comments with `<comment>` placeholders.
 * `no_spaces` - remove newlines and tabs.
 * `no_unicode` - replace words containing non-ASCII characters with `<non-en>` placeholders.
 * `no_case` - lowercase words and encode the case information in `<Cap>` and `<CAPS>` tokens.
```python
>>> import codeprep.api.text as cp
>>> input_code = '''void test_WordUeberraschungPrinter() {
...     if (eps >= 0.345e+4) { // FIXME
...         printWord("     ...     Überraschung");
...     }
... }'''
>>> cp.basic(input_code, no_spaces=True, no_unicode=True, no_case=True, no_com=True, no_str=True)
['void', '<w>', 'test', '_', '<Cap>', 'word', '<Cap>', 'ueberraschung', '<Cap>', 'printer', '</w>', '(', ')', '{',
'if', '(', 'eps', '>', '=', '0', '.', '<w>', '345', 'e', '</w>', '+', '4', ')', '{', '/', '/', '<CAPS>', 'fixme',
'<w>', 'print', '<Cap>', 'word', '</w>', '(', '"', '.', '.', '.', '<Cap>', '<non-en>', '"', ')', ';',
'}',
'}']
```

Similar params can be specified as the switches `--no-str`, `--no-com`, `--no-spaces`, `--no-unicode`, `--no-case` in CLI commands.
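To make the case encoding concrete: in the output above, `<Cap>` marks a token whose original first letter was uppercase, and `<CAPS>` marks a token that was fully uppercase. A minimal sketch of how the markers could be decoded (the `restore_case` helper is illustrative, not a codeprep API):

```python
from typing import List

def restore_case(tokens: List[str]) -> List[str]:
    """Undo the <Cap>/<CAPS> encoding produced with no_case=True."""
    restored, pending = [], None
    for token in tokens:
        if token in ('<Cap>', '<CAPS>'):  # the marker applies to the next token
            pending = token
        elif pending == '<Cap>':
            restored.append(token[:1].upper() + token[1:])
            pending = None
        elif pending == '<CAPS>':
            restored.append(token.upper())
            pending = None
        else:
            restored.append(token)
    return restored

# restore_case(['<Cap>', 'word', '<CAPS>', 'fixme']) == ['Word', 'FIXME']
```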
### Specifying the language
Unless explicitly specified, **codeprep** will assume the language is Java. To make sure the input is preprocessed as intended, it is always **highly recommended** to specify it:
```python
>>> import codeprep.api.text as cp
>>> cp.bpe("volatile", '1k')
['volatile']
>>> cp.bpe("volatile", '1k', extension="py")
['v', 'ol', 'a', 'ti', 'le</t>']
# Since 'volatile' is a keyword in Java, it is represented as one token, unlike in Python,
# where it is rarely used as an identifier and is therefore represented as multiple subtokens.
```

When preprocessing a corpus, `codeprep` identifies the language based on the file extension. If you want only files with certain extensions to be preprocessed, you can specify the `--ext` param:
```bash
codeprep basic --path /path/to/be/preprocessed --ext "java"

# or if you want to pre-process multiple types of files:
codeprep basic --path /path/to/be/preprocessed --ext "java|c|py|js"
```
### Miscellaneous
You can specify the path where the preprocessed corpus will be written:
```bash
codeprep basic --path /path/to/preprocess --output-path /path/to/output
```

To print logs with log level DEBUG and higher to stdout:
```bash
codeprep basic --path /path/to/preprocess --verbose
```

## Getting Help
To get help on commands and options:

```bash
codeprep --help
```

## Paper

This library was built to run experiments for our paper accepted at ICSE 2020: [Big Code != Big Vocabulary: Open-Vocabulary Models for Source Code](https://arxiv.org/pdf/2003.07914.pdf)

If you use the library or the results, please cite the paper:

```
@article{karampatsis2020big,
  title={Big Code != Big Vocabulary: Open-Vocabulary Models for Source Code},
  author={Karampatsis, Rafael-Michael and Babii, Hlib and Robbes, Romain and Sutton, Charles and Janes, Andrea},
  journal={arXiv preprint arXiv:2003.07914},
  year={2020}
}
```


# Advanced

### Caching

When preprocessing a dataset, **codeprep** first parses the source code and converts it into an internal representation, which is then converted into a preprocessed dataset according to the provided parameters. The intermediate representation is cached, so that when the same dataset is preprocessed again with different parameters, **codeprep** (provided no changes have been made to the dataset) uses the cache rather than parsing the source code again.

To store the cache, **codeprep** uses the directory `$XDG_CACHE_HOME/codeprep/<codeprep_version>` if `$XDG_CACHE_HOME` is set, and `$HOME/.cache/codeprep/<codeprep_version>` otherwise.

Removing the cache will not change the final result; it will, however, make pre-processing slower.
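The cache-location rule can be summarized in a few lines. A minimal sketch, assuming only the behavior described above (the `cache_dir` helper and its `version` argument are illustrative, not codeprep internals):

```python
import os

def cache_dir(version: str) -> str:
    """Resolve the cache directory: $XDG_CACHE_HOME if set, else ~/.cache."""
    base = os.environ.get('XDG_CACHE_HOME') or os.path.join(os.path.expanduser('~'), '.cache')
    return os.path.join(base, 'codeprep', version)

# e.g. cache_dir('1.0.3') -> '<home>/.cache/codeprep/1.0.3' when XDG_CACHE_HOME is unset
```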
# Releases

## 1.0.3
- Add more flexibility in the versions of dependencies

## 1.0.1
- Fix training of custom BPE codes (thanks to @mir-am)
- Fix corpus pre-processing on Windows

## 1.0.0
- DOI assigned

## 1.0.0-alpha.12
- Bugfixes and minor improvements

## 1.0.0-alpha.11 (NOT backward compatible with 1.0.0-alpha.10)

- Include token types in the metadata
- Expand the token type hierarchy
- Make it possible to return the full token index in the iterator

## 1.0.0-alpha.10 (NOT backward compatible with 1.0.0-alpha.9)

- Add boundaries of comments to the pre-processing metadata
- Add Windows and OSX support
- Switch from unittest to pytest+doctest
- Bugfixes related to the literal representation of tokens on disk
- Bugfixes related to adding `</t>` to mark the end of a full token

## 1.0.0-alpha.9 (NOT backward compatible with 1.0.0-alpha.7)

- Add a `get_corpus_size()` method to the `PreprocessedCorpus` class
- Do BPE splitting without splitting by convention first
- Use `</t>` to mark the last sub-token of a token
- Replace non-ASCII sequences with a special char
- Follow symlinks when reading a dataset
- Make it possible to preserve case when doing stemming
- Bugfixes

## 1.0.0-alpha.7 (NOT backward compatible with 1.0.0-alpha.6)

- Store the version in `codeprep.__version__`
- Implement the `--full-strings` and `--max-str-length` options
- Replace the `ronin` method/command with a `--ronin` option and apply the ronin algorithm at the word level instead of the full-identifier level
- If the `split_numbers` option is set to `True`, split numbers not only in code but also in strings and comments
- Change placeholder values to more human-readable ones
- Improve log output
- Bugfixes

## 1.0.0-alpha.6

Initial PyPI release