{"id":44787211,"url":"https://github.com/OlivierBinette/StringCompare","last_synced_at":"2026-03-01T00:00:45.293Z","repository":{"id":42024130,"uuid":"448306459","full_name":"OlivierBinette/StringCompare","owner":"OlivierBinette","description":"Efficient String Comparison Functions and Fuzzy String Matching","archived":false,"fork":false,"pushed_at":"2025-09-21T21:04:12.000Z","size":5897,"stargazers_count":20,"open_issues_count":13,"forks_count":2,"subscribers_count":1,"default_branch":"dev","last_synced_at":"2026-01-18T08:03:12.139Z","etag":null,"topics":["damerau-levenshtein","edit-distance","fuzzy-matching","jaro-winkler","levenshtein-distance","pybind11","python","string-matching"],"latest_commit_sha":null,"homepage":"https://olivierbinette.github.io/StringCompare/","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/OlivierBinette.png","metadata":{"files":{"readme":"README.ipynb","changelog":"CHANGELOG.rst","contributing":"CONTRIBUTING.rst","funding":".github/FUNDING.yml","license":null,"code_of_conduct":".github/CODE_OF_CONDUCT.md","threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null},"funding":{"github":"OlivierBinette"}},"created_at":"2022-01-15T14:53:56.000Z","updated_at":"2025-10-27T14:28:23.000Z","dependencies_parsed_at":"2025-09-21T22:11:45.478Z","dependency_job_id":"81f69577-7aa7-4240-94e7-88392eca8d22","html_url":"https://github.com/OlivierBinette/StringCompare","commit_stats":null,"previous_names":[],"tags_count":2,"template":false,"template_full_name":null,"purl":"pkg:github/OlivierBinette/StringCompare","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/Gi
tHub/repositories/OlivierBinette%2FStringCompare","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/OlivierBinette%2FStringCompare/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/OlivierBinette%2FStringCompare/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/OlivierBinette%2FStringCompare/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/OlivierBinette","download_url":"https://codeload.github.com/OlivierBinette/StringCompare/tar.gz/refs/heads/dev","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/OlivierBinette%2FStringCompare/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":29955885,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-02-28T22:53:01.873Z","status":"ssl_error","status_checked_at":"2026-02-28T22:52:50.699Z","response_time":90,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.6:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["damerau-levenshtein","edit-distance","fuzzy-matching","jaro-winkler","levenshtein-distance","pybind11","python","string-matching"],"created_at":"2026-02-16T10:00:26.931Z","updated_at":"2026-03-01T00:00:45.283Z","avatar_url":"https://github.com/OlivierBinette.png","language":"Python","readme":"{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": 
{},\n   \"source\": [\n    \" \\n\",\n    \"[![Python package](https://github.com/OlivierBinette/StringCompare/actions/workflows/python-package-conda.yml/badge.svg)](https://github.com/OlivierBinette/StringCompare/actions/workflows/python-package-conda.yml) \\n\",\n    \"[![codecov](https://codecov.io/gh/OlivierBinette/StringCompare/branch/main/graph/badge.svg?token=F8ASD5R051)](https://codecov.io/gh/OlivierBinette/StringCompare)\\n\",\n    \"[![CodeFactor](https://www.codefactor.io/repository/github/olivierbinette/stringcompare/badge)](https://www.codefactor.io/repository/github/olivierbinette/stringcompare)\\n\",\n    \"[![Lifecycle Maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html)\\n\",\n    \"[![Release version](https://img.shields.io/github/v/release/olivierbinette/stringcompare)](https://github.com/OlivierBinette/StringCompare/releases) \\n\",\n    \"[![Sponsors](https://img.shields.io/github/sponsors/OlivierBinette)](https://github.com/sponsors/OlivierBinette) \\n\",\n    \"\\n\",\n    \" \\n\",\n    \"# ⚡ **StringCompare**: Efficient String Comparison Functions\\n\",\n    \"\\n\",\n    \"**StringCompare** is a Python package for efficient string similarity computation and approximate string matching. It is inspired by the excellent [*comparator*](https://github.com/ngmarchant/comparator) and [*stringdist*](https://github.com/markvanderloo/stringdist) R packages, and from the equally excellent [*py_stringmatching*](https://github.com/anhaidgroup/py_stringmatching), [*jellyfish*](https://github.com/jamesturk/jellyfish), and [*textdistance*](https://github.com/life4/textdistance) Python packages.\\n\",\n    \"\\n\",\n    \"The key feature of **StringCompare** is a focus on speed, extensibility and maintainability through its [*pybind11* ](https://github.com/pybind/pybind11) C++ implementation. 
**StringCompare** is faster than most other Python libraries (see benchmark below) and much more memory efficient when dealing with long strings.\\n\",\n    \"\\n\",\n    \"The [complete API documentation](https://olivierbinette.github.io/StringCompare/source/stringcompare.html) is available on the project website [olivierbinette.github.io/StringCompare](https://olivierbinette.github.io/StringCompare).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Installation\\n\",\n    \"\\n\",\n    \"Install the released version from github using the following commands:\\n\",\n    \"\\n\",\n    \"```bash\\n\",\n    \"    pip install git+https://github.com/OlivierBinette/StringCompare.git@release\\n\",\n    \"```\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Project Roadmap\\n\",\n    \"\\n\",\n    \"**StringCompare** currently implements [edit distances](https://en.wikipedia.org/wiki/Edit_distance) and similarity functions, such as the Levenshtein, Damerau-Levenshtein, Jaro, and Jaro-Winkler distances. This is *stage 1* of the following development roadmap: \\n\",\n    \"\\n\",\n    \"| Stage  | Goals | Status|\\n\",\n    \"| :-------------: | ------------- | :-------------: |\\n\",\n    \"| 1  | pybind11 framework and edit-based distances (Levenshtein, Damerau-Levenshtein, Jaro, and Jaro-Winkler) | ✔️ |\\n\",\n    \"| 2  | Token-based and hybrid distances (tf-idf similarity, LSH, Monge-Elkan, ...)  | ... |\\n\",\n    \"| 3  | Vocabulary optimizations and metric trees | ...  |\\n\",\n    \"| 4  | Embeddings and string distance learning | ...  
|\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Examples\\n\",\n    \"\\n\",\n    \"Comparison algorithms are instantiated as `Comparator` objects, which provide the `compare()` method (equivalent to `__call__()`) for string comparison.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"0.14285714285714285\"\n      ]\n     },\n     \"execution_count\": 1,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"from stringcompare import Levenshtein, Jaro, JaroWinkler, DamerauLevenshtein, LCSDistance\\n\",\n    \"\\n\",\n    \"lev = Levenshtein(normalize=True, similarity=False)\\n\",\n    \"\\n\",\n    \"lev(\\\"Olivier\\\", \\\"Oliver\\\") # same as lev.compare(\\\"Olivier\\\", \\\"Oliver\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Comparator objects also provide the `elementwise()` function for elementwise comparison between lists\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"array([0.14285714, 0.26666667])\"\n      ]\n     },\n     \"execution_count\": 2,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"lev.elementwise([\\\"Olivier\\\", \\\"Olivier\\\"], [\\\"Oliver\\\", \\\"Olivia\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"and the `pairwise()` function for pairwise comparisons.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"array([[0.        
, 0.26666667],\\n\",\n       \"       [0.14285714, 0.28571429]])\"\n      ]\n     },\n     \"execution_count\": 3,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"lev.pairwise([\\\"Olivier\\\", \\\"Oliver\\\"], [\\\"Olivier\\\", \\\"Olivia\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Benchmark\\n\",\n    \"\\n\",\n    \"Comparison of the Damerau-Levenshtein implementation speed for different Python packages, when comparing the strings \\\"Olivier Binette\\\" and \\\"Oilvier Benet\\\":\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Package          avg runtime (ns)\\n\",\n      \"-------------  ------------------\\n\",\n      \"StringCompare             746.446\\n\",\n      \"jellyfish                 997.866\\n\",\n      \"textdistance             4205.98\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from timeit import timeit\\n\",\n    \"from tabulate import tabulate\\n\",\n    \"\\n\",\n    \"# Comparison functions\\n\",\n    \"from stringcompare import DamerauLevenshtein\\n\",\n    \"cmp = DamerauLevenshtein()\\n\",\n    \"from jellyfish import damerau_levenshtein_distance\\n\",\n    \"from textdistance import damerau_levenshtein\\n\",\n    \"\\n\",\n    \"functions = {\\n\",\n    \"    \\\"StringCompare\\\": cmp.compare,\\n\",\n    \"    \\\"jellyfish\\\": damerau_levenshtein_distance,\\n\",\n    \"    \\\"textdistance\\\": damerau_levenshtein,\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"table = [\\n\",\n    \"    [name, timeit(lambda: fun(\\\"Olivier Binette\\\", \\\"Oilvier Benet\\\"), number=1000000) * 1000]\\n\",\n    \"    for name, fun in functions.items()\\n\",\n    \"]\\n\",\n    \"print(tabulate(table, headers=[\\\"Package\\\", \\\"avg runtime (ns)\\\"]))\"\n   ]\n  
},\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Performance notes\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The use of pybind11 comes with a small performance overhead. We could be faster if we directly interfaced with CPython.\\n\",\n    \"\\n\",\n    \"However, the use of pybind11 allows the library to be easily extensible and maintainable. The C++ implementation has little to worry about regarding Python, except for the use of a pybind11 numpy wrapper in some places. Pybind11 takes care of the details of exposing the C++ API to Python.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Known Bugs\\n\",\n    \"\\n\",\n    \"*pybind11* has compatibility issues with gcc 11 (e.g. on Ubuntu 21.10). If running Linux and `gcc --version` is 11, then use the following commands to configure your environment before (re)installing:\\n\",\n    \"```bash\\n\",\n    \"        sudo apt install g++-9 gcc-9\\n\",\n    \"        export CC=gcc-9 CXX=g++-9\\n\",\n    \"```\\n\",\n    \"If this is unsuccessful, you might want to use **StringCompare** within a [Docker](https://www.docker.com/) container. I recommend using the python:3.7.9 base image. 
For example, after installing Docker, you can launch an interactive bash session and install **StringCompare** as follows:\\n\",\n    \"```bash\\n\",\n    \"        sudo docker run -it python:3.7.9 bash\\n\",\n    \"        pip install git+https://github.com/OlivierBinette/StringCompare.git\\n\",\n    \"        python\\n\",\n    \"        \u003e\u003e\u003e import stringcompare\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Please report installation issues [here](https://github.com/OlivierBinette/StringCompare/issues).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Contribute\\n\",\n    \"\\n\",\n    \"**StringCompare** is currently in an early development stage and contributions are welcome! See the [contributing](https://olivierbinette.github.io/StringCompare/contributing.html) page for more information. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Acknowledgements\\n\",\n    \"\\n\",\n    \"This project is made possible by the support of the [Natural Sciences and Engineering Research Council of Canada (NSERC)](https://www.nserc-crsng.gc.ca) and by the support of a [G-Research](https://www.gresearch.co.uk/) grant.\\n\",\n    \"\\n\",\n    \"\u003ca href=\\\"https://www.gresearch.co.uk/\\\"\u003e\u003cimg src=\\\"https://res-1.cloudinary.com/crunchbase-production/image/upload/c_lpad,h_256,w_256,f_auto,q_auto:eco/gtqacyz2dx8jqicpnmqr\\\" height=100 style=\\\"margin:20px\\\"\u003e\u003c/a\u003e\u003ca href=\\\"https://www.nserc-crsng.gc.ca\\\"\u003e\u003cimg src=\\\"https://umanitoba.ca/faculties/engineering/media/NSERC_Logo.png\\\" height=100 style=\\\"margin:20px\\\"\u003e\u003c/a\u003e\\n\",\n    \"\\n\",\n    \"I would also like to thank my individual [GitHub sponsors](https://github.com/sponsors/olivierbinette) for their support.\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"interpreter\": {\n   \"hash\": 
\"b582ae4d77d18d658cc55812e32328158e2f45884933450b1021a6ea5c0413ef\"\n  },\n  \"kernelspec\": {\n   \"display_name\": \"Python 3.9.7 64-bit ('groupbyrule': conda)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.7.9\"\n  },\n  \"orig_nbformat\": 4\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n","funding_links":["https://github.com/sponsors/OlivierBinette","https://github.com/sponsors/olivierbinette"],"categories":["Open-Source Software"],"sub_categories":["String Comparison"],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2FOlivierBinette%2FStringCompare","html_url":"https://awesome.ecosyste.ms/projects/github.com%2FOlivierBinette%2FStringCompare","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2FOlivierBinette%2FStringCompare/lists"}