{"id":44968713,"url":"https://github.com/cfhamlet/os-urlpattern","last_synced_at":"2026-02-18T15:04:00.514Z","repository":{"id":41055444,"uuid":"96843404","full_name":"cfhamlet/os-urlpattern","owner":"cfhamlet","description":"Unsupervised URLs clustering, generate and match URL pattern.","archived":false,"fork":false,"pushed_at":"2019-01-11T09:51:48.000Z","size":2041,"stargazers_count":50,"open_issues_count":1,"forks_count":8,"subscribers_count":1,"default_branch":"master","last_synced_at":"2025-09-28T23:16:50.121Z","etag":null,"topics":["cluster","pattern","regular-expression","url"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/cfhamlet.png","metadata":{"files":{"readme":"README.rst","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2017-07-11T02:48:35.000Z","updated_at":"2025-08-14T10:47:42.000Z","dependencies_parsed_at":"2022-09-20T22:00:19.636Z","dependency_job_id":null,"html_url":"https://github.com/cfhamlet/os-urlpattern","commit_stats":null,"previous_names":[],"tags_count":32,"template":false,"template_full_name":null,"purl":"pkg:github/cfhamlet/os-urlpattern","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/cfhamlet%2Fos-urlpattern","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/cfhamlet%2Fos-urlpattern/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/cfhamlet%2Fos-urlpattern/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/cfhamlet%2Fos-urlpattern/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/cfhamlet","download_url":"https://codeload.github.com/cfhamlet/os-urlpattern/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/cfhamlet%2Fos-urlpattern/sbom","scorecard":{"id":271923,"data":{"date":"2025-08-11","repo":{"name":"github.com/cfhamlet/os-urlpattern","commit":"9311aff896ad591b2a9123d256f629f5d142dfc6"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3,"checks":[{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Code-Review","score":0,"reason":"Found 0/21 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: MIT License: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 10 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}}]},"last_synced_at":"2025-08-17T13:36:52.146Z","repository_id":41055444,"created_at":"2025-08-17T13:36:52.146Z","updated_at":"2025-08-17T13:36:52.146Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":29582860,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-02-18T13:56:48.962Z","status":"ssl_error","status_checked_at":"2026-02-18T13:54:34.145Z","response_time":162,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["cluster","pattern","regular-expression","url"],"created_at":"2026-02-18T15:03:59.781Z","updated_at":"2026-02-18T15:04:00.508Z","avatar_url":"https://github.com/cfhamlet.png","language":"Python","funding_links":[],"categories":[],"sub_categories":[],"readme":"=============\nos-urlpattern\n=============\n\n.. image:: https://travis-ci.org/cfhamlet/os-urlpattern.svg?branch=master\n   :target: https://travis-ci.org/cfhamlet/os-urlpattern\n\n.. image:: https://codecov.io/gh/cfhamlet/os-urlpattern/branch/master/graph/badge.svg\n   :target: https://codecov.io/gh/cfhamlet/os-urlpattern\n\n.. image:: https://img.shields.io/pypi/pyversions/os-urlpattern.svg\n   :alt: PyPI - Python Version\n   :target: https://pypi.python.org/pypi/os-urlpattern\n  \n.. image:: https://img.shields.io/pypi/v/os-urlpattern.svg\n   :alt: PyPI\n   :target: https://pypi.python.org/pypi/os-urlpattern\n\n\nThis package is used for unsupervised URLs clustering. Furthermore, it generate URL patterns(RegEx) \nfrom clusters for matching purpose. It is a pure python package tested under python2.7 python3.6, \n`pypy \u003chttp://pypy.org/\u003e`_ can also be used for performance(4x-8x). Command line tools are provided \nfor standalone clustering and matching, APIs are also convenient. Several extra packages can be \ninstalled for additional features. Under CPython 1cpu, 100 thousand URLs clustering cost almost 1min \nand 200M memory. Built-in matching strategy is efficient enough in most use cases(4k/s, depend on \npatterns complexity).\n\n.. code:: console\n\n  $ pip install -U os-urlpattern\n  $ wget -qO- 'https://git.io/f4QlP' | pattern-make\n  /[0-9]{2}[\\.]html\n        http://example.com/01.html\n        http://example.com/02.html\n        http://example.com/03.html\n  /[0-9]{3}/test[0-9]{2}[\\.]html\n        http://example.com/123/test01.html\n        http://example.com/456/test02.html\n        http://example.com/789/test03.html\n\n\n==============\nAknowledgement\n==============\n\nSimilar URLs\n=============\n  \n* URLs with the same **URL structure**.\n\n* Components of the parsed URLs at the same position are in the same **character space**.\n\n* Different types of charactors may be in the same order in most cases.\n\n\nURL structure\n==============\n\nTypically, URL can be parsed into 6 components:\n\n``\u003cscheme\u003e://\u003cnetloc\u003e/\u003cpath\u003e;\u003cparams\u003e?\u003cquery\u003e#\u003cfragment\u003e``\n\nBecause different sites may have similar URLs structure and \u003cparams\u003e is rare, so \u003cschema\u003e \n\u003cnetloc\u003e and \u003cparams\u003e are ignored, \u003cpath\u003e \u003cquery\u003e \u003cfragment\u003e are used to define URL structure.\n\nIf the URLs have the same path levels, same query keys(also keys order) and with the same \nfragment existence, their URL structure should be the same. \n\n::\n    \n  http://example.com/p1/p2?k1=v1\u0026k2=v2#pos\n\n  URL structure:\n  path levels: 2\n  query keys: k1, k2\n  have fragment: True\n\nCharacter space\n===============\n\nConsider `RFC 3986 (Section 2: Characters) \u003chttps://tools.ietf.org/html/rfc3986#section-2\u003e`_,\nURL with the following characters would be legal:\n\n``ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$\u0026'()*+,;=%\u003c\u003e\\\"{}^|``\n\nThere are three major character space: lower-case letters(a-z), upper-case letters(A-Z), \nnumber letters(0-9). Other symbols are in their own character space.\n  \n::\n\n  HeLlOwoRd666!\n\n  character space: a-z A-Z 0-9 !\n      \nOrder consideration\n=====================\n\nSplit a string by character, consecutive character space can be joined. In most cases, order is a \ndistinguished feature.\n\n::\n\n  HELLOword666!\n\n  split into: HELLO word 666 !\n\n  character space order: A-Z a-z 0-9 !\n\n\nMix\n=====================\nComplex consecutive major character space can be mixed, order is less important.\n\n::\n\n  HellWorld666!\n\n  split into: H ell W orld 666 !\n\n  major join: HellWorld666 !\n\n  character space order: A-Za-z0-9 !\n\nBecause of URL quote, '%' can be mixed with major character space.\n\n::\n\n  %E4%BD%A0%E5%A5%BD!\n\n  split into: % E 4 % BD % A 0 % E 5 % A 5 % BD !\n\n  major join: %E4%BD%A0%E5%A5%BD !\n\n  character space order: A-Z0-9% !\n\n\nURL pattern\n============\n\nURL pattern is used to express each cluster. It is normal regex string. Each URL in \nthe same cluster can be matched with the pattern.\n\n::\n\n  pattern examples:\n\n  /news/[0-9]{8}/[a-z]+[\\\\.]html\n  /newsShow[\\\\.]asp[\\\\?]dataID=[0-9]+\n  /thread[\\\\-][0-9]+[\\\\-][0-9][\\\\-]1[\\\\.]html\n\nThe built-in matching strategy is strict, it can't tolerate incomplet matching.\n  \n::\n\n  letter: helloword\n\n  pattern01: [a-z0-9]+  # not match, because no number in the letter\n  pattern02: [a-z]+ # match\n\n\n========\nInstall\n========\n\nInstall with pip\n\n``$ pip install os-urlpattern``\n\nInstall extra packages\n\n.. list-table::\n  :header-rows: 1\n    \n  * - subpackage \n    - install command\n    - enables\n  * - memory\n    - ``pip install os-urlpattern[memroy]``\n    - Show memory useage\n  * - ete-tree\n    - ``pip install os-urlpattern[ete-tree]``\n    - Enable `ete \u003chttps://github.com/etetoolkit/ete\u003e`_ pattern tree formatter\n\n========\nUsage\n========\n\nCommand line\n=============\n\n* **pattern-make**\n    \n  Load urls, cluster and dump patterns.\n\n  .. code:: console\n    \n    $ pattern-make -h\n    usage: pattern-make [-h] [-v] [-i INPUTS [INPUTS ...]]\n                        [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] [-c CONFIG]\n                        [-f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}]\n\n    optional arguments:\n      -h, --help            show this help message and exit\n      -v, --version         show program's version number and exit\n      -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]\n                            input files to be processed (default: stdin)\n      -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}\n                            log level (default: NOTSET)\n      -c CONFIG, --config CONFIG\n                            config file\n      -f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}, --formatter {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}\n                            output formatter (default: CLUSTER)\n  \n  Dump clustered URLs with patterns:\n\n  .. code:: console\n  \n    $ cat urls.txt | pattern-make -L debug \u003e clustered.txt\n\n  Only generate URL patterns:\n\n  .. code:: console\n  \n    $ cat urls.txt | pattern-make -L debug -F pattern \u003e patterns.txt\n  \n  Generate pattern tree from URLs(`ete \u003chttps://github.com/etetoolkit/ete\u003e`_ installed):\n\n  .. code:: console\n    \n    $ cat urls.txt | pattern-make -L debug -F ete\n\n* **pattern-match**\n\n  Load patterns, dump URLs matched results.\n\n  .. code:: console\n    \n    $ pattern-match -h\n    usage: pattern-match [-h] [-v] [-i INPUTS [INPUTS ...]]\n                         [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] -p\n                         PATTERN_FILES [PATTERN_FILES ...] [-a]\n\n    optional arguments:\n      -h, --help            show this help message and exit\n      -v, --version         show program's version number and exit\n      -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]\n                            input files to be processed (default: stdin)\n      -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}\n                            log level (default: NOTSET)\n      -p PATTERN_FILES [PATTERN_FILES ...], --pattern-files PATTERN_FILES [PATTERN_FILES ...]\n                            pattern files to be loaded\n      -a, --all-matched     all matched patterns\n\n\n  Match URLs:\n\n  .. code:: console\n  \n    $ cat urls.txt | pattern-match -L debug -p patterns.txt\n\nAPIs\n=====\n\n* Cluster and generate URL patterns:\n\n  .. code:: python \n\n    from os_urlpattern.formatter import pformat\n    from os_urlpattern.pattern_maker import PatternMaker\n\n    pattern_maker = PatternMaker()\n\n    # load URLs(unicode)\n    for url in urls:\n        pattern_maker.load(url)\n\n    # cluster and print pattern\n    for url_meta, clustered in pattern_maker.make():\n        for pattern in pformat('pattern', url_meta, clustered):\n            # do whatever you want\n            pass\n\n\n* Match URLs:\n\n  .. code:: python \n  \n    from os_urlpattern.pattern_matcher import PatternMatcher\n\n    pattern_matcher = PatternMatcher()\n\n    # load url_pattern(unicode)\n    for url_pattern in url_patterns:\n        # meta will bind to matched result\n        pattern_matcher.load(url_pattern, meta=url_pattern)\n\n    # match URL(unicode)\n    for url in urls:\n        matched_results = patterm_matcher.match(url)\n        # the best matched result:\n        # sorted(matched_results, reverse=True)[0]\n        patterns = [n.meta for n in matched_results]\n\n\n* Low-level APIs:\n\n  It is necessary to use low-level APIs for customizing processing procdure,\n  especially for parallel computing or working on an distributed cluster(hadoop).\n\n  **Key points: same fuzzy-digest same maker and same matcher.**\n\n  Use ``os_urlpattern.parser.fuzzy_digest`` to get fuzzy digest from URL,\n  URL pattern or URLMeta and parsed pieces/patterns.\n\n  A brief All-In-One example:\n\n  .. code:: python \n  \n    from __future__ import print_function, unicode_literals\n    from os_urlpattern.formatter import pformat\n    from os_urlpattern.parser import fuzzy_digest, parse\n    from os_urlpattern.pattern_maker import Maker\n    from os_urlpattern.pattern_matcher import Matcher\n\n    urls = ['http://t.com/%02d.html' % i for i in xrange(0,10)]\n    makers = {}\n    matchers = {}\n\n    # Init makers from URLs(unicode).\n    for url in urls:\n        url_meta, parsed_pieces = parse(url)\n        \n        # same digest same maker\n        digest = fuzzy_digest(url_meta, parsed_pieces)\n        if digest not in makers:\n            makers[digest] = Maker(url_meta)\n        makers[digest].load(parsed_pieces)\n\n    # Iterate makers, do clustering, generate URL pattern and init matchers.\n    for maker in makers.values():\n        for clustered in maker.make():\n            for pattern in pformat('pattern', maker.url_meta, clustered):\n                # init matchers\n                url_meta, parsed_patterns = parse(pattern)\n                digest = fuzzy_digest(url_meta, parsed_patterns)\n                if digest not in matchers:\n                    matchers[digest] = Matcher(url_meta)\n                matchers[digest].load(parsed_patterns, pattern)\n    \n    # Match URLs(unicode).\n    for url in urls:\n        url_meta, parsed_pieces = parse(url)\n\n        # same digest same matcher\n        digest = fuzzy_digest(url_meta, parsed_pieces)\n        if digest in matchers:\n            matched = [n.meta for n in matchers[digest].match(parsed_pieces)]\n            print(url, *matched, sep=\"\\t\")        \n        else: # no matched at all\n            pass\n\n\n\n============\nUnit Tests\n============\n\n``$ tox``\n\n============\nLicense\n============\n\nMIT licensed.\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcfhamlet%2Fos-urlpattern","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fcfhamlet%2Fos-urlpattern","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcfhamlet%2Fos-urlpattern/lists"}