{"id":45828381,"url":"https://github.com/tsproisl/SoMaJo","last_synced_at":"2026-03-12T07:00:54.244Z","repository":{"id":57469259,"uuid":"96202399","full_name":"tsproisl/SoMaJo","owner":"tsproisl","description":"A tokenizer and sentence splitter for German and English web and social media texts.","archived":false,"fork":false,"pushed_at":"2024-12-09T12:52:34.000Z","size":1411,"stargazers_count":150,"open_issues_count":7,"forks_count":22,"subscribers_count":7,"default_branch":"master","last_synced_at":"2026-01-26T11:54:34.480Z","etag":null,"topics":["english","german","sentence-splitter","social-media","tokenizer"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"gpl-3.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/tsproisl.png","metadata":{"files":{"readme":"README.md","changelog":"CHANGES.txt","contributing":null,"funding":null,"license":"LICENSE.txt","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2017-07-04T09:45:44.000Z","updated_at":"2025-12-03T08:51:55.000Z","dependencies_parsed_at":"2023-09-21T21:51:34.851Z","dependency_job_id":"60c9851c-6e11-499d-9e92-7fc82590155a","html_url":"https://github.com/tsproisl/SoMaJo","commit_stats":{"total_commits":635,"total_committers":9,"mean_commits":70.55555555555556,"dds":"0.017322834645669305","last_synced_commit":"c2f9c24ded800f9d332439f6b9b3416eb5b59427"},"previous_names":[],"tags_count":48,"template":false,"template_full_name":null,"purl":"pkg:github/tsproisl/SoMaJo","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/tsproisl%2FSoMaJo","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/tsproisl%2FSoMaJo/tags","releases_url":"https://repos.ecosyste.
ms/api/v1/hosts/GitHub/repositories/tsproisl%2FSoMaJo/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/tsproisl%2FSoMaJo/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/tsproisl","download_url":"https://codeload.github.com/tsproisl/SoMaJo/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/tsproisl%2FSoMaJo/sbom","scorecard":{"id":901000,"data":{"date":"2025-08-11","repo":{"name":"github.com/tsproisl/SoMaJo","commit":"40ecf399dfed951de0436b2289a6b5174548982f"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3.4,"checks":[{"name":"Code-Review","score":0,"reason":"Found 0/30 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Token-Permissions","score":0,"reason":"detected GitHub workflow tokens with excessive permissions","details":["Warn: no topLevel permission defined: .github/workflows/test.yml:1","Info: no jobLevel write permissions found"],"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Dangerous-Workflow","score":10,"reason":"no dangerous workflow patterns detected","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"SAST","score":0,"reason":"no SAST tool detected","details":["Warn: no pull requests merged into dev 
branch"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Pinned-Dependencies","score":0,"reason":"dependency not pinned by hash detected -- score normalized to 0","details":["Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/test.yml:14: update your workflow using https://app.stepsecurity.io/secureworkflow/tsproisl/SoMaJo/test.yml/master?enable=pin","Warn: GitHub-owned GitHubAction not pinned by hash: .github/workflows/test.yml:17: update your workflow using https://app.stepsecurity.io/secureworkflow/tsproisl/SoMaJo/test.yml/master?enable=pin","Warn: pipCommand not pinned by hash: .github/workflows/test.yml:23","Warn: pipCommand not pinned by hash: .github/workflows/test.yml:24","Info:   0 out of   2 GitHub-owned GitHubAction 
dependencies pinned","Info:   0 out of   2 pipCommand dependencies pinned"],"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE.txt:0","Info: FSF or OSI recognized license: GNU General Public License v3.0: LICENSE.txt:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed 
vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'master'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}}]},"last_synced_at":"2025-08-24T15:33:38.068Z","repository_id":57469259,"created_at":"2025-08-24T15:33:38.069Z","updated_at":"2025-08-24T15:33:38.069Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":30417685,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-03-12T06:40:58.731Z","status":"ssl_error","status_checked_at":"2026-03-12T06:40:40.296Z","response_time":114,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.6:443 state=error: unexpected eof while 
reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["english","german","sentence-splitter","social-media","tokenizer"],"created_at":"2026-02-26T21:56:04.254Z","updated_at":"2026-03-12T07:00:54.238Z","avatar_url":"https://github.com/tsproisl.png","language":"Python","funding_links":[],"categories":["Vorverarbeitungstools","Werkzeuge"],"sub_categories":["Tokenisierung","Textverarbeitung"],"readme":"# SoMaJo\n\n[![PyPI](https://img.shields.io/pypi/v/SoMaJo)](https://pypi.org/project/SoMaJo/)\n[![Build](https://github.com/tsproisl/SoMaJo/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/tsproisl/SoMaJo/actions/workflows/test.yml?query=branch%3Amaster)\n\n  - [Introduction](#introduction)\n  - [Features](#features)\n  - [Installation](#installation)\n  - [Usage](#usage)\n      - [Using the somajo-tokenizer executable](#using-the-somajo-tokenizer-executable)\n      - [Using the module](#using-the-module)\n  - [Evaluation](#evaluation)\n  - [Tokenizing English text](#tokenizing-english-text)\n  - [Development](#development)\n  - [References](#references)\n\n\n## Introduction\n\n```\necho 'Wow, superTool!;)' | somajo-tokenizer -c -\nWow\n,\nsuper\nTool\n!\n;)\n```\n\nSoMaJo is a rule-based tokenizer and sentence splitter that implements\ntokenization guidelines for German and English. 
It has a strong focus\non web and social media texts (it was originally created as the\nwinning submission to the [EmpiriST 2015 shared\ntask](https://sites.google.com/site/empirist2015/) on automatic\nlinguistic annotation of computer-mediated communication / social\nmedia) and is particularly well-suited to perform tokenization on all\nkinds of written discourse, for example chats, forums, wiki talk\npages, tweets, blog comments, social networks, SMS and WhatsApp\ndialogues. Of course it also works on more formal texts.\n\nVersion 1 of the tokenizer is described in greater detail in [Proisl\nand Uhrig (2016)](https://aclanthology.org/W16-2607).\n\nFor part-of-speech tagging (in particular of German web and social\nmedia texts), we recommend\n[SoMeWeTa](https://github.com/tsproisl/SoMeWeTa):\n\n```\nsomajo-tokenizer --split_sentences \u003cfile\u003e | somewe-tagger --tag \u003cmodel\u003e -\n```\n\n\n## Features\n\n  - Rule-based tokenization and sentence-splitting:\n    - [EmpiriST 2015 tokenization\n      guidelines](https://github.com/fau-klue/empirist-corpus/blob/9f00233951f7d1503ba4c3dd4af975d3c73cba80/doc/EmpiriST_Guideline-Tokenisierung.pdf)\n      for German\n    - “New” Penn Treebank conventions for English (described, for\n      example, in the guidelines for ETTB 2.0 [(Mott et al.,\n      2009)](https://web.archive.org/web/20110727133755/http://projects.ldc.upenn.edu/gale/task_specifications/ettb_guidelines.pdf)\n      and CLEAR [(Warner et al.,\n      2012)](https://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf))\n    - Optionally split camel-cased tokens\n    - Optionally output token class information for each token, i.e.\n      if it is a number, an emoticon, an abbreviation, etc.\n    - Optionally output additional information for each token, e.g. 
if\n      it was followed by whitespace or if it contained internal\n      whitespace\n    - Optionally split the tokenized text into sentences\n    - Optionally determine the character offsets of the tokens in the\n      input, allowing for stand-off tokenization\n  - Text preprocessing/cleaning:\n    - Normalize text to [Unicode Normalization Form C (NFC)](https://unicode.org/reports/tr15/)\n    - Remove control characters and other usually unwanted characters,\n      such as soft hyphens and zero-width spaces\n  - XML support:\n    - Transparent processing of XML: Tokenize the textual content of\n      an XML file while preserving the XML structure\n    - Optionally delimit sentence boundaries by XML tags\n    - Optionally prune tags, i.e. subtrees, from the XML before\n      tokenization (for example to remove `\u003cscript\u003e` and `\u003cstyle\u003e`\n      tags from HTML input)\n    - Optionally strip all tags from the output, effectively turning\n      the XML into plain text\n  - Parallelization: Optionally run multiple worker processes to speed\n    up tokenization\n\n\n## Installation\n\nSoMaJo can be easily installed using pip (pip3 in some distributions):\n\n```sh\npip install -U SoMaJo\n```\n\nAlternatively, you can download and decompress the [latest\nrelease](https://github.com/tsproisl/SoMaJo/releases/latest) or clone\nthe git repository:\n\n```sh\ngit clone https://github.com/tsproisl/SoMaJo.git\n```\n\nIn the new directory, run the following command:\n\n```sh\npip install -U .\n```\n\n\n## Usage\n\n### Using the somajo-tokenizer executable\n\nYou can use the tokenizer as a standalone program from the command\nline. 
General usage information is available via the `-h` option:\n\n```\nsomajo-tokenizer -h\nusage: somajo-tokenizer [-h] [-l {en_PTB,de_CMC}]\n                        [-s {single_newlines,empty_lines}] [-x] [--tag TAG]\n                        [--prune PRUNE] [--strip-tags] [-c]\n                        [--split_sentences] [--sentence_tag SENTENCE_TAG] [-t]\n                        [-e] [--parallel N] [-v]\n                        FILE\n\nA tokenizer and sentence splitter for German and English texts. Currently, two\ntokenization guidelines are implemented: The EmpiriST guidelines for German\nweb and social media texts (de_CMC) and the \"new\" Penn Treebank conventions\nfor English texts (en_PTB).\n\npositional arguments:\n  FILE                  The input file (UTF-8-encoded) or \"-\" to read from\n                        STDIN.\n\noptions:\n  -h, --help            show this help message and exit\n  -l {en_PTB,de_CMC}, --language {en_PTB,de_CMC}\n                        Choose a language. Currently supported are German\n                        EmpiriST-style tokenization (de_CMC) and English Penn-\n                        Treebank-style tokenization(en_PTB). (Default: de_CMC)\n  -s {single_newlines,empty_lines}, --paragraph_separator {single_newlines,empty_lines}\n                        How are paragraphs separated in the input text? Will\n                        be ignored if option -x/--xml is used. (Default:\n                        empty_lines)\n  -x, --xml             The input is an XML file. You can specify tags that\n                        always constitute a sentence break (e.g. HTML p tags)\n                        via the --tag option.\n  --tag TAG             Start and end tags of this type constitute sentence\n                        breaks, i.e. they do not occur in the middle of a\n                        sentence. Can be used multiple times to specify\n                        multiple tags, e.g. --tag p --tag br. 
Implies option\n                        -x/--xml. (Default: --tag title --tag h1 --tag h2\n                        --tag h3 --tag h4 --tag h5 --tag h6 --tag p --tag br\n                        --tag hr --tag div --tag ol --tag ul --tag dl --tag\n                        table)\n  --prune PRUNE         Tags of this type will be removed from the input\n                        before tokenization. Can be used multiple times to\n                        specify multiple tags, e.g. --tag script --tag style.\n                        Implies option -x/--xml. By default, no tags are\n                        pruned.\n  --strip-tags          Suppresses output of XML tags. Implies option\n                        -x/--xml.\n  -c, --split_camel_case\n                        Split items in written in camelCase (excluding\n                        established names and terms).\n  --split_sentences, --split-sentences\n                        Also split the input into sentences.\n  --sentence_tag SENTENCE_TAG, --sentence-tag SENTENCE_TAG\n                        Tag name for sentence boundaries (e.g. --sentence_tag\n                        s). If this option is specified, sentences will be\n                        delimited by XML tags (e.g. \u003cs\u003e…\u003c/s\u003e) instead of empty\n                        lines. This option implies --split_sentences\n  -t, --token_classes   Output the token classes (number, XML tag,\n                        abbreviation, etc.) 
in addition to the tokens.\n  -e, --extra_info      Output additional information for each token:\n                        SpaceAfter=No if the token was not followed by a space\n                        and OriginalSpelling=\"…\" if the token contained\n                        whitespace.\n  --character-offsets   Output character offsets in the input for each token.\n  --parallel N          Run N worker processes (up to the number of CPUs) to\n                        speed up tokenization.\n  -v, --version         Output version information and exit.\n```\n\nHere are some common use cases:\n\n  - To tokenize a text file according to the guidelines of the\n    EmpiriST 2015 shared task:\n    \n    ```\n    somajo-tokenizer -c \u003cfile\u003e\n    ```\n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo 'der beste Betreuer? - \u003eProfSmith! : )' | somajo-tokenizer -c -\n    der\n    beste\n    Betreuer\n    ?\n    -\u003e\n    Prof\n    Smith\n    !\n    :)\n    ```\n    \u003c/details\u003e\n  - If you do not want to split camel-cased tokens, simply drop the\n    `-c` option:\n    \n    ```\n    somajo-tokenizer \u003cfile\u003e\n    ```\n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo 'der beste Betreuer? - \u003eProfSmith! : )' | somajo-tokenizer -\n    der\n    beste\n    Betreuer\n    ?\n    -\u003e\n    ProfSmith\n    !\n    :)\n    ```\n    \u003c/details\u003e\n  - Your input delimits paragraphs by single newlines instead of empty\n    lines? 
Tell the tokenizer via the `-s`/`--paragraph_separator`\n    option:\n    \n    ```\n    somajo-tokenizer --paragraph_separator single_newlines \u003cfile\u003e\n    ```\n  - In addition to tokenizing the input, SoMaJo can also split it into\n    sentences:\n    \n    ```\n    somajo-tokenizer --split-sentences \u003cfile\u003e\n    ``` \n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo 'Palim, Palim! Ich hätte gerne eine Flasche Pommes Frites.' | somajo-tokenizer --split-sentences -\n    Palim\n    ,\n    Palim\n    !\n    \n    Ich\n    hätte\n    gerne\n    eine\n    Flasche\n    Pommes\n    Frites\n    .\n    \n    ``` \n    \u003c/details\u003e\n  - To tokenize English text according to the “new” Penn Treebank\n    conventions, explicitly specify the tokenization guideline using\n    the `-l`/`--language` option:\n    \n    ```\n    somajo-tokenizer -l en_PTB \u003cfile\u003e\n    ```\n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo 'Dont you wanna come?' | somajo-tokenizer -l en_PTB -\n    Do\n    nt\n    you\n    wan\n    na\n    come\n    ?\n    ```\n    \u003c/details\u003e\n  - SoMaJo can also process XML files. 
Use the `-x`/`--xml` option to\n    tell the tokenizer that your input is an XML file:\n    \n    ```\n    somajo-tokenizer --xml \u003cxml-file\u003e\n    ```\n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo '\u003chtml\u003e\u003chead\u003e\u003ctitle\u003eWeihnachten\u003c/title\u003e\u003c/head\u003e\u003cbody\u003e\u003cp\u003eFr\u0026#x00fc;her war mehr Lametta!\u003c/p\u003e\u003c/body\u003e\u003c/html\u003e' | somajo-tokenizer --xml -\n    \u003chtml\u003e\n    \u003chead\u003e\n    \u003ctitle\u003e\n    Weihnachten\n    \u003c/title\u003e\n    \u003c/head\u003e\n    \u003cbody\u003e\n    \u003cp\u003e\n    Früher\n    war\n    mehr\n    Lametta\n    !\n    \u003c/p\u003e\n    \u003c/body\u003e\n    \u003c/html\u003e\n    ```\n    \u003c/details\u003e\n  - For XML input, you can use (multiple instances of) the `--tag`\n    option to specify XML tags that are always sentence breaks, i.e.\n    that can never occur in the middle of a sentence. See the help\n    message for the default list of tags.\n    \n    ```\n    somajo-tokenizer --xml --split_sentences --tag h1 --tag p --tag div \u003cxml-file\u003e\n    ```\n  - Via option `-t`/`--token_classes`, SoMaJo can output token class\n    information for each token, i.e. if it is a number, an emoticon,\n    an abbreviation, etc. Via option `-e`/`--extra_info`, additional\n    information is available, e.g. if a token was followed by\n    whitespace or if it contained internal whitespace.\n    \n    \u003cdetails\u003e\u003csummary\u003eShow example\u003c/summary\u003e\n    \n    ```\n    echo 'der beste Betreuer? - \u003eProfSmith! : )' | somajo-tokenizer -c -e -t -\n    der      regular\n    beste    regular\n    Betreuer regular    SpaceAfter=No\n    ?        symbol\n    -\u003e       symbol     SpaceAfter=No, OriginalSpelling=\"- \u003e\"\n    Prof     regular    SpaceAfter=No\n    Smith    regular    SpaceAfter=No\n    !        
symbol\n    :)       emoticon   OriginalSpelling=\": )\"\n    ```\n    \u003c/details\u003e\n  - To speed up tokenization, you can specify the number of worker\n    processes used via the `--parallel` option:\n    \n    ```\n    somajo-tokenizer --parallel \u003cnumber\u003e \u003cfile\u003e\n    ```\n\n\n### Using the module\n\nTake a look at the [API documentation](doc/build/markdown/somajo.md).\n\nYou can incorporate SoMaJo into your own Python projects. All you need\nto do is import `somajo`, create a `SoMaJo` object and call\none of its tokenizer functions: `tokenize_text`, `tokenize_text_file`,\n`tokenize_xml` or `tokenize_xml_file`. These functions return a\ngenerator that yields tokenized chunks of text. By default, these\nchunks of text are sentences. If you set `split_sentences=False`, then\nthe chunks of text are either paragraphs or chunks of XML. Every\ntokenized chunk of text is a list of `Token` objects.\n\nHere is an example for tokenizing and sentence splitting two\nparagraphs:\n\n```python\nfrom somajo import SoMaJo\n\ntokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)\n\n# note that paragraphs are allowed to contain newlines\nparagraphs = [\"der beste Betreuer?\\n-- ProfSmith! : )\",\n              \"Was machst du morgen Abend?! Lust auf Film?;-)\"]\n\nsentences = tokenizer.tokenize_text(paragraphs)\nfor sentence in sentences:\n    for token in sentence:\n        print(f\"{token.text}\\t{token.token_class}\\t{token.extra_info}\")\n    print()\n```\n\nAnd here is an example for tokenizing and sentence splitting a whole\nfile. 
The option `paragraph_separator=\"single_newlines\"` states that\nparagraphs are delimited by newlines instead of empty lines:\n\n```python\nsentences = tokenizer.tokenize_text_file(\"Beispieldatei.txt\", paragraph_separator=\"single_newlines\")\nfor sentence in sentences:\n    for token in sentence:\n        print(token.text)\n    print()\n```\n\nFor processing XML data, use the `tokenize_xml` or `tokenize_xml_file`\nmethods:\n\n```python\neos_tags = [\"title\", \"h1\", \"p\"]\n\n# you can read from an open file object\nsentences = tokenizer.tokenize_xml_file(file_object, eos_tags)\n# or you can specify a file name\nsentences = tokenizer.tokenize_xml_file(\"Beispieldatei.xml\", eos_tags)\n# or you can pass a string with XML data\nsentences = tokenizer.tokenize_xml(xml_string, eos_tags)\n\nfor sentence in sentences:\n    for token in sentence:\n        print(token.text)\n    print()\n```\n\n\n## Evaluation\n\nSoMaJo was the system with the highest average F₁ score in the\nEmpiriST 2015 shared task. The performance of the current version on\nthe two test sets is summarized in the following table (Training and\ntest sets are available from the [official\nwebsite](https://sites.google.com/site/empirist2015/gscl-shared-task-automatic-linguistic-annotation-of-computer-mediated-communication-social-media/gold-standard)):\n\n| Corpus | Precision | Recall | F₁    |\n|--------|-----------|--------|-------|\n| CMC    | 99.71     | 99.56  | 99.64 |\n| Web    | 99.91     | 99.92  | 99.91 |\n\n\n## Tokenizing English text\n\nSoMaJo can also tokenize English text. 
In general, we follow the “new”\nPenn Treebank conventions described, for example, in the guidelines\nfor ETTB 2.0 [(Mott et al.,\n2009)](https://web.archive.org/web/20110727133755/http://projects.ldc.upenn.edu/gale/task_specifications/ettb_guidelines.pdf)\nand CLEAR [(Warner et al.,\n2012)](https://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf).\n\nFor tokenizing English text on the command line, specify the language\nvia the `-l` or `--language` option:\n\n    somajo-tokenizer -l en_PTB \u003cfile\u003e\n\nFrom Python, you can pass `language=\"en_PTB\"` to the `SoMaJo`\nconstructor, e.g.:\n\n```python\nparagraphs = [\"That aint bad!:D\"]\ntokenizer = SoMaJo(language=\"en_PTB\")\nsentences = tokenizer.tokenize_text(paragraphs)\n```\n\nPerformance of the English tokenizer:\n\n| Corpus               | Precision | Recall | F₁    |\n|----------------------|-----------|--------|-------|\n| English Web Treebank | 99.66     | 99.64  | 99.65 |\n\n\n## Development\n\nHere are some brief notes to help you get started:\n\n  - Preferably create a dedicated virtual environment.\n  - Make sure you have pip ≥ 21.3.\n  - Install the project in editable mode:\n    \n    ```sh\n    pip install -U -e .\n    ```\n  - Install the development dependencies:\n    \n    ```sh\n    pip install -r requirements_dev.txt\n    ```\n  - To run the tests:\n    \n    ```sh\n    python3 -m unittest discover\n    ```\n  - To build the documentation:\n    \n    ```sh\n    cd doc\n    make markdown\n    ```\n    Note that the created markdown is not perfect and needs some\n    manual postprocessing.\n  - To build the distribution files:\n    \n    ```sh\n    python3 -m build\n    ```\n\n## References\n\nIf you use SoMaJo for academic research, please consider citing the\nfollowing paper:\n\n  - Proisl, Thomas, and Peter Uhrig. 2016. 
“SoMaJo: State-of-the-Art\n    Tokenization for German Web and Social Media Texts.” In\n    *Proceedings of the 10th Web as Corpus Workshop (WAC-X) and the\n    EmpiriST Shared Task*, edited by Paul Cook, Stefan Evert, Roland\n    Schäfer, and Egon Stemle, 57–62. Berlin: Association for\n    Computational Linguistics. \u003chttps://doi.org/10.18653/v1/W16-2607\u003e.\n    \n    ```bibtex\n    @InProceedings{Proisl_Uhrig_EmpiriST:2016,\n      author    = {Proisl, Thomas and Uhrig, Peter},\n      title     = {{SoMaJo}: {S}tate-of-the-art tokenization for {G}erman web and social media texts},\n      year      = {2016},\n      booktitle = {Proceedings of the 10th {W}eb as {C}orpus Workshop ({WAC-X}) and the {EmpiriST} Shared Task},\n      editor    = {Cook, Paul and Evert, Stefan and Schäfer, Roland and Stemle, Egon},\n      address   = {Berlin},\n      publisher = {Association for Computational Linguistics},\n      pages     = {57--62},\n      doi       = {10.18653/v1/W16-2607},\n      url       = {https://aclanthology.org/W16-2607},\n    }\n    ```\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Ftsproisl%2FSoMaJo","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Ftsproisl%2FSoMaJo","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Ftsproisl%2FSoMaJo/lists"}