{"id":17017612,"url":"https://github.com/datahappy1/czech_language_sentiment_analyzer","last_synced_at":"2026-03-11T01:02:04.917Z","repository":{"id":41490273,"uuid":"197976030","full_name":"datahappy1/czech_language_sentiment_analyzer","owner":"datahappy1","description":"Czech sentiment analyzer","archived":false,"fork":false,"pushed_at":"2023-05-22T22:36:57.000Z","size":213619,"stargazers_count":3,"open_issues_count":1,"forks_count":1,"subscribers_count":0,"default_branch":"master","last_synced_at":"2025-04-22T18:48:25.561Z","etag":null,"topics":["bootstrap","chartsjs","czech","czech-language","czech-sentiment-analyzer","flask","heroku","heroku-app","logistic-regression","movie-ratings","movie-reviews","naive-bayes","postgres","python","python-3","scraper","sentiment","sentiment-analysis","sqlite3","support-vector-machine"],"latest_commit_sha":null,"homepage":"http://czester.herokuapp.com","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/datahappy1.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2019-07-20T20:02:24.000Z","updated_at":"2023-07-07T14:15:42.000Z","dependencies_parsed_at":"2025-04-22T18:40:55.082Z","dependency_job_id":null,"html_url":"https://github.com/datahappy1/czech_language_sentiment_analyzer","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/datahappy1/czech_language_sentiment_analyzer","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datahappy1%2Fczech_language_sentiment_analyzer","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datahappy1%2Fczech_language_sentiment_analyzer/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datahappy1%2Fczech_language_sentiment_analyzer/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datahappy1%2Fczech_language_sentiment_analyzer/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/datahappy1","download_url":"https://codeload.github.com/datahappy1/czech_language_sentiment_analyzer/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/datahappy1%2Fczech_language_sentiment_analyzer/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":30364607,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-03-10T21:41:54.280Z","status":"ssl_error","status_checked_at":"2026-03-10T21:40:59.357Z","response_time":106,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["bootstrap","chartsjs","czech","czech-language","czech-sentiment-analyzer","flask","heroku","heroku-app","logistic-regression","movie-ratings","movie-reviews","naive-bayes","postgres","python","python-3","scraper","sentiment","sentiment-analysis","sqlite3","support-vector-machine"],"created_at":"2024-10-14T06:37:05.022Z","updated_at":"2026-03-11T01:02:04.882Z","avatar_url":"https://github.com/datahappy1.png","language":"Python","funding_links":[],"categories":[],"sub_categories":[],"readme":"##### 10000 ft. Overview\n![10000 ft overview][10000ft_overview]\n\n[10000ft_overview]: https://github.com/datahappy1/czech_language_sentiment_analyzer/blob/master/docs/img/10000ft_project_overview.png?raw=true \"10000 ft. overview\"\n\n##### Data Collection\n56k Czech movie reviews were collected using the \u003ca href=\"https://github.com/datahappy1/czech_language_sentiment_analyzer/blob/master/data_preparation/data_collector_movie_review_scraper.py\"\u003e/data_preparation/data_collector_movie_review_scraper.py\u003c/a\u003e\nmultithreaded HTML scraping module. These reviews were scrubbed using `langdetect` module to remove reviews written in Slovak language. This dataset was also scrubbed against a collection of Czech stopwords.  To have the data balanced with the same amount of negative and positive reviews, the\nfinal dataset had to be reduced to 11.5k positive and 11.5k negative reviews. Collected data was also stemmed before training the models.\n\n##### ML Models\nFrom `Scikit-Learn` Python library, `Naive Bayes`, `Logistic regression` and `Support Vector Machine` ML models were used\nfor training and testing data for text sentiment analysis.\nThe scripts for training and testing are located here: \n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://github.com/datahappy1/czech_language_sentiment_analyzer/tree/master/ml_models/logistic_regression\"\u003e/ml_models/logistic_regression\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://github.com/datahappy1/czech_language_sentiment_analyzer/tree/master/ml_models/naive_bayes\"\u003e/ml_models/naive_bayes\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://github.com/datahappy1/czech_language_sentiment_analyzer/tree/master/ml_models/support_vector_machine\"\u003e/ml_models/support_vector_machine\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\nThe overall sentiment score for the specified text input is calculated as a weighted average based on the precision score accuracy of these 3 model predictions.\n\n##### Flask web application\nThe Flask web application is currently hosted at \u003ca href=\"https://czester.herokuapp.com\"\u003ehttps://czester.herokuapp.com\u003c/a\u003e, source code can be found in this location \u003ca href=\"https://github.com/datahappy1/czech_language_sentiment_analyzer/tree/master/flask_webapp\"\u003e/flask_webapp/\u003c/a\u003e.\nThis application backend is written in Python using the `Flask` framework and `Bootstrap` for the templates styling. This app also provides the users with a simple API. The stats module is a result of an integration between `Chart.js` and `Flask` where the statistics data persistence layer can be either `Sqlite3` or `Heroku Postgres`.\nIf you provide this app with a environment variable named `DATABASE_URL` containing the Heroku Postgres DB URL like `postgres://YourPostgresUrl`, then remote `Heroku Postgres` will be used, otherwise local `Sqlite3` db instance will be used.\n\n##### Input text dataflow diagram:\n![Input text dataflow diagram][input_text_dataflow]\n\n[input_text_dataflow]: https://github.com/datahappy1/czech_language_sentiment_analyzer/blob/master/docs/img/input_text_flow_diagram.png?raw=true \"input text dataflow\"\n\n##### How to run this Flask App from local environment\n1) create and activate a standard Python virtual or pipenv environment \u003cbr\u003e\n2) `pip3` install the requirements from `requirements.txt` \u003cbr\u003e\n3) set the working directory for instance to the path where you cloned this repo (Make sure it's the path where the Heroku `Procfile` file is located)\n\n##### TODOs / Future ideas\n\u003cul\u003e\n    \u003cli\u003e\u003cdel\u003eRemove reviews written in Slovak language\u003c/del\u003e \t\u0026#10003;\u003c/li\u003e\n    \u003cli\u003e\u003cdel\u003eVerify input text is written in Czech language\u003c/del\u003e \t\u0026#10003;\u003c/li\u003e\n    \u003cli\u003e\u003cdel\u003eAdd Flask web app tests\u003c/del\u003e \t\u0026#10003;\u003c/li\u003e\n    \u003cli\u003e\u003cdel\u003eAdd Czech word stemmatizer module\u003c/del\u003e \t\u0026#10003;\u003c/li\u003e\n    \u003cli\u003eEnsembling instead of weighted model precision average for overall sentiment\u003c/li\u003e\n    \u003cli\u003eRedis could replace Sqlite3 / Postgres\u003c/li\u003e\n    \u003cli\u003eMigrate from Heroku to AWS\u003c/li\u003e\n\u003c/ul\u003e\n\n##### Useful links\n\u003cul\u003e\n    \u003cli\u003e\u003ca href=\"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html\"\u003eScikit-Learn working with text data\u003c/a\u003e\u003c/li\u003e\n    \u003cli\u003e\u003ca href=\"https://www.pluralsight.com/guides/ensemble-modeling-scikit-learn\"\u003eEnsembling with Scikit-Learn\u003c/a\u003e\u003c/li\u003e\n    \u003cli\u003e\u003ca href=\"https://towardsdatascience.com/two-is-better-than-one-ensembling-models-611ee4fa9bd8\"\u003eEnsembling models\u003c/a\u003e\u003c/li\u003e\n    \u003cli\u003e\u003ca href=\"https://pypi.org/project/langdetect/\"\u003eLangdetect PyPi project homepage\u003c/a\u003e\u003c/li\u003e\n    \u003cli\u003e\u003ca href=\"https://www.chartjs.org/docs/latest/charts/\"\u003eChart.js homepage\u003c/a\u003e\u003c/li\u003e\n    \u003cli\u003e\u003ca href=\"https://medium.com/the-andela-way/deploying-a-python-flask-app-to-heroku-41250bda27d0\"\u003eDeploying Flask to Heroku tutorial\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdatahappy1%2Fczech_language_sentiment_analyzer","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fdatahappy1%2Fczech_language_sentiment_analyzer","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdatahappy1%2Fczech_language_sentiment_analyzer/lists"}