https://github.com/acbikash13/ratemyprofessorsentimentanalysis
This project applies sentiment analysis to RateMyProfessor reviews using NLP techniques. It preprocesses data with Pandas and NumPy and uses pretrained Transformer models with PyTorch for fine-tuned sentiment classification. TextBlob provides initial sentiment scores, while transfer learning improves model accuracy, capturing nuanced feedback.
https://github.com/acbikash13/ratemyprofessorsentimentanalysis
machine-learning nltk-python pytorch sentiment-analysis tensorflow transformers
Last synced: 26 days ago
JSON representation
This project applies sentiment analysis to RateMyProfessor reviews using NLP techniques. It preprocesses data with Pandas and NumPy and uses pretrained Transformer models with PyTorch for fine-tuned sentiment classification. TextBlob provides initial sentiment scores, while transfer learning improves model accuracy, capturing nuanced feedback.
- Host: GitHub
- URL: https://github.com/acbikash13/ratemyprofessorsentimentanalysis
- Owner: acbikash13
- Created: 2024-11-08T17:46:21.000Z (over 1 year ago)
- Default Branch: main
- Last Pushed: 2024-11-08T18:21:54.000Z (over 1 year ago)
- Last Synced: 2025-10-08T09:54:12.372Z (8 months ago)
- Topics: machine-learning, nltk-python, pytorch, sentiment-analysis, tensorflow, transformers
- Homepage:
- Size: 678 KB
- Stars: 0
- Watchers: 1
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: README.ipynb
Awesome Lists containing this project
README
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"toc_visible": true,
"machine_shape": "hm",
"gpuType": "V100"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"f0eed8bbb0464f0c91648c5f2bda1717": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ea57cc0a7ed4424f8bff8b4f56893c6e",
"IPY_MODEL_4da77da5d4c346dd8680e536294184a3",
"IPY_MODEL_3c4ec79460a54b72bdccc50fe28af414"
],
"layout": "IPY_MODEL_1ab8077c160a43568f747cffd2b1acf3"
}
},
"ea57cc0a7ed4424f8bff8b4f56893c6e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7ebceff4841042658839ea4a4ce3b085",
"placeholder": "",
"style": "IPY_MODEL_870c3d1b1366421990f4dade7cc20e12",
"value": "config.json: 100%"
}
},
"4da77da5d4c346dd8680e536294184a3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_76a4617ff7f942e7a62b9b3a85b0a47a",
"max": 747,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_adbaba87d9dc4a149d56da9ab995a0ac",
"value": 747
}
},
"3c4ec79460a54b72bdccc50fe28af414": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c38d0da48a7140d5a94fe952d1e629db",
"placeholder": "",
"style": "IPY_MODEL_a15720df8df44e2f9e522020dda20983",
"value": " 747/747 [00:00<00:00, 57.1kB/s]"
}
},
"1ab8077c160a43568f747cffd2b1acf3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7ebceff4841042658839ea4a4ce3b085": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"870c3d1b1366421990f4dade7cc20e12": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"76a4617ff7f942e7a62b9b3a85b0a47a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"adbaba87d9dc4a149d56da9ab995a0ac": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"c38d0da48a7140d5a94fe952d1e629db": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a15720df8df44e2f9e522020dda20983": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"f62d0103b7f74fa184aac44aa11a83fb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_3d11a61db02c4cae941248eddad6ad7b",
"IPY_MODEL_a9b546bddfa34f1baca68f60a553ed1d",
"IPY_MODEL_2c44b84b999c49028d205a13a610311c"
],
"layout": "IPY_MODEL_f2b86db5faea4e0d880cd6c1534dfce3"
}
},
"3d11a61db02c4cae941248eddad6ad7b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_93f8297833fa4c138fb311418fcdca43",
"placeholder": "",
"style": "IPY_MODEL_d0d11c7f6d914c64ae1b494edc743d01",
"value": "pytorch_model.bin: 100%"
}
},
"a9b546bddfa34f1baca68f60a553ed1d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_072b471b516747d4be28b7147c197138",
"max": 498679497,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_df4729b709284ab0a06a5542554475d3",
"value": 498679497
}
},
"2c44b84b999c49028d205a13a610311c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e3cffd92f9b544dc8cb7bdbc86bd4436",
"placeholder": "",
"style": "IPY_MODEL_e2f4ca55ad2746f99d127c865584a3a6",
"value": " 499M/499M [00:08<00:00, 64.2MB/s]"
}
},
"f2b86db5faea4e0d880cd6c1534dfce3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"93f8297833fa4c138fb311418fcdca43": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d0d11c7f6d914c64ae1b494edc743d01": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"072b471b516747d4be28b7147c197138": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"df4729b709284ab0a06a5542554475d3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"e3cffd92f9b544dc8cb7bdbc86bd4436": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e2f4ca55ad2746f99d127c865584a3a6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"bc3c933ce9b44c6786b1bc0b00fd7111": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_bf5b408db6de46908113423a28911a51",
"IPY_MODEL_52fa176e08d94ac09791fc33e6a13580",
"IPY_MODEL_3e733ad0f536475d8b7a779ef541bb66"
],
"layout": "IPY_MODEL_4126545abe0c4c329a2b90850f9dd52c"
}
},
"bf5b408db6de46908113423a28911a51": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a44169381fe64e22816c99e2fa1f6a8e",
"placeholder": "",
"style": "IPY_MODEL_8cdb213ee5a44baea79e109542ae2184",
"value": "vocab.json: 100%"
}
},
"52fa176e08d94ac09791fc33e6a13580": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b192100647d845dc89da9fd153896af1",
"max": 898822,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_e4a80d0e8ba14d58b59fb999bd4410cb",
"value": 898822
}
},
"3e733ad0f536475d8b7a779ef541bb66": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_ba264991d37d496bbe4bbc88c36b60cf",
"placeholder": "",
"style": "IPY_MODEL_29cfa745459b4f17a2fd10501fe5c846",
"value": " 899k/899k [00:00<00:00, 1.10MB/s]"
}
},
"4126545abe0c4c329a2b90850f9dd52c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a44169381fe64e22816c99e2fa1f6a8e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8cdb213ee5a44baea79e109542ae2184": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b192100647d845dc89da9fd153896af1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e4a80d0e8ba14d58b59fb999bd4410cb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"ba264991d37d496bbe4bbc88c36b60cf": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"29cfa745459b4f17a2fd10501fe5c846": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"002a3013e6374846a2ff0b434b503393": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_e3f3e2983a5645b3a6a425235078534d",
"IPY_MODEL_bbd085ac324d47cd871535a57b97f863",
"IPY_MODEL_1ef50b7bf43f4ab09f810d33270fd2ae"
],
"layout": "IPY_MODEL_d2e39603a2144603ab106538f37572c0"
}
},
"e3f3e2983a5645b3a6a425235078534d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4870546e86894b7cbe559fd290c23998",
"placeholder": "",
"style": "IPY_MODEL_0ada5989ba224174b82509278775b4ca",
"value": "merges.txt: 100%"
}
},
"bbd085ac324d47cd871535a57b97f863": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_df9f5665dcc9448cb9c38d3078013d28",
"max": 456318,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_1fcc7464292b41fc98bfc155ddf8877f",
"value": 456318
}
},
"1ef50b7bf43f4ab09f810d33270fd2ae": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_10818e7eac16492494e28569f8dedf96",
"placeholder": "",
"style": "IPY_MODEL_348393c646d04084ab1b8686043585cf",
"value": " 456k/456k [00:00<00:00, 750kB/s]"
}
},
"d2e39603a2144603ab106538f37572c0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4870546e86894b7cbe559fd290c23998": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0ada5989ba224174b82509278775b4ca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"df9f5665dcc9448cb9c38d3078013d28": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1fcc7464292b41fc98bfc155ddf8877f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"10818e7eac16492494e28569f8dedf96": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"348393c646d04084ab1b8686043585cf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"a635405626fb41ccae7e4ce23b60b6dd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_0f320cfabffe4b8ca517e1c0df81c635",
"IPY_MODEL_16a4855dc37643d5b2a459d21214d58b",
"IPY_MODEL_62a95f89c42940c1811288a305cbc885"
],
"layout": "IPY_MODEL_dcf3a616eb4c4ca286b0c75619faf701"
}
},
"0f320cfabffe4b8ca517e1c0df81c635": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b0e52493e7c0435285197601387faafb",
"placeholder": "",
"style": "IPY_MODEL_7696c27af7254d2287c0811777c47cf4",
"value": "special_tokens_map.json: 100%"
}
},
"16a4855dc37643d5b2a459d21214d58b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e24c0d17f212462e845f523208ea1f01",
"max": 150,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_34fbc295f5b84b059167d05449f61d65",
"value": 150
}
},
"62a95f89c42940c1811288a305cbc885": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_bc667cd09af34390b427ebed8ee734d4",
"placeholder": "",
"style": "IPY_MODEL_d7e45a3206004d9fa589443274955a37",
"value": " 150/150 [00:00<00:00, 14.6kB/s]"
}
},
"dcf3a616eb4c4ca286b0c75619faf701": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b0e52493e7c0435285197601387faafb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7696c27af7254d2287c0811777c47cf4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"e24c0d17f212462e845f523208ea1f01": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"34fbc295f5b84b059167d05449f61d65": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"bc667cd09af34390b427ebed8ee734d4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d7e45a3206004d9fa589443274955a37": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Sentiment Analysis of RateMyProfessor Sample Data\n",
"\n"
],
"metadata": {
"id": "F11zz9KCg6zR"
}
},
{
"cell_type": "markdown",
"source": [
"# `1. Project Setup`\n"
],
"metadata": {
"id": "trPnxDU4Y2IR"
}
},
{
"cell_type": "markdown",
"source": [
"## `1.1 Introduction`\n"
],
"metadata": {
"id": "HhR6TI5YawEG"
}
},
{
"cell_type": "markdown",
"source": [
"## `1.2 Project Dependencies`"
],
"metadata": {
"id": "kZrfxA7ga3IK"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YPasYHfaghTg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "1e5b93e7-2356-4937-a510-7b78d9096973"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.23.5)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.3.post1)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n",
"Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n",
"Looking in indexes: https://download.pytorch.org/whl/cu118\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.0+cu118)\n",
"Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.16.0+cu118)\n",
"Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (2.1.0+cu118)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.13.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) (1.23.5)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.31.0)\n",
"Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (9.4.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2023.11.17)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
"Requirement already satisfied: ktrain in /usr/local/lib/python3.10/dist-packages (0.39.0)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.2.2)\n",
"Requirement already satisfied: matplotlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (3.7.1)\n",
"Requirement already satisfied: pandas>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.5.3)\n",
"Requirement already satisfied: fastprogress>=0.1.21 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.0.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.31.0)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.3.2)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from ktrain) (23.2)\n",
"Requirement already satisfied: langdetect in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.0.9)\n",
"Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.42.1)\n",
"Requirement already satisfied: charset-normalizer in /usr/local/lib/python3.10/dist-packages (from ktrain) (3.3.2)\n",
"Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from ktrain) (5.2.0)\n",
"Requirement already satisfied: syntok>1.3.3 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.4.4)\n",
"Requirement already satisfied: tika in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.6.0)\n",
"Requirement already satisfied: transformers>=4.17.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (4.35.2)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.1.99)\n",
"Requirement already satisfied: keras-bert>=0.86.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.89.0)\n",
"Requirement already satisfied: whoosh in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.7.4)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from keras-bert>=0.86.0->ktrain) (1.23.5)\n",
"Requirement already satisfied: keras-transformer==0.40.0 in /usr/local/lib/python3.10/dist-packages (from keras-bert>=0.86.0->ktrain) (0.40.0)\n",
"Requirement already satisfied: keras-pos-embd==0.13.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.13.0)\n",
"Requirement already satisfied: keras-multi-head==0.29.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.29.0)\n",
"Requirement already satisfied: keras-layer-normalization==0.16.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.16.0)\n",
"Requirement already satisfied: keras-position-wise-feed-forward==0.8.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.8.0)\n",
"Requirement already satisfied: keras-embed-sim==0.10.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.10.0)\n",
"Requirement already satisfied: keras-self-attention==0.51.0 in /usr/local/lib/python3.10/dist-packages (from keras-multi-head==0.29.0->keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.51.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (1.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (4.45.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (1.4.5)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (9.4.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (3.1.1)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.1->ktrain) (2023.3.post1)\n",
"Requirement already satisfied: regex>2016 in /usr/local/lib/python3.10/dist-packages (from syntok>1.3.3->ktrain) (2023.6.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (3.13.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.19.4)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (6.0.1)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.15.0)\n",
"Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.4.1)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (4.66.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect->ktrain) (1.16.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (2023.11.17)\n",
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->ktrain) (1.11.4)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->ktrain) (3.2.0)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tika->ktrain) (67.7.2)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers>=4.17.0->ktrain) (2023.6.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers>=4.17.0->ktrain) (4.5.0)\n",
"Requirement already satisfied: textblob in /usr/local/lib/python3.10/dist-packages (0.17.1)\n",
"Requirement already satisfied: nltk>=3.1 in /usr/local/lib/python3.10/dist-packages (from textblob) (3.8.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk>=3.1->textblob) (8.1.7)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk>=3.1->textblob) (1.3.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk>=3.1->textblob) (2023.6.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk>=3.1->textblob) (4.66.1)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.10/dist-packages (1.3.8)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai) (0.25.2)\n",
"Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (1.10.13)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.0)\n",
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.5 in /usr/local/lib/python3.10/dist-packages (from openai) (4.5.0)\n",
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.6)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.0)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2023.11.17)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (1.0.2)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n"
]
}
],
"source": [
"!pip install pandas numpy transformers\n",
"!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"!pip install ktrain\n",
"!pip install textblob\n",
"\n",
"!pip install openai"
]
},
{
"cell_type": "markdown",
"source": [
"We will need pandas to clean and work with our json review data, numpy for help with general mathematics operations. We are running various pre-trained sentiment models (which can be found at https://huggingface.co/) on the Transformers library, using PyTorch as our framework."
],
"metadata": {
"id": "sPwPtKtybiQZ"
}
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"import json\n",
"import re\n",
"import nltk\n",
"from transformers import pipeline\n",
"from ktrain.text.sentiment.core import SentimentAnalyzer\n",
"from textblob import TextBlob\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"nltk.download('wordnet')\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "m4wg29dzbowV",
"outputId": "ac766261-f49d-4388-bc11-134e16345730"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 104
}
]
},
{
"cell_type": "markdown",
"source": [
"# `2. Dataset Overview`"
],
"metadata": {
"id": "itjnjzKcbvWj"
}
},
{
"cell_type": "markdown",
"source": [
"This data was determined by the team to be a good source for our project due to the large volume of easily accessible review text that could be analyzed in a sentiment analysis project. In total, our scraper collected 21+ million reviews, 2+ million professor profiles, and nearly 8000 schools. The entire scope of this data includes information to filter by geographical region, single university, university department, or a unique professor. Collectively, reviews date from as early as 2001 all the way to present-day. Individual reviews contain data about what professor it’s associated with, class number, ratings data, reviewer comment, and date of review."
],
"metadata": {
"id": "FsaQySsUcAHW"
}
},
{
"cell_type": "markdown",
"source": [
"## `2.1 Data Collecition`\n"
],
"metadata": {
"id": "X-nFZBRCb2rj"
}
},
{
"cell_type": "code",
"source": [
"# Load the professor profile data\n",
"higher_professors = pd.read_json('https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/higher_professors.json')\n",
"lower_professors = pd.read_json('https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/lower_professors.json')\n",
"\n",
"# Load the reviews data\n",
"higher_reviews = pd.read_json('https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/higher_professor_reviews.json')\n",
"lower_reviews = pd.read_json('https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/lower_professor_reviews.json')\n",
"\n",
"higher_reviews_url =\"https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/higher_professor_reviews.json\"\n",
"lower_reviews_url = \"https://raw.githubusercontent.com/Will-Alger/csc425-sentiment-analysis/main/lower_professor_reviews.json\"\n",
"\n",
"\n",
"science_professors_url = \"https://raw.githubusercontent.com/ssdtac/Professor-Reviews/master/science_professors_v2.json\"\n",
"humanities_professors_url = \"https://raw.githubusercontent.com/ssdtac/Professor-Reviews/master/humanities_professors_v2.json\"\n"
],
"metadata": {
"id": "dKcHS0D5d11K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## `2.2 Dataset Description`\n",
"\n",
"To test the waters with a pre-trained model, 10 professors were chosen from our database.\n",
"5 were selected with an overall lower avgRating, and 5 were selected with an overall higher avgRating.\n",
"\n",
"Selection of higher rated professors:\n",
"```\n",
"select *\n",
" from professors \n",
" where avgRating between 3.5 and 4\n",
" and numRatings between 20 and 35\n",
"limit 5\n",
"```\n",
"\n",
"Selection of lower rated professors:\n",
"```\n",
"select *\n",
" from professors\n",
" where avgRating <= 2.5\n",
" and numRatings between 20 and 50\n",
"limit 5\n",
"```\n",
"\n",
"The URLs for science and humanities professors from NKU will be used later."
],
"metadata": {
"id": "e6e61xp7b6Te"
}
},
{
"cell_type": "markdown",
"source": [
"# `3. Data Preprocessing and Cleaning`\n",
"\n",
"The following preprocessing methods will be used throughout the project."
],
"metadata": {
"id": "q5GPAGaccG4c"
}
},
{
"cell_type": "code",
"source": [
"def preprocess_text(text):\n",
" text = text.lower()\n",
" text = re.sub(r'
', ' ', text)\n",
" text = re.sub(r'\\W', ' ', text)\n",
" text = re.sub(r'\\s+[a-zA-Z]\\s+', ' ', text)\n",
" text = re.sub(r'\\^[a-zA-Z]\\s+', ' ', text)\n",
" text = re.sub(r'\\s+', ' ', text)\n",
" text = re.sub(r'^b\\s+', '', text)\n",
" tokens = word_tokenize(text)\n",
" stop_words = set(stopwords.words('english'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]\n",
" lemmatized_tokens = [token for token in lemmatized_tokens if len(token) > 3]\n",
" return ' '.join(lemmatized_tokens)\n",
"\n",
"\n",
"\n",
"def load_and_preprocess_data(url):\n",
" data = pd.read_json(url)\n",
" original_row_count = len(data)\n",
" print(f\"Rows before preprocessing: {original_row_count}\")\n",
" data = data[data['comment'].ne('No Comments')]\n",
" preprocessed_row_count = len(data)\n",
" print(f\"Rows discarded for being 'No Comments': {original_row_count - preprocessed_row_count}\")\n",
" data['comment'] = data['comment'].apply(preprocess_text)\n",
" data = data[data['comment'].notna() & data['comment'].str.strip().ne('')]\n",
" filtered_row_count = len(data)\n",
" print(f\"Rows discarded for no rating: {preprocessed_row_count - filtered_row_count}\")\n",
" data = data[data['qualityRating'].between(1, 5, inclusive='both') & data['difficultyRating'].between(1, 5, inclusive='both')]\n",
"# data['qualityRating'] = data['qualityRating'].apply(preprocess_ratings)\n",
" print(f\"Rows left after preprocessing: {len(data)}\")\n",
" return data"
],
"metadata": {
"id": "-s-pWEHne1m4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## `3.1 Defining our preprocessed data`\n",
"\n",
"Its worth noting that there exists roughly twice the number of reviews for humanities as there are sciences. A more accurate approach might include balancing these two datasets."
],
"metadata": {
"id": "lx1aJ2KnKF2y"
}
},
{
"cell_type": "code",
"source": [
"science_professors = load_and_preprocess_data(science_professors_url)\n",
"humanities_professors = load_and_preprocess_data(humanities_professors_url)\n",
"professors = pd.concat([science_professors, humanities_professors], ignore_index=True)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fzP8MiJNJ93a",
"outputId": "30bde84d-017f-41ea-c4b5-3980a0be4179"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Rows before preprocessing: 6412\n",
"Rows discarded for being 'No Comments': 56\n",
"Rows discarded for no rating: 5\n",
"Rows left after preprocessing: 6338\n",
"Rows before preprocessing: 11389\n",
"Rows discarded for being 'No Comments': 178\n",
"Rows discarded for no rating: 22\n",
"Rows left after preprocessing: 11166\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# `4. Sentiment Analysis Model Proof of Concept`"
],
"metadata": {
"id": "2eoSfu68cbg6"
}
},
{
"cell_type": "markdown",
"source": [
"### `4.1 A Preliminary Model`"
],
"metadata": {
"id": "cRKGzGDOc6sS"
}
},
{
"cell_type": "markdown",
"source": [
"In the Transformers library, there is a pipeline abstraction. According to the documentation, pipelines serve as \"objects that abstract most of the complex code from the library, offering a simple API dedicated to several tasks... [such as] Sentiment Analysis.\" To start the project, we have opted to use pipelines. This will serve to simplify the start of the project, allowing us to build iteratively."
],
"metadata": {
"id": "kTBlBBIGc_MF"
}
},
{
"cell_type": "code",
"source": [
"pipe = pipeline(task='sentiment-analysis', framework='pt', model='distilbert-base-uncased-finetuned-sst-2-english')"
],
"metadata": {
"id": "6hReRnq4lBNs"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### `4.2 Clean and Preprocess data`"
],
"metadata": {
"id": "iVw2u0HvmtFs"
}
},
{
"cell_type": "code",
"source": [
"higher_reviews_processed = load_and_preprocess_data(higher_reviews_url)\n",
"lower_reviews_processed = load_and_preprocess_data(lower_reviews_url)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ufbzTW5VmsLX",
"outputId": "6d8a4a96-e1e2-4f7d-ebfe-ea2c12e237ac"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Rows before preprocessing: 151\n",
"Rows discarded for being 'No Comments': 12\n",
"Rows discarded for no rating: 1\n",
"Rows left after preprocessing: 138\n",
"Rows before preprocessing: 177\n",
"Rows discarded for being 'No Comments': 11\n",
"Rows discarded for no rating: 1\n",
"Rows left after preprocessing: 165\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### `4.3 Calculate sentiment distribution for professors`"
],
"metadata": {
"id": "CJdPmKn2Lky0"
}
},
{
"cell_type": "code",
"source": [
"positive_higher = 0\n",
"negative_higher = 0\n",
"positive_lower = 0\n",
"negative_lower = 0\n",
"\n",
"# Analyze sentiment for higher-rated professors\n",
"for index, row in higher_reviews_processed.iterrows():\n",
" sentiment = pipe(row['comment'])\n",
" if sentiment[0]['label'] == 'POSITIVE':\n",
" positive_higher += 1\n",
" else:\n",
" negative_higher += 1\n",
"\n",
"# Analyze sentiment for lower-rated professors\n",
"for index, row in lower_reviews_processed.iterrows():\n",
" sentiment = pipe(row['comment'])\n",
" if sentiment[0]['label'] == 'POSITIVE':\n",
" positive_lower += 1\n",
" else:\n",
" negative_lower += 1\n",
"\n",
"average_quality_higher_professors = higher_professors['avgRating'].mean()\n",
"average_quality_lower_professors = lower_professors['avgRating'].mean()\n",
"\n",
"# Average overall rating for higher and lower rated professors\n",
"print(f\"Average overall rating for selected higher-rated professors: {average_quality_higher_professors}\")\n",
"print(f\"Average overall rating for selected lower-rated professors: {average_quality_lower_professors}\")\n",
"\n",
"# Frequencies\n",
"print(f\"\\nHigher-rated professors - Positive: {positive_higher}, Negative: {negative_higher}\")\n",
"print(f\"Lower-rated professors - Positive: {positive_lower}, Negative: {negative_lower}\")"
],
"metadata": {
"id": "V8xUX2fvl-_s",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "7a502e8a-500d-4746-8a00-2f6853541705"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Average overall rating for selected higher-rated professors: 3.72\n",
"Average overall rating for selected lower-rated professors: 2.16\n",
"\n",
"Higher-rated professors - Positive: 69, Negative: 69\n",
"Lower-rated professors - Positive: 48, Negative: 117\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### `4.4 Display a distribution of professor review sentiment`"
],
"metadata": {
"id": "ChJSuxcwIplS"
}
},
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"categories = ['Higher-rated Professors', 'Lower-rated Professors']\n",
"higher_counts = [positive_higher, negative_higher]\n",
"lower_counts = [positive_lower, negative_lower]\n",
"\n",
"pos = np.arange(len(categories))\n",
"bar_width = 0.35\n",
"\n",
"fig, ax = plt.subplots()\n",
"\n",
"bar_higher = ax.bar(pos, higher_counts, bar_width, label='Positive Reviews')\n",
"bar_lower = ax.bar(pos + bar_width, lower_counts, bar_width, label='Negative Reviews')\n",
"\n",
"ax.set_xlabel('Sentiment')\n",
"ax.set_ylabel('Count')\n",
"ax.set_title('Sentiment Count Distribution by Professor Rating')\n",
"ax.set_xticks(pos + bar_width / 2)\n",
"ax.set_xticklabels(categories)\n",
"ax.legend()\n",
"\n",
"plt.show()\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"id": "xfeZ6MIjAgLY",
"outputId": "36e45b9e-8bb6-4767-cef9-f961891486ef"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABif0lEQVR4nO3dd1gU1/s28HspC0svIi0IWBAw2I0Bo6JisMTYKyoSe42FJJJ8rVFRE0vsUSOWYDSxx17RWKIoYkVUApaIYqGIREA47x++zM+lgyAw3p/r2kt3yplnZneHe8+UVQghBIiIiIhkSqOsCyAiIiIqTQw7REREJGsMO0RERCRrDDtEREQkaww7REREJGsMO0RERCRrDDtEREQkaww7REREJGsMO0RERCRrDDtUogYMGAAHB4eyLoNKkUKhwNSpU0t9OSEhIVAoFAgJCZGGeXp64sMPPyz1ZQNATEwMFAoF1q5d+06W96Z3uZ7vyv79+1G3bl3o6upCoVAgISGhrEuqsDw9PeHp6VnWZVQoDDsV2JUrV9CtWzfY29tDV1cXtra2aN26NRYvXlyqy33w4AGmTp2K8PDwUl1OaUlJScHUqVPV/ogWxqNHj+Dv7w9nZ2fo6elBX18fDRo0wIwZM8rNjnvjxo1YuHBhoad3cHCAQqGAQqGAhoYGTExM4ObmhiFDhuDs2bNlVte7VJ5rK21Zr33W629jY4NPP/20yJ+Ngjx9+hQ9evSASqXC0qVLsWHDBujr65foMsrSgAED1Laljo4OnJycMHnyZLx8+bJYbV6/fh1Tp05FTExMyRb7nlLwt7EqptOnT6NFixaoUqUKfH19YWVlhXv37uHvv/9GVFQUbt++XWrLPn/+PBo1aoSgoCAMGDBAbVx6ejoyMzOho6NTast/W0+ePIGFhQWmTJlS6B6K0NBQtGvXDsnJyejbty8aNGgA4PW22LRpEzw8PHDw4MFSrLpwPvvsM1y9erXQO0gHBweYmppiwoQJAIDnz58jIiICf/zxBx4+fIhx48Zh/vz5avO8fPkSWlpa0NLSKrW6ACAzMxNpaWlQKpXQ0Hj9vczT0xNPnjzB1atXC91OcWsTQiA1NRXa2trQ1NQsseUVRmmsZ24UCgVat26N/v37QwiB6OhoLFu2DHFxcdizZw/atm1bIsvZv38/2rZti0OHDsHLy6tE2ixPBgwYgE2bNmH16tUAgMTEROzcuROHDh1Cnz59EBwcXOQ2t2zZgu7du+PYsWM5enHS0tIAAEql8q1rf18Ufm9F5crMmTNhbGyM0NBQmJiYqI2Li4srm6IAaGtrl9myS0tCQgI6d+4MTU1NXLx4Ec7OzmrjZ86ciVWrVpVRdW/P1tYWffv2VRs2Z84c9OnTBwsWLECNGjUwfPhwaZyurm6p1vPy5Usp4JT2svKjUCjKdPnvipOTk9rr37lzZ9SuXRsLFy7MM+y8+RoVRtY+Kfu+qqIQQuDly5dQqVR5TqOlpaW2HUeMGAEPDw/89ttvmD9/PiwtLUusHoacYhBUIdWsWVN4enoWevoNGzaI+vXrC11dXWFqaip69uwp7t69qzZN8+bNRa1atcS1a9eEp6enUKlUwsbGRsyZM0ea5tixYwJAjkdQUJAQQghfX19hb28vTR8dHS0AiB9++EEsWbJEODo6CpVKJVq3bi3u3r0rMjMzxfTp04Wtra3Q1dUVn3/+uXj69GmO+vfu3Ss++eQToaenJwwMDES7du3E1atX1abx9fUV+vr64v79+6Jjx45CX19fVKpUSUyYMEG8evVKrZ7sjylTpuS57WbPni0AiODg4EJv76VLlwpXV1ehVCqFtbW1GDFihIiPj1ebxt7eXvj6+uaYt3nz5qJ58+bS86xtvnnzZjFjxgxha2srdHR0RMuWLcWtW7fU5su+Xm++Frmxt7cX7du3z3Xc8+fPhZmZmbC1tRWZmZnS8OzbKykpSXz55ZfC3t5eKJVKYWFhIby8vMSFCxcKrCtr3X777Tfx3XffCRsbG6FQKER8fLw07tixY2rrWKtWLXH+/Hnh7u4udHV1hYODg1i+fLla7UFBQQKAiI6OVhuevc38ast6r2S9t7McOXJEei8aGxuLzz//XFy/fl1tmilTpggA4tatW8LX11cYGxsLIyMjMWDAAPHixYt8XpHCr+fz58+Fnp6eGDNmTI757927JzQ0NMSsWbPyXQ4AMXLkyBzDK1WqJGrUqCGEyP81EkKI33//Xdq3mJubCx8fH3H//n21dcm+jd983//999/C29tbGBkZCZVKJZo1ayZOnjypVk9B7zEhhLh586bo0qWLsLS0FDo6OsLW1lb07NlTJCQkSNOkp6eL6dOni6pVqwqlUins7e1FQECAePnypdrysj4X+/fvFw0aNBA6OjpiwYIFeW7HrH1Pdv7+/gKAOH36tDQsJiZGDB8+XDg5OQldXV1hZmYmunXrpvZezXr/Zn+8+b4tzj4iS9a+WFdXVzRq1EicOHEiR5tyw56dCsre3h5nzpzB1atXCzyRcebMmZg0aRJ69OiBQYMG4fHjx1i8eDGaNWuGixcvqn3bio+PR5s2bdClSxf06NEDW7ZswTfffAM3Nze0bdsWLi4umD59OiZPnowhQ4agadOmAAAPD498awgODkZaWhpGjx6NZ8+eYe7cuejRowdatmyJkJAQfPPNN7h9+zYWL14Mf39/rFmzRpp3w4YN8PX1hbe3N+bMmYOUlBQsX74cn3zyCS5evKh2QnRGRga8vb3RuHFj/Pjjjzh8+DDmzZuHatWqYfjw4bCwsMDy5csxfPhwdO7cGV26dAEA1K5dO8/ad+3aBZVKhW7duuW7jlmmTp2KadOmwcvLC8OHD0dkZCSWL1+O0NBQnDp1qti9X7Nnz4aGhgb8/f2RmJiIuXPnwsfHRzq35rvvvkNiYiLu37+PBQsWAAAMDAyKtayseTt37oxffvkF169fR61atXKdbtiwYdiyZQtGjRoFV1dXPH36FCdPnkRERATq169fqLq+//57KJVK+Pv7IzU1Nd9vrvHx8WjXrh169OiB3r174/fff8fw4cOhVCrxxRdfFGkdi7rNDh8+jLZt26Jq1aqYOnUq/vvvPyxevBhNmjRBWFhYjpPze/ToAUdHRwQGBiIsLAyrV69G5cqVMWfOnAJrK2g9s16fzZs3Y/78+WqH2n777TcIIeDj41Ok7ZG13Pj4eFSvXl1teG6v0dq1a+Hn54dGjRohMDAQjx49wk8//YRTp05J+5bvvvsONWvWxMqVKzF9+nQ4OjqiWrVqAICjR4+ibdu2aNCgAaZMmQINDQ0EBQWhZcuW+Ouvv/DRRx8BKPg9lpaWBm9vb6SmpmL06NGwsrLCv//+i927dyMhIQHGxsYAgEGDBmHdunXo1q0bJkyYgLNnzyIwMBARERHYvn272vpGRkaid+/eGDp0KAYPHoyaNWsWeVtmHRo1NTWVhoWGhuL06dPo1asXPvjgA8TExGD58uXw9PTE9evXoaenh2bNmmHMmDFYtGgRvv32W7i4uACA9G9eCtpHAMDy5csxatQoNG3aFOPGjUNMTAw6deoEU1NTfPDBB0VexwqjrNMWFc/BgweFpqam0NTUFO7u7uLrr78WBw4cEGlpaWrTxcTECE1NTTFz5ky14VeuXBFaWlpqw7O+ga1fv14alpqaKqysrETXrl2lYaGhobl+4xUi754dCwsLtW9YAQEBAoCoU6eOSE9Pl4b37t1bKJVK6ZvW8+fPhYmJiRg8eLDach4+fCiMjY3Vhvv6+goAYvr06WrT1qtXTzRo0EB6/vjx4wJ7c95kamoq6tSpU6hp4+LihFKpFJ9++qnIyMiQhi9ZskQAEGvWrJGGFbVnx8XFRaSmpkrDf/rpJwFAXLlyRRrWvn37Antz3pRfz44QQixYsEAAEDt37pSGZd92xsbGufYOvCmvurLWrWrVqiIlJSXXcdl7dgCIefPmScNSU1NF3bp1ReXKlaX3f2F7dvKrLbeenazlvNn7eOnSJaGhoSH69+8vDcvq2fniiy/U2uzcubMwNzfPsazsCrueBw4cEADEvn371OavXbt2ob6lAxADBw4Ujx8/FnFxceLs2bOiVatWasvO6zVKS0sTlStXFh9++KH477//pOG7d+8WAMTkyZOlYVmvR2hoqDQsMzNT1KhRQ3h7e6v1HKakpAhHR0fRunVraVhB77GLFy8KAOKPP/7Ic5rw8HABQAwaNEhteFbvy9GjR6Vh9vb2AoDYv39/nu29Katn5/Hjx+Lx48fi9u3b4scffxQKhUJ8+OGHOdYvuzNnzuTY9/7xxx853qtZiruPSE1NFebm5qJRo0Zq+921a9cKALLu2eHVWBVU69atcebMGXz++ee4dOkS5s6dC29vb9ja2mLXrl3SdNu2bUNmZiZ69OiBJ0+eSA8rKyvUqFEDx44dU2vXwMBA7bizUqnERx99hH/++eet6u3evbv07QoAGjduDADo27ev2omujRs3RlpaGv79918AwKFDh5CQkIDevXur1a+pqYnGjRvnqB94/S3