{"id":23173626,"url":"https://github.com/dreamjet31/yelp_json_to_csv","last_synced_at":"2025-04-05T00:25:20.227Z","repository":{"id":177176631,"uuid":"660025737","full_name":"dreamjet31/yelp_json_to_csv","owner":"dreamjet31","description":null,"archived":false,"fork":false,"pushed_at":"2023-06-29T04:58:59.000Z","size":3,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-02-10T08:49:35.480Z","etag":null,"topics":["json2csv","python","yelp-dataset"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/dreamjet31.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null}},"created_at":"2023-06-29T04:46:50.000Z","updated_at":"2023-06-29T05:00:17.000Z","dependencies_parsed_at":null,"dependency_job_id":"eb261d99-d311-4293-8472-ff66fbc39635","html_url":"https://github.com/dreamjet31/yelp_json_to_csv","commit_stats":null,"previous_names":["flurryunicorn/yelp_json_to_csv","dreamjet31/yelp_json_to_csv"],"tags_count":0,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/dreamjet31%2Fyelp_json_to_csv","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/dreamjet31%2Fyelp_json_to_csv/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/dreamjet31%2Fyelp_json_to_csv/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/dreamjet31%2Fyelp_json_to_csv/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/dreamjet31","download_url":"https://codeload.github.com/dreamjet31/yelp_json_to_csv/tar.gz/refs/heads/main","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":247268182,"owners_count":20911093,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["json2csv","python","yelp-dataset"],"created_at":"2024-12-18T05:16:42.402Z","updated_at":"2025-04-05T00:25:20.208Z","avatar_url":"https://github.com/dreamjet31.png","language":"Python","funding_links":[],"categories":[],"sub_categories":[],"readme":"# Yelp dataset JSON to CSV\n\n```code\nimport json\nimport csv\nimport re\nfrom yelp_fieldlists import get_headers, get_data\n\nBUSINESS = \"business\"\nREVIEW = \"review\"\nTIP = \"tip\"\nCHECKIN = \"checkin\"\nUSER = \"user\"\n\nTYPE_LISTS = [BUSINESS, REVIEW, TIP, CHECKIN, USER]\n\n\ndef normalize_text(text):\n    \"\"\"\n    Remove non-ASCII chars.\n    \"\"\"\n    text = re.sub('[^\\x00-\\x7F]+', ' ', text)\n    return text\n\ndef json_to_csv(json_file, csv_prefix, file_type, max_rows=100000):\n    current_row = 0\n    \"\"\"\n    json_file ==\u003e .json file to be converted\n    csv_prefix ==\u003e prefix for output CSV files\n    fileType ==\u003e 'business' or 'review' or 'tip'\n    max_rows ==\u003e maximum number of rows per CSV file (default: 1,000,000)\n    \"\"\"\n    if file_type not in TYPE_LISTS:\n        raise ValueError('Type {} not defined.'.format(file_type))\n\n    current_count = 0\n    csv_count = 1\n    csv_file = f\"{csv_prefix}_{csv_count}.csv\"\n\n    with open(csv_file, 'w', encoding='utf-8', errors='replace') as file:\n        csv_writer = csv.writer(file, lineterminator='\\n')\n        csv_writer.writerow(get_headers(file_type))\n        with open(json_file, encoding='utf-8', errors='replace') as j_file:\n            for line in j_file:\n                current_row += 1\n                print(current_row)\n                data = json.loads(line)\n                if file_type == REVIEW or file_type == TIP:  \n                    data['text'] = ''.join([normalize_text(text) for text in data['text']])\n                csv_writer.writerow(get_data(file_type, data))\n                current_count += 1\n\n                if current_count == max_rows:\n                    print(f\"File {csv_file} created successfully.\")\n                    current_count = 0\n                    csv_count += 1\n                    csv_file = f\"{csv_prefix}_{csv_count}.csv\"\n                    file = open(csv_file, 'w', encoding='utf-8', errors='replace')\n                    csv_writer = csv.writer(file, lineterminator='\\n')\n                    csv_writer.writerow(get_headers(file_type))\n\n    print(f\"File {csv_file} created successfully.\")\n\ndef main():\n    \"\"\"\n    Entry-point for the function.\n    \"\"\"\n    _type = TIP\n    json_file = \"yelp_academic_dataset_tip.json\".format(_type)\n    csv_prefix = '{0}_part'.format(json_file.split('.json')[0])\n    \n    json_to_csv(json_file, csv_prefix, _type, max_rows=100000)\n    \nif __name__ == \"__main__\":\n    main()\n```\n\n\n#### BUSINESS\n\n```\ndef main():\n    \"\"\"\n    Entry-point for the function.\n    \"\"\"\n    _type = BUSINESS\n    json_file = \"yelp_academic_dataset_business.json\".format(_type)\n    csv_prefix = '{0}_part'.format(json_file.split('.json')[0])\n    \n    json_to_csv(json_file, csv_prefix, _type, max_rows=100000)\n    \n```\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdreamjet31%2Fyelp_json_to_csv","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fdreamjet31%2Fyelp_json_to_csv","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdreamjet31%2Fyelp_json_to_csv/lists"}