https://github.com/dreamjet31/yelp_json_to_csv
https://github.com/dreamjet31/yelp_json_to_csv
json2csv python yelp-dataset
Last synced: 2 months ago
JSON representation
- Host: GitHub
- URL: https://github.com/dreamjet31/yelp_json_to_csv
- Owner: dreamjet31
- Created: 2023-06-29T04:46:50.000Z (almost 2 years ago)
- Default Branch: main
- Last Pushed: 2023-06-29T04:58:59.000Z (almost 2 years ago)
- Last Synced: 2025-02-10T08:49:35.480Z (4 months ago)
- Topics: json2csv, python, yelp-dataset
- Language: Python
- Homepage:
- Size: 2.93 KB
- Stars: 0
- Watchers: 1
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: README.md
Awesome Lists containing this project
README
# Yelp dataset JSON to CSV
```code
import json
import csv
import re
from yelp_fieldlists import get_headers, get_data

# Identifiers for the dataset types handled by this script.
BUSINESS = "business"
REVIEW = "review"
TIP = "tip"
CHECKIN = "checkin"
USER = "user"

# Every type identifier that json_to_csv() accepts as its file_type argument.
TYPE_LISTS = [BUSINESS, REVIEW, TIP, CHECKIN, USER]
def normalize_text(text):
    """
    Remove non-ASCII chars.

    Every run of one or more non-ASCII characters in *text* is replaced by a
    single space; the resulting string is returned.
    """
    return re.sub(r'[^\x00-\x7F]+', ' ', text)


def _open_csv_part(csv_file, file_type):
    """
    Open *csv_file* for writing and emit the header row for *file_type*.

    Returns a (file object, csv writer) pair; the caller owns the file object
    and is responsible for closing it.
    """
    # newline='' lets the csv module control line endings itself, as the
    # csv documentation requires for files handed to csv.writer.
    out_file = open(csv_file, 'w', encoding='utf-8', errors='replace',
                    newline='')
    try:
        csv_writer = csv.writer(out_file, lineterminator='\n')
        csv_writer.writerow(get_headers(file_type))
    except BaseException:
        # Do not leak the handle if the header cannot be written.
        out_file.close()
        raise
    return out_file, csv_writer


def json_to_csv(json_file, csv_prefix, file_type, max_rows=100000):
    """
    Convert a JSON-lines file into one or more CSV files.

    json_file  ==> .json file to be converted (one JSON document per line)
    csv_prefix ==> prefix for output CSV files; parts are named
                   "<csv_prefix>_<n>.csv" with n starting at 1
    file_type  ==> one of the values in TYPE_LISTS
    max_rows   ==> maximum number of data rows per CSV file (default: 100,000)

    Raises ValueError when file_type is not one of TYPE_LISTS.
    """
    if file_type not in TYPE_LISTS:
        raise ValueError('Type {} not defined.'.format(file_type))

    current_row = 0      # lines read from json_file so far (progress output)
    current_count = 0    # data rows written to the current CSV part
    csv_count = 1
    csv_file = f"{csv_prefix}_{csv_count}.csv"

    out_file, csv_writer = _open_csv_part(csv_file, file_type)
    try:
        with open(json_file, encoding='utf-8', errors='replace') as j_file:
            for line in j_file:
                current_row += 1
                print(current_row)
                data = json.loads(line)
                if file_type in (REVIEW, TIP):
                    # Normalization is applied element by element, so (for a
                    # string) each non-ASCII character becomes its own space.
                    data['text'] = ''.join(
                        normalize_text(char) for char in data['text']
                    )
                csv_writer.writerow(get_data(file_type, data))
                current_count += 1

                if current_count == max_rows:
                    # Current part is full: close it before starting the
                    # next one so no file handle is left open.
                    print(f"File {csv_file} created successfully.")
                    out_file.close()
                    current_count = 0
                    csv_count += 1
                    csv_file = f"{csv_prefix}_{csv_count}.csv"
                    out_file, csv_writer = _open_csv_part(csv_file, file_type)
    finally:
        # Closing an already-closed file is a no-op, so this is safe even if
        # opening the next part failed after the previous one was closed.
        out_file.close()

    print(f"File {csv_file} created successfully.")
def main():
    """
    Entry-point for the function.

    Converts the JSON file for the selected dataset type (TIP here) into CSV
    parts whose names start with the JSON file name (minus ".json") followed
    by "_part".
    """
    _type = TIP
    # The file name is derived from the selected type, so switching _type
    # is the only change needed to convert a different dataset file.
    json_file = "yelp_academic_dataset_{}.json".format(_type)
    csv_prefix = '{0}_part'.format(json_file.split('.json')[0])
    json_to_csv(json_file, csv_prefix, _type, max_rows=100000)


if __name__ == "__main__":
    main()
```

#### BUSINESS
```
def main():
    """
    Entry-point for the function.

    Variant of the entry point that converts the business dataset file
    instead of the tip dataset file.
    """
    _type = BUSINESS
    # The file name is derived from the selected type, so the name and the
    # type passed to json_to_csv cannot drift apart.
    json_file = "yelp_academic_dataset_{}.json".format(_type)
    csv_prefix = '{0}_part'.format(json_file.split('.json')[0])
    json_to_csv(json_file, csv_prefix, _type, max_rows=100000)
```