Skip to content

Instantly share code, notes, and snippets.

@temmyzeus
Created October 26, 2021 11:09
Show Gist options
  • Save temmyzeus/aa3c2c1dc2f96a798e2bf2d88f098927 to your computer and use it in GitHub Desktop.
Save temmyzeus/aa3c2c1dc2f96a798e2bf2d88f098927 to your computer and use it in GitHub Desktop.
News Classifier Datasets from json to csv format => https://www.kaggle.com/rmisra/news-category-dataset
"""Convert Data from .json to .csv easily readable by Pandas"""
import ast
import json
import os
import sys
from pathlib import Path
from typing import List, Dict

import pandas as pd
# Work from the directory that contains this script so the relative
# '../data/...' paths used below resolve regardless of where it is launched.
filename = sys.argv[0]
# Resolve to an absolute path first: when the script is started as
# `python script.py`, dirname(sys.argv[0]) is '' and os.chdir('') raises
# FileNotFoundError.
dir_name = os.path.dirname(os.path.abspath(filename))
os.chdir(dir_name)
# Load the raw dataset: `data` becomes a list with one string per input line
# (each line is parsed as an individual record further down).
# `mode` is an argument of open(), not of Path(); passing keyword arguments to
# Path() is deprecated and rejected by newer Python versions.
# NOTE(review): the file is assumed to be UTF-8 (the usual JSON encoding);
# pinning it avoids depending on the platform's default locale encoding.
with open(Path('../data/News_Category_Dataset_v2.json'), mode='r', encoding='utf-8') as f:
    data = f.readlines()
# Column-oriented accumulator for the output table: every key is a CSV column
# name and starts with its own, independent empty list.
data_dict: Dict[str, List] = {
    column: []
    for column in (
        'categories',
        'headlines',
        'authors',
        'links',
        'short_descriptions',
        'dates',
    )
}
# Parse every input line as one JSON object and append its fields to the
# matching column list; a field missing from a record is stored as 'Null'.
for line in data:
    # A blank line (e.g. a trailing newline at end of file) is not a record.
    if not line.strip():
        continue
    # The input is JSON, so use the JSON parser: ast.literal_eval() only
    # understands Python literals, fails on true/false/null, and leaves
    # escaped surrogate pairs as lone surrogates that cannot be encoded later.
    record = json.loads(line)
    data_dict['categories'].append(record.get('category', 'Null'))
    data_dict['headlines'].append(record.get('headline', 'Null'))
    data_dict['authors'].append(record.get('authors', 'Null'))
    data_dict['links'].append(record.get('link', 'Null'))
    data_dict['short_descriptions'].append(record.get('short_description', 'Null'))
    data_dict['dates'].append(record.get('date', 'Null'))
# Build the table from the collected columns and write it next to the input
# file, without the DataFrame's integer index column.
df = pd.DataFrame(data_dict)
# The former `errors=None` argument was dropped: to_csv documents `errors` as
# a string defaulting to 'strict', so None added nothing, and the keyword does
# not exist at all in pandas releases older than 1.1.
df.to_csv(Path('../data/News Category.csv'), index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment