Skip to content

Instantly share code, notes, and snippets.

@vladiant
Last active December 6, 2020 00:46
Show Gist options
  • Save vladiant/940c1639700dbb298544efafc6def61a to your computer and use it in GitHub Desktop.
Save vladiant/940c1639700dbb298544efafc6def61a to your computer and use it in GitHub Desktop.
Read csv and flatten json fields
# https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields/execution
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
def load_df(csv_path='../input/train.csv', nrows=None):
JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
df = pd.read_csv(csv_path,
converters={column: json.loads for column in JSON_COLUMNS},
dtype={'fullVisitorId': 'str'}, # Important!!
nrows=nrows)
for column in JSON_COLUMNS:
column_as_df = json_normalize(df[column])
column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
return df
# Faster
def load_df_faster(csv_path='../input/train.csv', nrows=None):
JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
df = pd.read_csv(csv_path, dtype={'fullVisitorId': 'str'}, nrows=nrows)
for column in JSON_COLUMNS:
df = df.join(pd.DataFrame(df.pop(column).apply(pd.io.json.loads).values.tolist(), index=df.index))
return df
print(os.listdir("../input"))
# https://mindtrove.info/flatten-nested-json-with-pandas/
import pandas as pd
import requests
headers = {
'Authorization': 'Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
'Content-Type': 'application/json'
}
resp = requests.get('https://api.gitter.im/v1/rooms/554d218a15522ed4b3e02126/chatMessages',
headers=headers)
resp.json()[0]
msgs = pd.io.json.json_normalize(resp.json())
msgs.dtypes
# Nested arrays
msgs.mentions.head(10)
msgs.mentions.apply(lambda mentions: len(mentions)).value_counts()
mentions = pd.io.json.json_normalize(resp.json(), record_path='mentions')
mentions.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment