
@aminnj
Last active October 29, 2018 01:59
Parse most of Google Takeout data into a single dataframe
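A rough usage sketch (assuming the Takeout archive has been unzipped into the working directory so the Takeout/ folder sits next to this script; pandas.HDFStore additionally needs the PyTables package installed):

    dfs = get_dfs(which=["search", "chrome"])   # parse just a couple of sources
    make_and_save_dump("dump.h5")               # parse everything, merge, and write to HDF5
    df = load_dump("dump.h5")                   # reload the merged dataframe later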
import json

import numpy as np
import pandas as pd

pd.set_option('display.width', None)

def get_dfs(which=["android","assistant","chrome","gmail","playstore","image","maps","search","voice","location"]):
    dfs = {}

    def get_fname(name):
        if "Location History" in name:
            return "Takeout/Location History/Location History.json"
            # return "../small.json"
        else:
            return "Takeout/My Activity/{}/MyActivity.json".format(name)

    if "android" in which:
        df = pd.read_json(get_fname("Android")).drop(columns=["products","titleUrl","details","header"])
        df.time = pd.to_datetime(df.time)
        dfs["android"] = df

    if "assistant" in which:
        df = pd.read_json(get_fname("Assistant")).drop(columns=["titleUrl","header","products","locations"])
        df["usedHotword"] = ~(df.details.isna())
        df["gotResponse"] = ~(df.subtitles.isna())
        df["audioFile"] = df.audioFiles.apply(lambda x: x[0] if type(x) is list else None)
        df = df.drop(columns=["details","subtitles","audioFiles"])
        df.time = pd.to_datetime(df.time)
        dfs["assistant"] = df

    if "chrome" in which:
        df = pd.read_json(get_fname("Chrome")).drop(columns=["products","header"])
        # strip the google.com redirect wrapper to recover the visited URL
        df["url"] = df.titleUrl.apply(lambda x: x.split("google.com/url?q=",1)[-1] if type(x) is not float else None)
        df = df.drop(columns=["titleUrl"])
        df.time = pd.to_datetime(df.time)
        dfs["chrome"] = df

    if "gmail" in which:
        df = pd.read_json(get_fname("Gmail")).drop(columns=["products","titleUrl","header"])
        df.time = pd.to_datetime(df.time)
        dfs["gmail"] = df

    if "playstore" in which:
        df = pd.read_json(get_fname("Google Play Store")).drop(columns=["products","titleUrl","header"])
        df.time = pd.to_datetime(df.time)
        dfs["playstore"] = df

    if "image" in which:
        df = pd.read_json(get_fname("Image Search")).drop(columns=["products","titleUrl","header","details"])
        df.time = pd.to_datetime(df.time)
        dfs["image"] = df

    if "maps" in which:
        df = pd.read_json(get_fname("Maps")).drop(columns=["products","titleUrl","header","locations","details","description","subtitles"])
        df.time = pd.to_datetime(df.time)
        dfs["maps"] = df

    if "search" in which:
        df = pd.read_json(get_fname("Search")).drop(columns=["products","titleUrl","header","locations","details"])
        df.time = pd.to_datetime(df.time)
        dfs["search"] = df

    if "voice" in which:
        df = pd.read_json(get_fname("Voice and Audio")).drop(columns=["products","titleUrl","header"])
        df["usedHotword"] = ~(df.details.isna())
        df["audioFile"] = df.audioFiles.apply(lambda x: x[0] if type(x) is list else None)
        df = df.drop(columns=["details","audioFiles"])
        df.time = pd.to_datetime(df.time)
        dfs["voice"] = df

    if "location" in which:
        df = pd.DataFrame(json.load(open(get_fname("Location History")))["locations"])
        # pick the highest-confidence activity type, heavily penalizing "UNKNOWN"
        best_activity = lambda y: max(y[0]["activity"], key=lambda z: z.get("confidence",0)-100*(z.get("type","")=="UNKNOWN")).get("type",None)
        best_confidence = lambda y: max(y[0]["activity"], key=lambda z: z.get("confidence",0)-100*(z.get("type","")=="UNKNOWN")).get("confidence",0)
        df["confidence"] = df["activity"].apply(lambda x: best_confidence(x) if type(x) is not float else 0)
        df["activity"] = df["activity"].apply(lambda x: best_activity(x) if type(x) is not float else None)
        # coordinates are stored as integers scaled by 1e7
        df["lat"] = df.latitudeE7/1.e7
        df["long"] = df.longitudeE7/1.e7
        df["time"] = pd.to_datetime(pd.to_numeric(df.timestampMs), unit="ms")
        df.drop(columns=["latitudeE7","longitudeE7","timestampMs"], inplace=True)
        dfs["location"] = df

    # tag each dataframe with its source so rows stay identifiable after merging
    for k in dfs.keys():
        dfs[k]["header"] = k
    return dfs
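
# Example usage of get_dfs (hypothetical call, assuming the Takeout/ export has
# been unzipped next to this script): parse only a couple of sources and inspect them.
#
#   dfs = get_dfs(which=["chrome", "location"])
#   dfs["chrome"].head()
#   dfs["location"][["time", "lat", "long", "activity"]].head()
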
def merge_and_slim(df1, df2):
    """
    Return a new df which is
    - df1 and df2 outer-merged on "time" (the union of the two, so there will be NaNs)
    - and then df1's NaN values get filled with df2's values, and df2's
      columns which were already in df1 are dropped
    """
    print("Shapes before:", df1.shape, df2.shape)
    df = df1.merge(df2, on=["time"], how="outer", suffixes=["","_other"])
    kother = [k for k in df.keys() if "_other" in k]
    for k in kother:
        df[k.replace("_other","")].fillna(df[k], inplace=True)
    df.drop(columns=kother, inplace=True)
    print("Shape after:", df.shape)
    return df
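
# A small illustration of merge_and_slim on toy data (not from Takeout): rows are
# unioned on "time", and where both frames carry a column, df2's copy only fills
# the gaps left by df1.
#
#   a = pd.DataFrame({"time": pd.to_datetime(["2018-01-01", "2018-01-02"]), "title": ["x", None]})
#   b = pd.DataFrame({"time": pd.to_datetime(["2018-01-02", "2018-01-03"]), "title": ["y", "z"]})
#   merge_and_slim(a, b)  # -> 3 rows; the 2018-01-02 title comes from b ("y")
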
def make_and_save_dump(fname="dump.h5"):
    dfs = list(get_dfs().values())
    df = dfs[0]
    for temp in dfs[1:]:
        df = merge_and_slim(df, temp)
    print(df.describe())
    print(">>> Saving to {}".format(fname))
    store = pd.HDFStore(fname)
    store["df"] = df
    print(">>> Saved to {}".format(fname))

def load_dump(fname):
    return pd.HDFStore(fname)["df"]

if __name__ == "__main__":
    # make_and_save_dump("dump.h5")
    df = load_dump("dump.h5")
    print(df.header.value_counts())
    print(df[df.header == "location"].describe())