Last active
October 29, 2018 01:59
-
-
Save aminnj/a02d25547809a80e6bf5947c9c789c58 to your computer and use it in GitHub Desktop.
Parse most of Google Takeout data into a single dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import numpy as np | |
import pandas as pd | |
pd.set_option('display.width', None) | |
def get_dfs(which=None):
    """
    Read the requested Google Takeout exports into pandas dataframes.

    which: list of source keys to load; defaults to every supported source.
    Returns a dict mapping source key -> dataframe.  Every dataframe gets a
    "header" column holding its source key and a parsed "time" column.
    """
    if which is None:
        # Default kept out of the signature to avoid a mutable default argument.
        which = ["android", "assistant", "chrome", "gmail", "playstore",
                 "image", "maps", "search", "voice", "location"]
    dfs = {}

    def get_fname(name):
        # Location History is exported as one big JSON, not a MyActivity file.
        if "Location History" in name:
            return "Takeout/Location History/Location History.json"
        return "Takeout/My Activity/{}/MyActivity.json".format(name)

    def load_activity(name, drop_cols):
        # Shared pattern: read a MyActivity.json, drop unused columns,
        # and parse the "time" column into datetimes.
        df = pd.read_json(get_fname(name)).drop(columns=drop_cols)
        df.time = pd.to_datetime(df.time)
        return df

    if "android" in which:
        dfs["android"] = load_activity("Android", ["products", "titleUrl", "details", "header"])
    if "assistant" in which:
        df = pd.read_json(get_fname("Assistant")).drop(columns=["titleUrl", "header", "products", "locations"])
        df["usedHotword"] = ~(df.details.isna())
        df["gotResponse"] = ~(df.subtitles.isna())
        # audioFiles is a list when present; keep only the first entry.
        df["audioFile"] = df.audioFiles.apply(lambda x: x[0] if type(x) is list else None)
        df = df.drop(columns=["details", "subtitles", "audioFiles"])
        df.time = pd.to_datetime(df.time)
        dfs["assistant"] = df
    if "chrome" in which:
        df = pd.read_json(get_fname("Chrome")).drop(columns=["products", "header"])
        # titleUrl wraps the target in a google.com redirect; strip the wrapper.
        # Missing values come through as float NaN, hence the type check.
        df["url"] = df.titleUrl.apply(lambda x: x.split("google.com/url?q=", 1)[-1] if type(x) is not float else None)
        df = df.drop(columns=["titleUrl"])
        df.time = pd.to_datetime(df.time)
        dfs["chrome"] = df
    if "gmail" in which:
        dfs["gmail"] = load_activity("Gmail", ["products", "titleUrl", "header"])
    if "playstore" in which:
        dfs["playstore"] = load_activity("Google Play Store", ["products", "titleUrl", "header"])
    if "image" in which:
        dfs["image"] = load_activity("Image Search", ["products", "titleUrl", "header", "details"])
    if "maps" in which:
        dfs["maps"] = load_activity("Maps", ["products", "titleUrl", "header", "locations", "details", "description", "subtitles"])
    if "search" in which:
        dfs["search"] = load_activity("Search", ["products", "titleUrl", "header", "locations", "details"])
    if "voice" in which:
        df = pd.read_json(get_fname("Voice and Audio")).drop(columns=["products", "titleUrl", "header"])
        df["usedHotword"] = ~(df.details.isna())
        df["audioFile"] = df.audioFiles.apply(lambda x: x[0] if type(x) is list else None)
        df = df.drop(columns=["details", "audioFiles"])
        df.time = pd.to_datetime(df.time)
        dfs["voice"] = df
    if "location" in which:
        # Close the file handle deterministically (original leaked it).
        with open(get_fname("Location History")) as fh:
            df = pd.DataFrame(json.load(fh)["locations"])
        # Pick the highest-confidence activity guess, heavily penalizing
        # "UNKNOWN" so any known type wins whenever one exists.
        best_activity = lambda y: max(y[0]["activity"], key=lambda z: z.get("confidence", 0) - 100 * (z.get("type", "") == "UNKNOWN")).get("type", None)
        best_confidence = lambda y: max(y[0]["activity"], key=lambda z: z.get("confidence", 0) - 100 * (z.get("type", "") == "UNKNOWN")).get("confidence", 0)
        df["confidence"] = df["activity"].apply(lambda x: best_confidence(x) if type(x) is not float else 0)
        df["activity"] = df["activity"].apply(lambda x: best_activity(x) if type(x) is not float else None)
        # Coordinates are stored as integers scaled by 1e7.
        df["lat"] = df.latitudeE7 / 1.e7
        df["long"] = df.longitudeE7 / 1.e7
        df["time"] = pd.to_datetime(pd.to_numeric(df.timestampMs), unit="ms")
        df.drop(columns=["latitudeE7", "longitudeE7", "timestampMs"], inplace=True)
        dfs["location"] = df
    # Tag each frame with its source so rows can be told apart after merging.
    for k in dfs:
        dfs[k]["header"] = k
    return dfs
def merge_and_slim(df1, df2):
    """
    Return a new dataframe which is
    - df1 and df2 outer-merged on "time" (union of the two, so there will be NaNs)
    - and then df1's NaN values filled from df2's overlapping columns, after
      which df2's duplicated columns are dropped
    """
    print("Shapes before:", df1.shape, df2.shape)
    df = df1.merge(df2, on=["time"], how="outer", suffixes=["", "_other"])
    # Columns present in both frames picked up the "_other" suffix on df2's copy.
    dup_cols = [c for c in df.columns if c.endswith("_other")]
    for c in dup_cols:
        base = c[:-len("_other")]
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which is chained assignment and a no-op under pandas copy-on-write.
        df[base] = df[base].fillna(df[c])
    df.drop(columns=dup_cols, inplace=True)
    print("Shape after:", df.shape)
    return df
def make_and_save_dump(fname="dump.h5"):
    """
    Build every Takeout dataframe, fold them into one frame by repeatedly
    outer-merging on "time", and save the result to an HDF5 store under "df".
    """
    # list() is required: dict.values() is a non-indexable view on Python 3.
    dfs = list(get_dfs().values())
    df = dfs[0]
    for other in dfs[1:]:
        df = merge_and_slim(df, other)
    print(df.describe())
    print(">>> Saving to {}".format(fname))
    # Context manager flushes and closes the store (the original leaked it).
    with pd.HDFStore(fname) as store:
        store["df"] = df
    print(">>> Saved to {}".format(fname))
def load_dump(fname):
    """Load the merged dataframe previously written by make_and_save_dump."""
    # Close the store after reading instead of leaking the open handle.
    with pd.HDFStore(fname) as store:
        return store["df"]
if __name__ == "__main__": | |
# make_and_save_dump("dump.h5") | |
df = load_dump("dump.h5") | |
print df.header.value_counts() | |
print df[df.header == "location"].describe() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment