Created
July 21, 2014 13:13
-
-
Save absynthe/b08ae60d567108ac663a to your computer and use it in GitHub Desktop.
Script for splitting a raw GameAnalytics merged data file into one CSV file per event category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os.path | |
import csv | |
import cjson | |
import sys | |
import gzip | |
import ntpath | |
def write_event(prefix, event_type, headers, event):
    """Append one event as a CSV row to ``<prefix>_<event_type>.csv``.

    prefix     -- leading part of the output file name
    event_type -- event category; becomes the file name suffix
    headers    -- column names; written as the first row of a new file
    event      -- decoded event dictionary handed to prepare_data()

    The file is opened in append mode, so repeated calls keep adding rows
    to the same per-category file.
    """
    row = prepare_data(headers, event)
    target = "%s_%s.csv" % (prefix, event_type)
    with open(target, 'ab') as out:
        writer = csv.writer(out, lineterminator="\n",
                            quoting=csv.QUOTE_NONNUMERIC)
        # a size of zero means nothing has been written yet, so the
        # header row has to go in first
        if os.stat(target).st_size == 0:
            writer.writerow(headers)
        writer.writerow(row)
def prepare_data(headers, event):
    """Build the CSV row for *event*, with exactly one cell per header.

    headers -- column names; a name containing "." addresses a 2nd level
               field as "<section>.<field>" (e.g. "data.user_id"), any
               other name addresses a 1st level field of the event
    event   -- decoded event dictionary

    Returns a tuple with the same length and order as *headers*. Fields
    that are missing from the event are represented by an empty string,
    so every row stays aligned with the header row.
    """
    row = []
    for header in headers:
        if "." not in header:
            # handle 1st level fields
            row.append(event[header] if header in event else "")
            continue

        # handle 2nd level fields
        section_name, field = header.split(".")[:2]
        section = event.get(section_name)
        # a missing section, a section that is not a dictionary, or a
        # missing field all collapse to an empty cell instead of raising
        if not isinstance(section, dict) or field not in section:
            row.append("")
            continue

        value = section[field]
        if field == "revenue":
            value = _format_revenue(value)
        row.append(value)
    return tuple(row)


def _format_revenue(revenue):
    """Render a revenue mapping as a single CSV cell.

    Assumes *revenue* maps a currency code to a numeric amount -- TODO
    confirm against the event schema. Each entry is rendered as
    "<currency> <amount>"; several currencies are joined with ";" in
    sorted order so the value always occupies one column and the output
    is deterministic. An empty or missing mapping yields "".
    """
    if not revenue:
        return ""
    return ";".join("%s %d" % (currency, revenue[currency])
                    for currency in sorted(revenue))
# TODO: check with collectors validation logic for missing fields
def get_csv_header(event_type):
    """Return the list of CSV column names for the given event category.

    Every known category ("quality", "design", "error", "user",
    "business") gets the shared "data.*" columns, its own "data.*"
    columns, the shared 1st level columns, its own trailing columns and
    finally the shared "user_meta.*" columns. Any other category yields
    an empty list.
    """
    leading = ["data.session_id", "data.user_id", "data.build"]
    shared = ["country_code", "arrival_ts", "game_id"]
    trailing = ["user_meta.install_ts", "user_meta.revenue"]
    device_meta = ["user_meta.platform", "user_meta.device",
                   "user_meta.os_major", "user_meta.os_minor",
                   "user_meta.sdk_version"]

    # (category, columns placed after the shared data columns,
    #  columns placed after the shared 1st level columns)
    layouts = (
        ("quality",
         ["data.value", "data.event_id", "data.area"],
         device_meta),
        ("design",
         ["data.area", "data.event_id", "data.message"],
         device_meta + ["user_meta.gender"]),
        ("error",
         ["data.severity", "data.x", "data.y", "data.z", "data.area",
          "data.message"],
         ["user_meta.gender"]),
        ("user",
         ["data.device", "data.os_major", "data.os_minor",
          "data.platform", "data.sdk_version"],
         device_meta),
        ("business",
         ["data.event_id", "data.area"],
         ["currency", "amount"]),
    )

    for category, data_columns, extra_columns in layouts:
        if event_type == category:
            # concatenation builds a fresh list for every call
            return leading + data_columns + shared + extra_columns + trailing
    return []
def main():
    """Split a gzipped file of JSON events into one CSV file per category.

    Expects the source file path as the first command line argument. The
    file is read line by line; each non-blank line is decoded as one JSON
    event and appended to "<prefix>_<category>.csv" in the current
    directory, where <prefix> is the source file name without its last
    extension and without ".json".

    Prints a usage or error message and returns when the argument is
    missing or does not name an existing file.
    """
    # NOTE: single-argument print(...) behaves the same as the Python 2
    # print statement, so the messages are unchanged.
    if len(sys.argv) < 2:
        print("Usage: ./events2csv.py <source_file>")
        return

    source_name = sys.argv[1]
    if not os.path.isfile(source_name):
        print("Supplied source file does not exists!")
        return

    print("Uncompressing file...")
    with gzip.open(source_name) as f:
        # ntpath.basename copes with both "/" and "\\" path separators
        file_prefix = os.path.splitext(
            ntpath.basename(source_name))[0].replace(".json", "")
        print("Reading events from file...")
        written = 0
        for raw_event in f:
            # blank lines carry no event and would make the decoder fail
            if not raw_event.strip():
                continue
            # Drop non-ASCII bytes. Calling .encode("ascii", "ignore")
            # directly on a byte string first performs an implicit
            # *strict* ASCII decode and raises UnicodeDecodeError on the
            # very bytes it was supposed to ignore, so decode leniently
            # and re-encode instead.
            ascii_event = raw_event.decode("ascii", "ignore").encode("ascii")
            event = cjson.decode(ascii_event)
            category = event["category"]
            write_event(file_prefix, category,
                        get_csv_header(category), event)
            written += 1
            # report progress after the write so the count is accurate
            sys.stdout.write("\r%d rows written..." % written)
            sys.stdout.flush()
    print("\nDone")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment