Skip to content

Instantly share code, notes, and snippets.

@powersjcb
Last active September 7, 2015 19:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save powersjcb/ac53c796df35eecd760a to your computer and use it in GitHub Desktop.
Save powersjcb/ac53c796df35eecd760a to your computer and use it in GitHub Desktop.
import json
from pprint import pprint
from datetime import datetime
def load_json_data(path):
    """Return the outermost ``{...}`` span found on the first line of a file.

    Only the first line of the file is read. Everything before its first
    ``{`` and after its last ``}`` is dropped (the original author's note
    calls these "data contaminants from Crunchbase"). The remaining text is
    returned as a plain string; it is NOT parsed as JSON.

    As a diagnostic, the raw number of ``{`` and ``}`` characters in the
    trimmed text is printed, one count per line.

    Args:
        path: Path of the file to read.

    Returns:
        A dict with three keys:
            ``"start_time"``: ``str()`` of ``datetime.now()`` taken before
                the file is opened.
            ``"end_time"``: ``str()`` of ``datetime.now()`` taken after the
                text has been trimmed and counted.
            ``"data"``: the trimmed text, or ``""`` when the first line has
                no ``{`` or no ``}`` at or after that ``{``.
    """
    start_time = datetime.now()

    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as f:
        first_line = f.readline()

    start_index = first_line.find('{')
    end_index = first_line.rfind('}')
    if start_index == -1 or end_index < start_index:
        # Without this guard a missing '{' (find() == -1) would be used as a
        # negative slice start and could leak a trailing '}' into the result.
        clean_string = ''
    else:
        clean_string = first_line[start_index:end_index + 1]

    # str.count() sees every brace, including any that sit inside JSON
    # string values, so the two totals are not guaranteed to match even for
    # well-formed JSON.
    opening_count = clean_string.count('{')
    closing_count = clean_string.count('}')
    print(opening_count)
    print(closing_count)

    end_time = datetime.now()
    return {
        "start_time": str(start_time),
        "end_time": str(end_time),
        "data": clean_string,
    }
# Load the local 'odm.json' dump, then report when the load started and
# finished, followed by the first and last 50 characters of the trimmed text.
results = load_json_data('odm.json')
for key in ("start_time", "end_time"):
    print(results[key])
print(results['data'][:50])
print(results['data'][-50:])
# OUTPUT:
# 672351 <- '{' count
# 672348 <- '}' count. Why is this not the same?
# 2015-09-07 12:28:13.457119
# 2015-09-07 12:28:18.955385
# {"root":[{"crunchbase_uuid":"ed13cd36fe2b3707197b0
# sts worldwide for technologically cutting-edge"}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment