Skip to content

Instantly share code, notes, and snippets.

@stuartlynn
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stuartlynn/f79b4f520af600a7ad73 to your computer and use it in GitHub Desktop.
Save stuartlynn/f79b4f520af600a7ad73 to your computer and use it in GitHub Desktop.
Data extract
import pprint
import json
import csv
# Pretty-printer kept from the original script; the conversion does not use it.
pp = pprint.PrettyPrinter(depth=6)

# Kept for reference only: no header row is written to the CSV, and the CSV
# columns follow the order in which fields appear in each input record, not
# the order of this list.
headers = ["price","productId", "title", "helpfulessScore", "helpfulnessCount", "profileName", "score", "summary", "text", "time", "userId"]


def _parse_line(line):
    """Turn one ``group/key: value`` input line into a list of (key, value) pairs.

    The key is the part of the field name after the "/". The value is
    everything after the first colon with newlines removed (the space that
    follows the colon is kept). A ``review/helpfulness`` line holding ``a/b``
    yields two integer fields instead of one string field.

    Raises IndexError if the line has no ":" or the field name has no "/",
    and ValueError if a helpfulness part is not an integer.
    """
    # Split on the first colon only, so values that contain ":" themselves
    # (times, URLs, free text) are not truncated.
    keyval = line.split(":", 1)
    value = keyval[1].replace("\n", "")
    if keyval[0] == "review/helpfulness":
        parts = value.split("/")
        return [
            ("helpfuless_score", int(parts[0])),
            ("helpfuless_count", int(parts[1])),
        ]
    return [(keyval[0].split("/")[1], value)]


def _write_record(record, seq_no, json_out, csv_writer):
    """Write one record as a JSON line and as a CSV row; report progress."""
    if seq_no % 1000 == 0:
        print(seq_no)
    json_out.write(json.dumps(dict(record)) + "\n")
    csv_writer.writerow([value for _, value in record])


def convert(lines, json_out, csv_writer):
    """Convert blank-line-separated records into JSON lines and CSV rows.

    lines      -- iterable of input lines, each still ending in "\\n"
    json_out   -- object with a ``write`` method; receives one JSON object per line
    csv_writer -- ``csv.writer``-like object; receives one row per record

    A line equal to "\\n" terminates the current record. Returns the number
    of records written.
    """
    record = []
    seq_no = 0
    for line in lines:
        if line == "\n":
            _write_record(record, seq_no, json_out, csv_writer)
            seq_no += 1
            record = []
        else:
            record.extend(_parse_line(line))
    # The input may end without a trailing blank line; do not drop the
    # record that was still being collected.
    if record:
        _write_record(record, seq_no, json_out, csv_writer)
        seq_no += 1
    return seq_no


def main():
    """Read all.txt and write all.json and all.csv (tab-delimited)."""
    # Context managers guarantee the outputs are flushed and closed even if
    # a malformed line aborts the conversion.
    with open('all.txt') as in_file:
        with open("all.json", "w") as j_file:
            with open("all.csv", "w") as csv_file:
                writer = csv.writer(csv_file, delimiter="\t")
                convert(in_file, j_file, writer)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment