Created
April 8, 2016 23:00
-
-
Save mdtareque/9c43fd98a499bcdc7c5242cf5c540083 to your computer and use it in GitHub Desktop.
Kaggle input data formatter for Home Depot Product Search Relevance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Flatten a Kaggle Home Depot attributes file to a single line per key.

usage: python attribute_flattener.py [include-key=yes] [override filename]

Generates a ``<file_name>.out`` file in which all attribute rows sharing the
same first column (the key) are merged into one ``key,text`` line.

Note: keep attributes.csv in the current folder and run
    python attribute_flattener.py
"""
import sys
from collections import OrderedDict

DEFAULT_FILE_NAME = 'attributes.csv'

# The usage text historically advertised "include-keys=yes" while the code
# only checked "include-key=yes"; both spellings are accepted.
_INCLUDE_KEY_FLAGS = ('include-key=yes', 'include-keys=yes')


def parse_args(argv):
    """Return ``(include_key, file_name)`` parsed from an argv-style list.

    ``argv[1]`` enables attribute names in the output when it equals one of
    the accepted flag spellings; any other value leaves them out.
    ``argv[2]``, when present, overrides the default input file name.
    """
    include_key = len(argv) > 1 and argv[1] in _INCLUDE_KEY_FLAGS
    file_name = argv[2] if len(argv) > 2 else DEFAULT_FILE_NAME
    return include_key, file_name


def flatten_lines(lines, include_key=False):
    """Merge comma-separated rows that share the same first field.

    Each row is split on every comma after single and double quotes have been
    replaced by spaces, so commas inside a value also end up as spaces in the
    flattened text.  Rows equal to ``",,"`` and blank rows are skipped.

    lines       -- iterable of rows without trailing newlines.
    include_key -- when true, keep the second field (the attribute name) in
                   the output; otherwise only the fields after it are kept.

    Returns an ``OrderedDict`` mapping each key, in first-seen order, to its
    space-joined text.
    """
    first_field = 1 if include_key else 2
    grouped = OrderedDict()
    for line in lines:
        if line == ",," or not line.strip():
            continue
        fields = line.replace("'", " ").replace("\"", " ").split(",")
        grouped.setdefault(fields[0], []).append(" ".join(fields[first_field:]))
    # Join the per-row fragments with a space so that the end of one attribute
    # is not fused with the start of the next one.
    return OrderedDict((key, " ".join(parts)) for key, parts in grouped.items())


def main(argv=None):
    """Read the input file, flatten it and write ``<file_name>.out``."""
    if argv is None:
        argv = sys.argv
    include_key, file_name = parse_args(argv)

    with open(file_name) as source:
        lines = [line.rstrip('\r\n') for line in source]
    print('done reading file ' + file_name)

    out = flatten_lines(lines, include_key)
    print('done flattening attributes')

    out_name = file_name + '.out'
    with open(out_name, 'w') as outf:
        for key, text in out.items():
            outf.write(key + "," + text + "\n")
    print('done writing to outfile ' + out_name)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment