Skip to content

Instantly share code, notes, and snippets.

@mdtareque
Created April 8, 2016 23:00
Show Gist options
  • Save mdtareque/9c43fd98a499bcdc7c5242cf5c540083 to your computer and use it in GitHub Desktop.
Save mdtareque/9c43fd98a499bcdc7c5242cf5c540083 to your computer and use it in GitHub Desktop.
Kaggle input data formatter for Home Depot Product Search Relevance
"""Flatten an attributes CSV so that every key ends up on a single line.

Usage:
    python attribute_flattener.py [include-key=yes] [override filename]

Keep attributes.csv in the current folder (or pass another filename as the
second argument) and run the script. It generates <file_name>.out, where all
rows sharing the same first column are merged into one
"<first column>,<flattened text>" line.

With include-key=yes (the spelling include-keys=yes from the old usage text
is accepted as well) the second column of every row is kept in the flattened
text; without it only the columns from the third one onwards are kept.
"""
import sys

DEFAULT_FILE_NAME = 'attributes.csv'

# The code historically tested for "include-key=yes" while the usage text
# advertised "include-keys=yes"; accept both so either invocation works.
INCLUDE_KEY_FLAGS = ('include-key=yes', 'include-keys=yes')


def parse_args(argv):
    """Return ``(include_key, file_name)`` for an ``sys.argv``-style list.

    ``argv[1]`` (optional) switches key inclusion on when it equals one of
    ``INCLUDE_KEY_FLAGS``; ``argv[2]`` (optional) overrides the input file
    name. Anything missing falls back to ``False`` / ``DEFAULT_FILE_NAME``.
    """
    # NOTE: the original script also ran ``include_key = true`` here, which
    # raised NameError (``true`` is not a Python name). The flag alone now
    # decides whether keys are included.
    include_key = len(argv) > 1 and argv[1] in INCLUDE_KEY_FLAGS
    file_name = argv[2] if len(argv) > 2 else DEFAULT_FILE_NAME
    return include_key, file_name


def _merge(existing, fragment):
    """Concatenate two text fragments without fusing adjacent words.

    A single space is inserted only when neither side already provides
    whitespace at the seam, so fragments that are already padded (quoted
    fields, whose quotes were turned into spaces) are joined unchanged.
    """
    if (existing and fragment
            and not existing[-1].isspace()
            and not fragment[0].isspace()):
        return existing + " " + fragment
    return existing + fragment


def flatten_attributes(lines, include_key=False):
    """Merge rows that share the same first comma-separated field.

    Args:
        lines: iterable of raw text lines; trailing newlines are stripped.
        include_key: when true, keep every field after the first one;
            otherwise keep every field after the second one.

    Returns:
        dict mapping the first field of each row to the space-joined text
        of all kept fields from every row with that first field.
    """
    first_kept = 1 if include_key else 2
    flattened = {}
    for raw in lines:
        # Strip "\r" too so CRLF files still match the ",," filler check.
        line = raw.rstrip('\r\n')
        # Blank lines and ",," filler rows carry no attribute data.
        if not line or line == ",,":
            continue
        # Quotes become spaces before the naive comma split, exactly as the
        # script has always done; commas inside values therefore end up as
        # spaces in the flattened text.
        fields = line.replace("'", " ").replace("\"", " ").split(",")
        key = fields[0]
        text = " ".join(fields[first_kept:])
        if key in flattened:
            flattened[key] = _merge(flattened[key], text)
        else:
            flattened[key] = text
    return flattened


def main(argv=None):
    """Run the flattener: read the input file and write ``<file_name>.out``.

    Args:
        argv: argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    include_key, file_name = parse_args(argv)

    # Context managers guarantee both files are closed, even on errors.
    with open(file_name) as source:
        lines = source.readlines()
    print('done reading file ' + file_name)

    flattened = flatten_attributes(lines, include_key)
    print('done flattening attributes')

    out_name = file_name + '.out'
    with open(out_name, 'w') as outf:
        for key in flattened:
            outf.write(key + "," + flattened[key] + "\n")
    print('done writing to outfile ' + out_name)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment