Skip to content

Instantly share code, notes, and snippets.

@madhurjain
Created June 27, 2013 06:49
Show Gist options
  • Save madhurjain/5874454 to your computer and use it in GitHub Desktop.
Save madhurjain/5874454 to your computer and use it in GitHub Desktop.
CSV file processing in Python
import csv, re, traceback
# Load the category table into four parallel lists, one entry per CSV row:
# the description text plus its main / 2nd / 3rd category.
with open('categories.csv', 'rb') as categoryFile:
    categoryDict = csv.DictReader(categoryFile)
    # Pull every row into memory while the file is still open.
    categoryRows = list(categoryDict)

# The lists stay index-aligned: position i in each list comes from row i.
cDesc = [entry['Description'] for entry in categoryRows]
mainCat = [entry['Main Category'] for entry in categoryRows]
secondCat = [entry['2nd Category'] for entry in categoryRows]
thirdCat = [entry['3rd Category'] for entry in categoryRows]
def titleIndexInCategories(title, descriptions=None):
    """Return the index of the first category description found in *title*.

    Each description is tested as a plain substring of ``title``, in list
    order, and the position of the first hit is returned.

    Args:
        title: Product title to search in.
        descriptions: Optional sequence of description strings to look for.
            Defaults to the module-level ``cDesc`` list.

    Returns:
        The index of the first matching description, or 0 when nothing
        matches. The value 0 is therefore ambiguous: it is also the index
        of a hit on the first description, so a caller that only tests the
        result for truthiness cannot tell the two cases apart.
    """
    if descriptions is None:
        descriptions = cDesc
    for ind, desc in enumerate(descriptions):
        # An empty description is a substring of every title and would
        # shadow all later categories; a missing (None) cell cannot be
        # searched for at all. Skip both.
        if desc and desc in title:
            return ind
    return 0
# Read output.csv, clean up each product title, map it to a category and
# write the result to processed.csv.
# NOTE(review): the indentation of this script was lost in the copy it was
# reconstructed from; the nesting below (every clean-up step applied to each
# data row) is the most plausible reading -- confirm against the original.

# Spaces / hyphens left at the start of the title once the brand is cut off.
_LEADING_SEPARATOR_RE = re.compile(r'^[\s-]+')
# A bracketed suffix such as "(Pack of 2)" at the very end of the title.
_TRAILING_BRACKETS_RE = re.compile(r'\([^)]*\)$')


def _cleanTitle(title, brand):
    """Strip the brand prefix, leading separators and a trailing bracket group.

    Args:
        title: Raw product title.
        brand: Brand name; may be empty.

    Returns:
        A ``(cleanedTitle, brandRemoved)`` tuple, where ``brandRemoved`` is
        True when ``title`` started with ``brand`` and the prefix was cut.
    """
    brandRemoved = False
    # An empty brand is a prefix of every title; without this guard the
    # slice below would chop the first character off the title.
    if brand and title.startswith(brand):
        brandRemoved = True
        # +1 also drops the one character that follows the brand
        # (presumably a separating space).
        title = title[len(brand) + 1:]
    title = _LEADING_SEPARATOR_RE.sub('', title)
    title = _TRAILING_BRACKETS_RE.sub('', title)
    return title.strip(), brandRemoved


strippedBrandCount = 0
strippedRowCount = 0
mappedCount = 0
lenMoreThan100 = 0
rowNum = 0
row = None  # keeps the error report below valid if the very first read fails

# Binary mode is what the Python 2 csv module expects for its file objects.
with open('output.csv', 'rb') as outputFile:
    with open('processed.csv', 'wb') as processedFile:
        samples = csv.reader(outputFile, delimiter=',', quotechar='"', lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
        processedWriter = csv.writer(processedFile, delimiter=',', quotechar='"', lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
        try:
            for row in samples:
                rowNum += 1
                if rowNum == 1:
                    # The header row is copied through untouched.
                    processedWriter.writerow(row)
                    continue
                # Drop blank rows and rows too short to hold columns 1 and 2.
                if len(row) < 3:
                    strippedRowCount += 1
                    continue
                # Drop rows whose column 1 (AnchorText, per the original
                # comment) or column 2 (ProductTitle) is blank.
                if not row[1].strip() or not row[2].strip():
                    strippedRowCount += 1
                    continue
                # Column 4 holds the brand name; treat a missing column as
                # "no brand" instead of failing the whole run.
                brand = row[4] if len(row) > 4 else ''
                row[2], brandRemoved = _cleanTitle(row[2], brand)
                if brandRemoved:
                    strippedBrandCount += 1
                # Mapping
                cIndex = titleIndexInCategories(row[2])
                # titleIndexInCategories returns 0 for "no match" as well as
                # for a hit on the first category, so a 0 result is confirmed
                # explicitly; otherwise the first category could never be
                # mapped.
                if cIndex or (cDesc and cDesc[0] and cDesc[0] in row[2]):
                    mappedCount += 1
                    row[11] = mainCat[cIndex]
                    row[12] = secondCat[cIndex]
                    row[13] = thirdCat[cIndex]
                if len(row[2]) > 100:
                    lenMoreThan100 += 1
                processedWriter.writerow(row)
            print("Stripped Blank Rows %s" % strippedRowCount)
            print("Stripped Brand Rows %s" % strippedBrandCount)
            print("Mapped %s" % mappedCount)
            print("Title Length More than 100 Rows %s" % lenMoreThan100)
        except Exception:
            # Best effort: report the offending row and why it failed, then
            # let the script finish instead of hiding the error entirely.
            print(row)
            print(rowNum)
            traceback.print_exc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment