Created
June 27, 2013 06:49
-
-
Save madhurjain/5874454 to your computer and use it in GitHub Desktop.
CSV file processing in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv, re, traceback | |
# Read category file into four parallel lists: entry i of every list comes
# from the same data row of categories.csv.
cDesc = []
mainCat = []
secondCat = []
thirdCat = []

# (CSV column header, list collecting that column), in the order they are
# appended for each row.
categoryColumns = (
    ('Description', cDesc),
    ('Main Category', mainCat),
    ('2nd Category', secondCat),
    ('3rd Category', thirdCat),
)

# The file is opened in binary mode ('rb'), the Python 2 csv convention; the
# with-block closes it even when a row lacks one of the expected columns.
with open('categories.csv', 'rb') as categoryFile:
    categoryDict = csv.DictReader(categoryFile)
    for row in categoryDict:
        for columnName, columnValues in categoryColumns:
            columnValues.append(row[columnName])
def titleIndexInCategories(title, descriptions=None, default=0):
    """Return the index of the first category description found in *title*.

    Descriptions are tested in order with a plain substring check
    (``desc in title``), so the earliest matching entry wins and an empty
    description matches every title.

    Args:
        title: Product title to search in.
        descriptions: Sequence of description strings to look for. When
            omitted (``None``) the module-level ``cDesc`` list is used.
        default: Value returned when no description occurs in ``title``.
            It defaults to ``0`` for backward compatibility, but ``0`` is
            also the index of the first description; pass a sentinel such
            as ``None`` to tell "no match" apart from "matched entry 0".

    Returns:
        The zero-based index of the first matching description, or
        ``default`` when nothing matches.
    """
    if descriptions is None:
        descriptions = cDesc
    for ind, desc in enumerate(descriptions):
        if desc in title:
            return ind
    return default
# Read Output file and write to Processed removing the brand.
#
# NOTE(review): the indentation of this script was lost; the nesting below is
# reconstructed from the statement order and the original comments -- confirm
# that the title clean-up steps are meant to run for every data row.
#
# Column usage visible here: row[1] AnchorText, row[2] ProductTitle,
# row[4] brand, row[11..13] main / 2nd / 3rd category.
strippedBrandCount = 0
strippedRowCount = 0
mappedCount = 0
lenMoreThan100 = 0
rowNum = 0
row = None  # keeps the error report below meaningful if the first read fails

# Both files are opened in binary mode, the Python 2 csv convention. The
# with-statement closes them on every exit path, including a failure to open
# the second file.
with open('output.csv', 'rb') as outputFile, open('processed.csv', 'wb') as processedFile:
    samples = csv.reader(outputFile, delimiter=',', quotechar='"',
                         lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
    processedWriter = csv.writer(processedFile, delimiter=',', quotechar='"',
                                 lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
    try:
        for row in samples:
            rowNum += 1

            # The header row is copied through unchanged.
            if rowNum == 1:
                processedWriter.writerow(row)
                continue

            # Remove Blank Rows
            if not row:
                strippedRowCount += 1
                continue

            # Remove rows with no AnchorText or ProductTitle
            if not row[1].strip() or not row[2].strip():
                strippedRowCount += 1
                continue

            # Remove Brand Name from Product Title. An empty brand is a
            # prefix of every title, so it is skipped; otherwise the title
            # would lose its first character and be miscounted.
            brand = row[4]
            if brand and row[2].startswith(brand):
                strippedBrandCount += 1
                # +1 also drops the single separator character that follows
                # the brand name.
                row[2] = row[2][len(brand) + 1:]

            # Remove hyphens and spaces in the beginning. The previous
            # pattern was a character class that removed exactly one leading
            # character and also ate a leading '(', ')' or '?'.
            row[2] = re.sub(r'^[ -]+', '', row[2])

            # Remove text in brackets at the end, tolerating trailing
            # whitespace after the closing bracket.
            row[2] = re.sub(r'\([^)]*\)\s*$', '', row[2])

            # Remove extra spaces
            row[2] = row[2].strip()

            # Mapping. titleIndexInCategories returns 0 both for "no match"
            # and for a match on the first category, so a 0 result is
            # re-checked against the first description before being treated
            # as unmapped.
            cIndex = titleIndexInCategories(row[2])
            if cIndex or (cDesc and (cDesc[0] in row[2])):
                mappedCount += 1
                row[11] = mainCat[cIndex]
                row[12] = secondCat[cIndex]
                row[13] = thirdCat[cIndex]

            if len(row[2]) > 100:
                lenMoreThan100 += 1

            processedWriter.writerow(row)

        # Summary is only printed when every row was processed.
        print("Stripped Blank Rows %s" % strippedRowCount)
        print("Stripped Brand Rows %s" % strippedBrandCount)
        print("Mapped %s" % mappedCount)
        print("Title Length More than 100 Rows %s" % lenMoreThan100)
    except Exception:
        # Best-effort run: report the row being processed and why it failed,
        # then fall through so both files are still closed. Unlike a bare
        # except, this lets KeyboardInterrupt / SystemExit propagate.
        print(row)
        print(rowNum)
        traceback.print_exc()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment