# madhurjain/process.py

## process.py
import csv, re, traceback

# Load categories.csv into four parallel lists. Entry i of each list comes
# from the same CSV row, so an index found in cDesc can be used directly to
# look up the matching main / 2nd / 3rd category.
cDesc, mainCat, secondCat, thirdCat = [], [], [], []
with open('categories.csv', 'rb') as categoryFile:
	categoryDict = csv.DictReader(categoryFile)
	# CSV column name -> list that collects that column's values.
	columnTargets = (
		('Description', cDesc),
		('Main Category', mainCat),
		('2nd Category', secondCat),
		('3rd Category', thirdCat),
	)
	for row in categoryDict:
		for columnName, target in columnTargets:
			target.append(row[columnName])

def titleIndexInCategories(title):
	"""Return the index of the first category description found in *title*.

	Scans the module-level ``cDesc`` list in order and returns the position
	of the first non-blank description that occurs as a substring of
	``title``. Returns 0 when nothing matches.

	NOTE: 0 is also the index of the first category, so a return value of 0
	means either "matched the first entry" or "no match"; callers that need
	to tell the two apart must check ``cDesc[0]`` themselves.
	"""
	for ind, desc in enumerate(cDesc):
		# An empty string is a substring of every title, so a blank (or
		# whitespace-only) description would otherwise match every product.
		if not desc.strip():
			continue
		if desc in title:
			return ind
	return 0

# Read output.csv, clean up each product title, map it to a category and
# write the result to processed.csv.

# Column positions used below.
_ANCHOR_COL = 1   # presumably AnchorText (per the original comment) -- TODO confirm
_TITLE_COL = 2    # product title; the brand prefix is removed from this column
_BRAND_COL = 4    # brand name that may prefix the product title
_MAIN_CAT_COL = 11
_SECOND_CAT_COL = 12
_THIRD_CAT_COL = 13

# A run of leading whitespace / hyphens (what is left after cutting the brand).
_LEADING_SEPARATORS = re.compile(r'^[\s\-]+')
# One parenthesised group at the very end, optionally followed by whitespace.
_TRAILING_PARENS = re.compile(r'\([^)]*\)\s*$')


def _cleanTitle(title, brand):
	"""Return ``(cleanedTitle, brandWasStripped)`` for one product title."""
	brandWasStripped = False
	# An empty brand must not count as a prefix: ''.startswith('') is always
	# true and would chop the first character off every such title.
	if brand and title.startswith(brand):
		brandWasStripped = True
		# +1 also drops the single separator character after the brand.
		title = title[len(brand) + 1:]
	title = _LEADING_SEPARATORS.sub('', title)
	title = _TRAILING_PARENS.sub('', title)
	return title.strip(), brandWasStripped


def _categoryIndexFor(title):
	"""Return the matching category index for *title*, or None if none matches."""
	cIndex = titleIndexInCategories(title)
	if cIndex:
		return cIndex
	# titleIndexInCategories answers 0 both for "first category matched" and
	# for "nothing matched", so the first entry has to be checked here.
	if cDesc and cDesc[0].strip() and cDesc[0] in title:
		return 0
	return None


# Binary mode is kept from the original; this is what the Python 2 csv
# module expects for reading and writing.
with open('output.csv', 'rb') as outputFile:
	with open('processed.csv', 'wb') as processedFile:
		samples = csv.reader(outputFile, delimiter=',', quotechar='"', lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
		processedWriter = csv.writer(processedFile, delimiter=',', quotechar='"', lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)

		strippedBrandCount = 0
		strippedRowCount = 0
		mappedCount = 0
		lenMoreThan100 = 0
		rowNum = 0
		row = None  # defined up front so the error report below can always print it
		try:
			for row in samples:
				rowNum += 1

				# The first row is the header; copy it through unchanged.
				if rowNum == 1:
					processedWriter.writerow(row)
					continue

				# Drop blank rows, rows too short to hold a title, and rows
				# whose anchor or title column is empty.
				if (len(row) <= _TITLE_COL
						or not row[_ANCHOR_COL].strip()
						or not row[_TITLE_COL].strip()):
					strippedRowCount += 1
					continue

				# Rows without a brand column are treated as having no brand.
				brand = row[_BRAND_COL] if len(row) > _BRAND_COL else ''
				row[_TITLE_COL], brandWasStripped = _cleanTitle(row[_TITLE_COL], brand)
				if brandWasStripped:
					strippedBrandCount += 1

				# Mapping
				cIndex = _categoryIndexFor(row[_TITLE_COL])
				if cIndex is not None:
					mappedCount += 1
					# Pad short rows so the category columns exist.
					missing = _THIRD_CAT_COL + 1 - len(row)
					if missing > 0:
						row.extend([''] * missing)
					row[_MAIN_CAT_COL] = mainCat[cIndex]
					row[_SECOND_CAT_COL] = secondCat[cIndex]
					row[_THIRD_CAT_COL] = thirdCat[cIndex]

				if len(row[_TITLE_COL]) > 100:
					lenMoreThan100 += 1

				processedWriter.writerow(row)

			print("Stripped Blank Rows %s" % strippedRowCount)
			print("Stripped Brand Rows %s" % strippedBrandCount)
			print("Mapped %s" % mappedCount)
			print("Title Length More than 100 Rows %s" % lenMoreThan100)
		except Exception:
			# Best effort, as before: report the offending row and carry on
			# to close the files, but also show what actually went wrong.
			print(row)
			print(rowNum)
			traceback.print_exc()