# sc268/tfidf.py

## tfidf.py
##############################################################
# V2 = use bounding box information to weight words in OCR
## add imagehash to dedup
##############################################
## common functions
##############################################
import logging, os, re
import pandas as pd
import collections, struct, pickle, json, re
from ast import literal_eval
from tqdm import tqdm
from io import open
from os.path import join
from multiprocessing import Pool
from math import sqrt, log
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
from scipy.sparse import vstack
import argparse


def ocrCleanup(OCRstring, minWordLen=3):
    """Normalize raw OCR text into lowercase alphanumeric tokens.

    Every run of characters other than ASCII letters and digits is treated
    as a separator, and tokens shorter than ``minWordLen`` are dropped.

    Args:
        OCRstring: Raw OCR text; coerced with ``str()`` so non-string
            values are accepted.
        minWordLen: Minimum token length to keep.

    Returns:
        The surviving tokens, lowercased and joined by single spaces
        (an empty string when nothing survives).
    """
    # The digit class is 0-9: the earlier 1-9 range treated "0" as a
    # separator, so a token such as "100" was split into "1" and discarded.
    tokens = re.sub(r'[^a-zA-Z0-9]+', ' ', str(OCRstring)).split()
    return ' '.join(t for t in tokens if len(t) >= minWordLen).lower()


def extractWordROIs(OCR, WordROIs):
    """Pair each OCR word with its four-corner bounding box.

    Args:
        OCR: OCR text whose lines are separated by the literal ``#N#``
            marker and whose words are split on single spaces (consecutive
            spaces therefore yield empty words that still consume a box).
        WordROIs: Comma-separated floats, eight per word, read as X/Y pairs
            in the order top-left, top-right, bottom-right, bottom-left.

    Returns:
        One ``{"Words": [{"Text": ..., "BoundingBox": {...}}]}`` dict per
        word, in reading order. Empty when there is neither OCR text nor
        ROI data.

    Raises:
        ValueError: If a coordinate is not a number, or ROI data is missing
            while OCR text is present.
        IndexError: If fewer than eight coordinates are left for a word.
    """
    words = [w for ocr_line in OCR.split('#N#') for w in ocr_line.split(' ')]

    if not WordROIs.strip():
        # float('') would raise an opaque ValueError here; a record with no
        # OCR text at all is valid input and simply has no words.
        if any(words):
            raise ValueError('OCR text is present but WordROIs is empty')
        return []

    coords = [float(value) for value in WordROIs.split(',')]
    corner_names = ("TopLeft", "TopRight", "BottomRight", "BottomLeft")

    ocr_json = []
    for word_idx, text in enumerate(words):
        # Eight consecutive numbers describe one word's quadrilateral.
        roi = coords[word_idx * 8:(word_idx + 1) * 8]
        bounding_box = {
            corner: {"X": roi[2 * k], "Y": roi[2 * k + 1]}
            for k, corner in enumerate(corner_names)
        }
        ocr_json.append({"Words": [{"Text": text, "BoundingBox": bounding_box}]})
    return ocr_json

def calculateWidthHeight(w):
    """Measure a word's bounding quadrilateral.

    Args:
        w: Word dict whose ``'BoundingBox'`` holds the corners ``'TopLeft'``,
            ``'TopRight'``, ``'BottomRight'`` and ``'BottomLeft'``, each a
            dict with ``'X'`` and ``'Y'``.

    Returns:
        ``(width, height)``: the longest and the shortest of the four edge
        lengths, respectively.
    """
    box = w['BoundingBox']

    def side(start, end):
        # Euclidean distance between two named corners.
        p, q = box[start], box[end]
        return sqrt((p['X'] - q['X']) ** 2 + (p['Y'] - q['Y']) ** 2)

    sides = (
        side('BottomLeft', 'BottomRight'),
        side('TopRight', 'BottomRight'),
        side('TopRight', 'TopLeft'),
        side('TopLeft', 'BottomLeft'),
    )
    return max(sides), min(sides)

def parseOcrRecord(ocrJson):
    """Flatten OCR records into one geometry dict per word.

    Args:
        ocrJson: Iterable of dicts, each with a ``'Words'`` list whose items
            carry ``'Text'`` and a ``'BoundingBox'``.

    Returns:
        A list of ``{'text', 'w', 'h', 'area'}`` dicts in input order, where
        ``w``/``h`` come from ``calculateWidthHeight`` and ``area`` is their
        product.
    """
    parsed = []
    for record in ocrJson:
        for word in record['Words']:
            width, height = calculateWidthHeight(word)
            parsed.append({
                'text': word['Text'],
                'w': width,
                'h': height,
                'area': width * height,
            })
    return parsed

def getNormalizedWeights(words):
    """Weight each word by the square root of its box height.

    The weights are normalized so they sum to 1 across the record.

    Args:
        words: Dicts with at least ``'text'`` and ``'h'`` keys.

    Returns:
        A list of ``(text, weight)`` tuples in input order; empty for empty
        input. When every height is zero the weights are uniform.
    """
    if not words:
        return []

    roots = [sqrt(w['h']) for w in words]
    total = sum(roots)
    if total == 0:
        # All boxes are degenerate: dividing would raise ZeroDivisionError,
        # so fall back to weighting every word equally.
        weights = [1.0 / len(words)] * len(words)
    else:
        weights = [root / total for root in roots]
    return [(w['text'], weight) for w, weight in zip(words, weights)]

import mmap
def getNumLines(file_path):
    """Count the lines in a file.

    Args:
        file_path: Path of the file to scan.

    Returns:
        The number of lines; a final line without a trailing newline still
        counts, and an empty file yields 0.
    """
    # Read-only binary mode: no write permission is needed, nothing is
    # decoded, and only b'\n' terminates a line. The context manager closes
    # the handle, which the earlier mmap-based version never did (and mmap
    # cannot map an empty file at all).
    with open(file_path, 'rb') as fp:
        return sum(1 for _ in fp)
#getNumLines(OCR_WEIGHTS_FN)

# Names of the text sources an index can be built from; one entry is picked
# further down as ``idxSource`` and used to form the pickle file names.
idxSources = ['OCR', 'ProductTitle', 'ProductTitle_and_OCR']

### READ Weights
def getWeightedTfIdfV3(words_w_weights, normMethod=None):
    """Build a weighted TF-IDF feature row for one record.

    Each text is vectorized on its own with the module-level ``count_vect``,
    scaled by its weight, and the scaled rows are summed. The stored values
    of that sum are then multiplied by ``(tf + 1) / tf`` (``tf`` being the
    unweighted summed counts) and passed through the module-level
    ``tfidf_transformer``.

    Args:
        words_w_weights: Sequence of ``(text, weight)`` pairs.
        normMethod: Optional norm name forwarded to ``normalize`` (for
            example ``'l1'``); no extra normalization when falsy.

    Returns:
        The transformed feature row, or ``None`` if any step raised (the
        exception is logged with its traceback).
    """
    try:
        if len(words_w_weights) == 0:
            # No usable words: return the features of an empty document.
            return tfidf_transformer.transform(count_vect.transform(['']))

        wordsTf = count_vect.transform([w[0] for w in words_w_weights])
        weights = [w[1] for w in words_w_weights]
        weightedTf = sum([f * w for f, w in zip(wordsTf, weights)])

        # Unweighted term counts for the whole record.
        tfs = sum(wordsTf)
        # NOTE(review): the original comment calls this the "sublinear" tf,
        # but the code computes tf + 1 and never takes a logarithm (the
        # usual sublinear form is 1 + log(tf)). Left unchanged because it
        # alters every feature value -- confirm the intended formula.
        sublinearTfs = tfs.data.astype(float)
        sublinearTfs += 1
        # Rescale the weighted counts by sublinearTfs / tfs. This assumes
        # weightedTf and tfs store the same entries in the same order.
        weightedTf.data *= sublinearTfs
        weightedTf.data /= tfs.data

        textFeature = tfidf_transformer.transform(weightedTf)
        if normMethod:
            # Was `normalize(textFeatures, ...)`: an undefined name, so any
            # normMethod raised NameError and the function returned None.
            textFeature = normalize(textFeature, norm=normMethod, axis=1)
        return textFeature
    except Exception:
        # Best-effort per record: keep the batch alive, but log the full
        # traceback instead of printing only the message.
        logging.exception('getWeightedTfIdfV3 failed for one record')
        return None

def runPipeline(line, normMethod=None):
    """Turn one TSV record into its weighted TF-IDF feature row.

    Args:
        line: Tab-separated record with exactly six fields (MurlKey,
            MD5String, ProductTitle, OCR, LineROIs, WordROIs). Only the OCR
            and WordROIs fields are used.
        normMethod: Forwarded unchanged to ``getWeightedTfIdfV3``.

    Returns:
        Whatever ``getWeightedTfIdfV3`` returns for the cleaned words.

    Raises:
        ValueError: If ``line`` does not split into exactly six fields.
    """
    _, _, _, ocr_text, _, word_rois = line.strip('\n').split('\t')
    geometry = parseOcrRecord(extractWordROIs(ocr_text, word_rois))

    cleaned_pairs = []
    for entry in getNormalizedWeights(geometry):
        cleaned = ocrCleanup(entry[0])
        # Words that clean down to an empty string are dropped entirely.
        if cleaned:
            cleaned_pairs.append((cleaned, entry[1]))
    return getWeightedTfIdfV3(cleaned_pairs, normMethod)

def getTextFeaturesMultiprocessor(lines):
    """Run ``runPipeline`` over every line, showing a progress bar.

    Despite its name, this processes the lines one after another in the
    calling process.

    Args:
        lines: Sized sequence of TSV record strings.

    Returns:
        A list with one ``runPipeline`` result per input line, in order.
    """
    return [runPipeline(entry) for entry in tqdm(lines, total=len(lines))]

def linspace(lower, upper, length):
    """Return ``length + 1`` evenly spaced integer points.

    The points run from ``lower`` to ``upper`` inclusive; each one is
    truncated with ``int()``.

    Args:
        lower: First value.
        upper: Last value.
        length: Number of intervals (must be non-zero).

    Returns:
        A list of ``length + 1`` ints.
    """
    span = upper - lower
    points = []
    for step in range(length + 1):
        points.append(int(lower + step * span / length))
    return points


###########################################################################
# load processed counter vector
###########################################################################
# Text source to index; selects the input and output pickle names below.
idxSource = idxSources[2]
# Pickle holding the pre-fitted vectorizer and transformer. The name is
# relative, so it is resolved against DAT_DIR after the chdir below.
TFIDF_FN = '{}_tfidf_3gram.pickle'.format(idxSource)

# Number of worker processes for the Pool in the __main__ section.
numProcessor = 64
# NOTE(review): this value is not passed to runPipeline anywhere in the
# visible code (the Pool call uses runPipeline's default normMethod=None);
# confirm whether 'l1' normalization was meant to be applied.
normMethod = 'l1'
DAT_DIR = "F:\\sechangc\\shoppingProducts\\dat\\"
#DAT_DIR = '\\\\ccpiu02\shoppingProducts\\dat\\'
# Import-time side effect: every relative path after this line resolves
# inside DAT_DIR.
os.chdir(DAT_DIR)
# TEST SMALL DATASET
#OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2.tsv'
#TFIDF_WEIGHTED_FN = 'tfidf_3gram_weighted_norm_test.pickle'
#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2_precomputedWeights.tsv'

# LARGE DATASET
# Input TSV of records, read line by line in the __main__ section.
OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927.tsv'
# Output pickle written at the end of the __main__ section.
TFIDF_WEIGHTED_FN = join(DAT_DIR, '{}_weighted_tfidf_3gram.pickle'.format(idxSource))

#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927_precomputedWeights.tsv'

# load count_vect
# This runs at import time, so worker processes that re-import the module
# load the pickle as well.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(TFIDF_FN, 'rb') as fp:
    tfidf = pickle.load(fp)
# The pickle is a dict with the vectorizer under 'count' and the transformer
# under 'tfidf'; both are used as globals by getWeightedTfIdfV3.
count_vect = tfidf['count']
tfidf_transformer = tfidf['tfidf']


if __name__ == '__main__':
    # Read every record of the OCR TSV into memory; the progress bar total
    # comes from a preliminary line count of the same file.
    print('start reading the file')
    with open(OCR_FN, encoding='utf-8') as file:
        lines = list(tqdm(file, total=getNumLines(OCR_FN)))
    print('file read, num of lines', len(lines))

    # A leftover pdb.set_trace() breakpoint sat here and stopped every run
    # until someone typed "c" at the debugger prompt; it has been removed.

    # Compute one feature row per record in parallel; imap keeps the
    # results in input order.
    with Pool(processes=numProcessor) as p:
        textFeatures = list(tqdm(p.imap(runPipeline, lines), total=len(lines)))

    # getWeightedTfIdfV3 returns None when a record fails, so surface the
    # failure count before stacking instead of letting it pass unnoticed.
    # NOTE(review): decide whether failed records should abort the run --
    # a None entry does not contribute a proper feature row to vstack.
    numFailed = sum(1 for feature in textFeatures if feature is None)
    if numFailed:
        print('WARNING: {} of {} records produced no features'.format(
            numFailed, len(textFeatures)))

    trainTfidf = vstack(textFeatures)
    print(trainTfidf.shape)

    print('Vocabulary size in tfidf: {}'.format(trainTfidf.get_shape()))
    # Persist the vectorizer together with the stacked feature matrix.
    tfidf = {'count': count_vect, 'tfidf': trainTfidf}
    with open(TFIDF_WEIGHTED_FN, 'wb') as fp:
        pickle.dump(tfidf, fp)
    print('output saves in ', TFIDF_WEIGHTED_FN, ' successfully.' )
	##############################################################
	# V2 = use bounding box information to weight words in OCR
	## add imagehash to dedup
	##############################################
	## common functions
	##############################################
	import logging, os, re
	import pandas as pd
	import collections, struct, pickle, json, re
	from ast import literal_eval
	from tqdm import tqdm
	from io import open
	from os.path import join
	from multiprocessing import Pool
	from math import sqrt, log
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	from sklearn.preprocessing import normalize
	from scipy.sparse import vstack
	import argparse


	def ocrCleanup(OCRstring, minWordLen=3):
		"""Normalize raw OCR text into lowercase alphanumeric tokens.

		Every run of characters other than ASCII letters and digits is
		treated as a separator, and tokens shorter than ``minWordLen`` are
		dropped.

		Args:
			OCRstring: Raw OCR text; coerced with ``str()`` so non-string
				values are accepted.
			minWordLen: Minimum token length to keep.

		Returns:
			The surviving tokens, lowercased and joined by single spaces
			(an empty string when nothing survives).
		"""
		# Body indentation restored (this copy had lost it). The digit class
		# is 0-9: the earlier 1-9 range treated "0" as a separator.
		tokens = re.sub(r'[^a-zA-Z0-9]+', ' ', str(OCRstring)).split()
		return ' '.join(t for t in tokens if len(t) >= minWordLen).lower()


	def extractWordROIs(OCR, WordROIs):
		"""Pair each OCR word with its four-corner bounding box.

		Args:
			OCR: OCR text whose lines are separated by the literal ``#N#``
				marker and whose words are split on single spaces.
			WordROIs: Comma-separated floats, eight per word, read as X/Y
				pairs in the order top-left, top-right, bottom-right,
				bottom-left.

		Returns:
			One ``{"Words": [{"Text": ..., "BoundingBox": {...}}]}`` dict
			per word, in reading order. Empty when there is neither OCR text
			nor ROI data.

		Raises:
			ValueError: If a coordinate is not a number, or ROI data is
				missing while OCR text is present.
			IndexError: If fewer than eight coordinates are left for a word.
		"""
		words = [w for ocr_line in OCR.split('#N#') for w in ocr_line.split(' ')]

		if not WordROIs.strip():
			# float('') would raise an opaque ValueError; a record with no
			# OCR text at all simply has no words.
			if any(words):
				raise ValueError('OCR text is present but WordROIs is empty')
			return []

		coords = [float(value) for value in WordROIs.split(',')]
		corner_names = ("TopLeft", "TopRight", "BottomRight", "BottomLeft")

		ocr_json = []
		for word_idx, text in enumerate(words):
			# Restores the lost multiplications (`wordIdx8` was `wordIdx*8`):
			# eight consecutive numbers describe one word's quadrilateral.
			roi = coords[word_idx * 8:(word_idx + 1) * 8]
			bounding_box = {
				corner: {"X": roi[2 * k], "Y": roi[2 * k + 1]}
				for k, corner in enumerate(corner_names)
			}
			ocr_json.append({"Words": [{"Text": text, "BoundingBox": bounding_box}]})
		return ocr_json

	def calculateWidthHeight(w):
		"""Measure a word's bounding quadrilateral.

		Args:
			w: Word dict whose ``'BoundingBox'`` holds the corners
				``'TopLeft'``, ``'TopRight'``, ``'BottomRight'`` and
				``'BottomLeft'``, each a dict with ``'X'`` and ``'Y'``.

		Returns:
			``(width, height)``: the longest and the shortest of the four
			edge lengths, respectively.
		"""
		box = w['BoundingBox']

		def side(start, end):
			# Euclidean distance between two named corners; restores the
			# `** 2` exponents that this copy had lost.
			p, q = box[start], box[end]
			return sqrt((p['X'] - q['X']) ** 2 + (p['Y'] - q['Y']) ** 2)

		sides = (
			side('BottomLeft', 'BottomRight'),
			side('TopRight', 'BottomRight'),
			side('TopRight', 'TopLeft'),
			side('TopLeft', 'BottomLeft'),
		)
		return max(sides), min(sides)

	def parseOcrRecord(ocrJson):
		"""Flatten OCR records into one geometry dict per word.

		Args:
			ocrJson: Iterable of dicts, each with a ``'Words'`` list whose
				items carry ``'Text'`` and a ``'BoundingBox'``.

		Returns:
			A list of ``{'text', 'w', 'h', 'area'}`` dicts in input order,
			where ``w``/``h`` come from ``calculateWidthHeight`` and
			``area`` is their product.
		"""
		# Body indentation restored; this copy had lost it.
		parsed = []
		for record in ocrJson:
			for word in record['Words']:
				width, height = calculateWidthHeight(word)
				parsed.append({
					'text': word['Text'],
					'w': width,
					'h': height,
					'area': width * height,
				})
		return parsed

	def getNormalizedWeights(words):
		"""Weight each word by the square root of its box height.

		The weights are normalized so they sum to 1 across the record.

		Args:
			words: Dicts with at least ``'text'`` and ``'h'`` keys.

		Returns:
			A list of ``(text, weight)`` tuples in input order; empty for
			empty input. When every height is zero the weights are uniform.
		"""
		if not words:
			return []

		roots = [sqrt(w['h']) for w in words]
		total = sum(roots)
		if total == 0:
			# All boxes are degenerate: dividing would raise
			# ZeroDivisionError, so weight every word equally.
			weights = [1.0 / len(words)] * len(words)
		else:
			weights = [root / total for root in roots]
		return [(w['text'], weight) for w, weight in zip(words, weights)]

	import mmap
	def getNumLines(file_path):
		"""Count the lines in a file.

		Args:
			file_path: Path of the file to scan.

		Returns:
			The number of lines; a final line without a trailing newline
			still counts, and an empty file yields 0.
		"""
		# Read-only binary mode: no write permission is needed and only
		# b'\n' terminates a line. The context manager closes the handle,
		# which the mmap-based version never did.
		with open(file_path, 'rb') as fp:
			return sum(1 for _ in fp)
	#getNumLines(OCR_WEIGHTS_FN)

	# Names of the text sources an index can be built from; one entry is
	# picked further down as ``idxSource`` to form the pickle file names.
	idxSources = ['OCR', 'ProductTitle', 'ProductTitle_and_OCR']

	### READ Weights
	def getWeightedTfIdfV3(words_w_weights, normMethod=None):
		"""Build a weighted TF-IDF feature row for one record.

		Each text is vectorized on its own with the module-level
		``count_vect``, scaled by its weight, and the scaled rows are
		summed. The stored values of that sum are then multiplied by
		``(tf + 1) / tf`` and passed through the module-level
		``tfidf_transformer``.

		Args:
			words_w_weights: Sequence of ``(text, weight)`` pairs.
			normMethod: Optional norm name forwarded to ``normalize``; no
				extra normalization when falsy.

		Returns:
			The transformed feature row, or ``None`` if any step raised
			(the exception is logged with its traceback).
		"""
		try:
			if len(words_w_weights) == 0:
				# No usable words: return the features of an empty document.
				return tfidf_transformer.transform(count_vect.transform(['']))

			wordsTf = count_vect.transform([w[0] for w in words_w_weights])
			weights = [w[1] for w in words_w_weights]
			weightedTf = sum([f * w for f, w in zip(wordsTf, weights)])

			# Unweighted term counts for the whole record.
			tfs = sum(wordsTf)
			# NOTE(review): labelled "sublinear" in the original comment,
			# but this is tf + 1 with no logarithm -- confirm the formula.
			sublinearTfs = tfs.data.astype(float)
			sublinearTfs += 1
			# Rescale the weighted counts by sublinearTfs / tfs.
			weightedTf.data *= sublinearTfs
			weightedTf.data /= tfs.data

			textFeature = tfidf_transformer.transform(weightedTf)
			if normMethod:
				# Was `normalize(textFeatures, ...)`, an undefined name.
				textFeature = normalize(textFeature, norm=normMethod, axis=1)
			return textFeature
		except Exception:
			# Best-effort per record: keep the batch alive but log the
			# full traceback instead of printing only the message.
			logging.exception('getWeightedTfIdfV3 failed for one record')
			return None

	def runPipeline(line, normMethod=None):
		"""Turn one TSV record into its weighted TF-IDF feature row.

		Args:
			line: Tab-separated record with exactly six fields (MurlKey,
				MD5String, ProductTitle, OCR, LineROIs, WordROIs). Only the
				OCR and WordROIs fields are used.
			normMethod: Forwarded unchanged to ``getWeightedTfIdfV3``.

		Returns:
			Whatever ``getWeightedTfIdfV3`` returns for the cleaned words.

		Raises:
			ValueError: If ``line`` does not split into exactly six fields.
		"""
		# Body indentation restored; this copy had lost it.
		_, _, _, ocr_text, _, word_rois = line.strip('\n').split('\t')
		geometry = parseOcrRecord(extractWordROIs(ocr_text, word_rois))

		cleaned_pairs = []
		for entry in getNormalizedWeights(geometry):
			cleaned = ocrCleanup(entry[0])
			# Words that clean down to an empty string are dropped.
			if cleaned:
				cleaned_pairs.append((cleaned, entry[1]))
		return getWeightedTfIdfV3(cleaned_pairs, normMethod)

	def getTextFeaturesMultiprocessor(lines):
		"""Run ``runPipeline`` over every line, showing a progress bar.

		Despite its name, this processes the lines one after another in the
		calling process.

		Args:
			lines: Sized sequence of TSV record strings.

		Returns:
			A list with one ``runPipeline`` result per input line, in order.
		"""
		# Body indentation restored; this copy had lost it.
		return [runPipeline(entry) for entry in tqdm(lines, total=len(lines))]

	def linspace(lower, upper, length):
		"""Return ``length + 1`` evenly spaced integer points.

		The points run from ``lower`` to ``upper`` inclusive; each one is
		truncated with ``int()``.

		Args:
			lower: First value.
			upper: Last value.
			length: Number of intervals (must be non-zero).

		Returns:
			A list of ``length + 1`` ints.
		"""
		# Body indentation restored; this copy had lost it.
		span = upper - lower
		return [int(lower + step * span / length) for step in range(length + 1)]


	###########################################################################
	# load processed counter vector
	###########################################################################
	# Text source to index; selects the input and output pickle names below.
	idxSource = idxSources[2]
	# Pickle holding the pre-fitted vectorizer and transformer; the name is
	# relative, so it is resolved against DAT_DIR after the chdir below.
	TFIDF_FN = '{}_tfidf_3gram.pickle'.format(idxSource)

	# Number of worker processes for the Pool in the __main__ section.
	numProcessor = 64
	# NOTE(review): not passed to runPipeline anywhere in the visible code;
	# confirm whether 'l1' normalization was meant to be applied.
	normMethod = 'l1'
	DAT_DIR = "F:\\sechangc\\shoppingProducts\\dat\\"
	#DAT_DIR = '\\\\ccpiu02\shoppingProducts\\dat\\'
	# Side effect: relative paths after this line resolve inside DAT_DIR.
	os.chdir(DAT_DIR)
	# TEST SMALL DATASET
	#OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2.tsv'
	#TFIDF_WEIGHTED_FN = 'tfidf_3gram_weighted_norm_test.pickle'
	#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2_precomputedWeights.tsv'

	# LARGE DATASET
	OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927.tsv'
	TFIDF_WEIGHTED_FN = join(DAT_DIR, '{}_weighted_tfidf_3gram.pickle'.format(idxSource))

	#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927_precomputedWeights.tsv'

	# load count_vect
	# The `with` body indentation is restored; this copy had lost it.
	# NOTE: pickle.load can execute arbitrary code; only load trusted files.
	with open(TFIDF_FN, 'rb') as fp:
		tfidf = pickle.load(fp)
	count_vect = tfidf['count']
	tfidf_transformer = tfidf['tfidf']



	if __name__ == '__main__':
		# Read every record of the OCR TSV into memory; the progress bar
		# total comes from a preliminary line count of the same file.
		print('start reading the file')
		with open(OCR_FN, encoding='utf-8') as file:
			lines = list(tqdm(file, total=getNumLines(OCR_FN)))
		print('file read, num of lines', len(lines))

		# A leftover pdb.set_trace() breakpoint sat here and stopped every
		# run at the debugger prompt; it has been removed.

		# Compute one feature row per record in parallel; imap keeps the
		# results in input order.
		with Pool(processes=numProcessor) as p:
			textFeatures = list(tqdm(p.imap(runPipeline, lines), total=len(lines)))

		# getWeightedTfIdfV3 returns None when a record fails, so surface
		# the failure count before stacking.
		numFailed = sum(1 for feature in textFeatures if feature is None)
		if numFailed:
			print('WARNING: {} of {} records produced no features'.format(
				numFailed, len(textFeatures)))

		trainTfidf = vstack(textFeatures)
		print(trainTfidf.shape)

		print('Vocabulary size in tfidf: {}'.format(trainTfidf.get_shape()))
		# Persist the vectorizer together with the stacked feature matrix.
		tfidf = {'count': count_vect, 'tfidf': trainTfidf}
		with open(TFIDF_WEIGHTED_FN, 'wb') as fp:
			pickle.dump(tfidf, fp)
		print('output saves in ', TFIDF_WEIGHTED_FN, ' successfully.' )