# V2 = use bounding box information to weight words in OCR
## add imagehash to dedup
## common functions
import logging, os, re
import pandas as pd
import collections, struct, pickle, json, re
from ast import literal_eval
from tqdm import tqdm
from io import open
from os.path import join
from multiprocessing import Pool
from math import sqrt, log
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
from scipy.sparse import vstack
import argparse
def ocrCleanup(OCRstring, minWordLen=3):
    """Normalize raw OCR text to lowercase ASCII alphanumeric words.

    Every run of characters other than ASCII letters and digits is treated
    as a word separator, and words shorter than ``minWordLen`` are dropped.

    Args:
        OCRstring: Raw OCR text; non-string values are converted with ``str``.
        minWordLen: Minimum number of characters a word needs to be kept.

    Returns:
        The surviving words, lowercased and joined by single spaces
        (an empty string when no word survives).
    """
    # The digit class is 0-9, not 1-9: a zero must not act as a separator,
    # otherwise "100" is split into "1" and then dropped as too short.
    words = re.sub(r'[^a-zA-Z0-9]+', ' ', str(OCRstring)).split()
    return ' '.join(w for w in words if len(w) >= minWordLen).lower()
def extractWordROIs(OCR, WordROIs):
    """Pair every OCR word with its bounding box.

    Args:
        OCR: OCR text whose lines are separated by the literal ``#N#`` and
            whose words are separated by single spaces.
        WordROIs: Comma-separated numbers, eight per word, read in the order
            TopLeft X/Y, TopRight X/Y, BottomRight X/Y, BottomLeft X/Y.

    Returns:
        A list with one record per word, each shaped
        ``{"Words": [{"Text": ..., "BoundingBox": {...}}]}``, which is the
        layout ``parseOcrRecord`` iterates over. An empty list is returned
        when ``WordROIs`` is blank.

    Raises:
        ValueError: If a coordinate is not a number.
        IndexError: If fewer than eight coordinates are available for a word.
    """
    # A record without any box data has nothing to weight; float('') would
    # otherwise raise on the single empty field produced by ''.split(',').
    if not WordROIs.strip():
        return []

    words = [w for ocrLine in OCR.split('#N#') for w in ocrLine.split(' ')]
    coords = list(map(float, WordROIs.split(',')))

    OCRjson = []
    for wordIdx, text in enumerate(words):
        # Each word owns eight consecutive values: four (X, Y) corners.
        roi = coords[wordIdx * 8:(wordIdx + 1) * 8]
        OCRjson.append({
            "Words": [{
                "Text": text,
                "BoundingBox": {
                    "TopLeft": {"X": roi[0], "Y": roi[1]},
                    "TopRight": {"X": roi[2], "Y": roi[3]},
                    "BottomRight": {"X": roi[4], "Y": roi[5]},
                    "BottomLeft": {"X": roi[6], "Y": roi[7]},
                },
            }],
        })
    return OCRjson
def calculateWidthHeight(w):
    """Return the (width, height) of a word's bounding quadrilateral.

    The four edge lengths are computed from the corner points. The longest
    edge is reported as the width and the shortest as the height, so the
    result does not depend on how the box is rotated.

    Args:
        w: Word record with a ``'BoundingBox'`` mapping that holds the
            corners ``TopLeft``, ``TopRight``, ``BottomRight`` and
            ``BottomLeft``, each a mapping with ``'X'`` and ``'Y'``.

    Returns:
        A ``(width, height)`` tuple of floats.
    """
    box = w['BoundingBox']

    def edgeLength(cornerA, cornerB):
        # Euclidean distance between two named corners of the box.
        dx = box[cornerA]['X'] - box[cornerB]['X']
        dy = box[cornerA]['Y'] - box[cornerB]['Y']
        return sqrt(dx ** 2 + dy ** 2)

    edges = [
        edgeLength('BottomLeft', 'BottomRight'),
        edgeLength('TopRight', 'BottomRight'),
        edgeLength('TopRight', 'TopLeft'),
        edgeLength('TopLeft', 'BottomLeft'),
    ]
    width = max(edges)
    height = min(edges)
    return width, height
def parseOcrRecord(ocrJson):
    """Flatten OCR records into one size-annotated entry per word.

    Args:
        ocrJson: Iterable of records, each with a ``'Words'`` list whose
            items carry ``'Text'`` and the box data read by
            ``calculateWidthHeight``.

    Returns:
        A list of dicts with the keys ``'text'``, ``'w'``, ``'h'`` and
        ``'area'`` (``w * h``), in the order the words were encountered.
    """
    parsed = []
    for record in ocrJson:
        for word in record['Words']:
            width, height = calculateWidthHeight(word)
            parsed.append({
                'text': word['Text'],
                'w': width,
                'h': height,
                'area': width * height,
            })
    return parsed
def getNormalizedWeights(words):
    """Assign each word a weight proportional to the square root of its height.

    Args:
        words: List of dicts with at least the keys ``'text'`` and ``'h'``.

    Returns:
        A list of ``(text, weight)`` tuples in input order. The weights sum
        to 1. When every height is zero the weights are uniform, because
        there is no size information to rank the words by.
    """
    if not words:
        return []

    roots = [sqrt(w['h']) for w in words]
    total = sum(roots)
    if total > 0:
        weights = [r / total for r in roots]
    else:
        # All boxes are degenerate (zero height): dividing by the zero total
        # would raise, so fall back to equal weights.
        weights = [1.0 / len(words)] * len(words)

    texts = [w['text'] for w in words]
    return list(zip(texts, weights))
#weightedTf = sum([ f * w for f, w in zip(wordsTf, weights) ])
import mmap
def getNumLines(file_path):
    """Count the lines in a file.

    The file is read in binary mode, so no decoding takes place. A final
    line without a trailing newline is counted, and an empty file yields 0.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The number of lines as an int.
    """
    # Read-only access is enough for counting, and the context manager
    # closes the handle. The earlier mmap approach needed write permission
    # ("r+"), never released the file or the mapping, and could not map an
    # empty file.
    with open(file_path, "rb") as fp:
        return sum(1 for _ in fp)
# Candidate values for idxSource; one entry is selected by position further
# down and is used to build the names of the tf-idf pickle files.
idxSources = ['OCR', 'ProductTitle', 'ProductTitle_and_OCR']
### READ Weights
def getWeightedTfIdfV3(words_w_weights, normMethod=None):
    """Build a tf-idf feature row from words weighted by bounding-box size.

    Uses the module-level ``count_vect`` and ``tfidf_transformer``.

    Args:
        words_w_weights: Sequence of ``(text, weight)`` pairs.
        normMethod: Optional norm name passed to
            ``sklearn.preprocessing.normalize``; a falsy value skips the
            normalization step.

    Returns:
        The result of ``tfidf_transformer.transform`` for this record,
        row-normalized when ``normMethod`` is given. With no words, the
        transform of an empty document is returned.
    """
    if not words_w_weights:
        return tfidf_transformer.transform(count_vect.transform(['']))

    # One count row per word, combined into a single weighted row.
    wordsTf = count_vect.transform([w[0] for w in words_w_weights])
    weights = [w[1] for w in words_w_weights]
    weightedTf = sum(f * w for f, w in zip(wordsTf, weights))

    # NOTE(review): the statements that computed and applied the sublinear
    # scaling were incomplete in this file. What follows is reconstructed
    # from the surviving comments ("get sublinear value of Tf", "scale
    # factor between tf and sublinear tf") using the usual 1 + log(tf)
    # form - confirm against the original implementation.
    tfs = sum(wordsTf)
    scale = tfs.astype(float)
    scale.data[:] = [(1 + log(tf)) / tf if tf > 0 else 0.0 for tf in tfs.data]
    # Element-wise product aligned by column index, so it does not rely on
    # both matrices storing their entries in the same order.
    weightedTf = weightedTf.multiply(scale).tocsr()

    textFeature = tfidf_transformer.transform(weightedTf)
    if normMethod:
        # Normalize the row computed above (the name was previously misspelt
        # as `textFeatures`, which is undefined inside this function).
        textFeature = normalize(textFeature, norm=normMethod, axis=1)
    # NOTE(review): an `except Exception` clause without a matching `try`
    # or handler body followed the return; it was dropped, so errors
    # propagate to the caller.
    return textFeature
def runPipeline(line, normMethod=None):
    """Turn one TSV record into its weighted tf-idf feature row.

    Args:
        line: A record with exactly six tab-separated fields: MurlKey,
            MD5String, ProductTitle, OCR, LineROIs, WordROIs. Only the OCR
            and WordROIs fields are used here.
        normMethod: Forwarded unchanged to ``getWeightedTfIdfV3``.

    Returns:
        Whatever ``getWeightedTfIdfV3`` returns for the record.

    Raises:
        ValueError: If the line does not split into exactly six fields.
    """
    _, _, _, OCR, _, WordROIs = line.strip('\n').split('\t')

    # Weights are computed over all OCR words before any word is filtered
    # out by the cleanup step below.
    weighted = getNormalizedWeights(parseOcrRecord(extractWordROIs(OCR, WordROIs)))

    cleaned = []
    for text, weight in weighted:
        token = ocrCleanup(text)
        if token:
            cleaned.append((token, weight))
    return getWeightedTfIdfV3(cleaned, normMethod)
def getTextFeaturesMultiprocessor(lines):
    """Run ``runPipeline`` over every line, showing a progress bar.

    Despite the name, the lines are processed one after another in the
    calling process.

    Args:
        lines: Sized sequence of TSV records accepted by ``runPipeline``.

    Returns:
        A list with one ``runPipeline`` result per input line, in order.
    """
    # Equivalent to list(map(runPipeline, lines)), plus progress reporting.
    return [runPipeline(line) for line in tqdm(lines, total=len(lines))]
def linspace(lower, upper, length):
    """Return ``length + 1`` integer points spread from ``lower`` to ``upper``.

    Each point is ``lower`` plus the matching fraction of the span,
    truncated with ``int``. Both end points are included.

    Raises:
        ZeroDivisionError: If ``length`` is 0.
    """
    span = upper - lower
    points = []
    for step in range(length + 1):
        points.append(int(lower + step * span / length))
    return points
# ---- run configuration ----
idxSource = idxSources[2]
numProcessor = 64
normMethod = 'l1'

# ---- input / output locations ----
DAT_DIR = "F:\\sechangc\\shoppingProducts\\dat\\"
#DAT_DIR = '\\\\ccpiu02\shoppingProducts\\dat\\'

# Previously fitted vectorizer/transformer pickle (read below).
TFIDF_FN = '{}_tfidf_3gram.pickle'.format(idxSource)

# OCR records with bounding boxes; the commented names are alternate inputs.
#OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2.tsv'
#TFIDF_WEIGHTED_FN = 'tfidf_3gram_weighted_norm_test.pickle'
#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2_precomputedWeights.tsv'
OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927.tsv'
#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927_precomputedWeights.tsv'

# Destination of the weighted features written by the script entry point.
TFIDF_WEIGHTED_FN = join(DAT_DIR, '{}_weighted_tfidf_3gram.pickle'.format(idxSource))

# ---- load count_vect and tfidf_transformer ----
# This runs at import time rather than under the __main__ guard, so both
# names exist as module globals for getWeightedTfIdfV3 (presumably also in
# pool worker processes that re-import this module).
# pickle.load can execute arbitrary code: only point TFIDF_FN at trusted files.
with open(TFIDF_FN, 'rb') as fp:
    tfidf = pickle.load(fp)
count_vect, tfidf_transformer = tfidf['count'], tfidf['tfidf']
if __name__ == '__main__':
    # Script entry point: read every OCR record, compute one weighted
    # tf-idf row per record in a process pool, stack the rows and pickle
    # the result together with the count vectorizer.
    print('start reading the file')
    with open(OCR_FN, encoding='utf-8') as file:
        # getNumLines only supplies the total for the progress bar.
        lines = list(tqdm(file, total=getNumLines(OCR_FN)))
    print('file read, num of lines', len(lines))

    # Do not start more workers than there are CPUs. On Windows a process
    # pool is additionally limited to 61 workers.
    numWorkers = min(numProcessor, os.cpu_count() or 1)
    if os.name == 'nt':
        numWorkers = min(numWorkers, 61)

    # NOTE(review): runPipeline is called with its default normMethod=None,
    # so the module-level normMethod setting is not applied here - confirm
    # whether that is intended.
    with Pool(processes=numWorkers) as p:
        # imap yields results in input order, so row i belongs to line i.
        textFeatures = list(tqdm(p.imap(runPipeline, lines), total=len(lines)))

    trainTfidf = vstack(textFeatures)
    print('Vocabulary size in tfidf: {}'.format(trainTfidf.get_shape()))

    # The 'tfidf' entry of the output holds the stacked feature matrix.
    tfidf = {'count': count_vect, 'tfidf': trainTfidf}
    with open(TFIDF_WEIGHTED_FN, 'wb') as fp:
        pickle.dump(tfidf, fp)
    print('output saves in ', TFIDF_WEIGHTED_FN, ' successfully.')
