# ThinkCode/search.py

## search.py
import os, sys
import re, MySQLdb

# Module-level database connection. Because it sits at the top level it is
# opened as soon as this file is executed or imported.
# NOTE(review): the connection parameters (presumably host, user, password and
# database name -- confirm against the MySQLdb documentation) are hard-coded;
# consider moving them to configuration.
db = MySQLdb.connect("localhost","testuser","test123","TESTDB" )

#
# MySQL schema this script works against:
#
# CREATE TABLE keyword (keywordid integer primary key auto_increment, keyword VARCHAR(255) NOT NULL);
#
# CREATE TABLE location (companyid integer, pdfid integer, keywordid integer references keyword(keywordid),
# data TEXT, line integer, span tinyint, createdate timestamp default current_timestamp,
# primary key (companyid, pdfid, keywordid, line));
#
# The composite primary key was chosen to eliminate duplicate rows for the
# same keyword on the same line number.
#

# Cursor shared by every query issued in the __main__ section.
cursor = db.cursor()

def iterwords(fh):
    """Yield ``(line_number, word)`` for every word found in *fh*.

    *fh* is any iterable of text lines (an open file, a list of strings).
    Line numbers are zero-based, exactly as produced by ``enumerate``.

    Each whitespace-separated token has the characters ``, . : ( )`` removed
    and is lower-cased.  Tokens that are empty after this clean-up (blank
    lines, bare punctuation) are skipped, so callers only ever see real words.
    """
    # Compile once per call instead of going through re.sub() for every word.
    punctuation = re.compile(r'[,.:()]')
    for number, line in enumerate(fh):
        # str.split() with no argument splits on runs of whitespace and
        # returns an empty list for blank lines.
        for token in line.split():
            word = punctuation.sub('', token).lower()
            if word:
                yield number, word


def search(fh, query):
    """Find every occurrence of the phrase *query* in *fh*.

    *fh* is any iterable of text lines; *query* is one or more words
    separated by whitespace.  The query is normalised with ``iterwords`` so
    it is compared on the same terms as the text (lower-cased, with the same
    punctuation removed).

    Returns a tuple of ``(line, span)`` pairs, one per occurrence, in the
    order they appear:

    * ``line`` -- zero-based number of the line holding the first word of
      the match;
    * ``span`` -- number of lines from the first to the last matched word,
      inclusive (1 when the whole phrase sits on one line).

    An empty query yields an empty tuple.
    """
    terms = [word for _, word in iterwords([query]) if word]
    if not terms:
        return ()

    size = len(terms)
    matches = []
    # Sliding window over the last `size` (line, word) pairs.  Checking the
    # window after every word finds overlapping occurrences and never reads
    # past the end of the input.
    window = []
    for entry in iterwords(fh):
        window.append(entry)
        if len(window) > size:
            del window[0]
        if len(window) == size and [word for _, word in window] == terms:
            first_line = window[0][0]
            last_line = window[-1][0]
            matches.append((first_line, last_line - first_line + 1))

    return tuple(matches)

if __name__ == '__main__':
    # Usage: search.py DIRECTORY
    #
    # Scans every "<companyid>_<pdfid>.txt" file in DIRECTORY for each keyword
    # stored in the `keyword` table and records every hit in the `location`
    # table (first matching line, 1-based line number and line span).
    try:
        if len(sys.argv) < 2:
            sys.exit("usage: search.py <directory>")
        filepath = sys.argv[1]

        cursor.execute("select keywordid, keyword from keyword")
        keywordlist = cursor.fetchall()

        for htmfile in os.listdir(filepath):
            if not htmfile.endswith(".txt"):
                continue

            # File names are expected to look like "<companyid>_<pdfid>.txt".
            name_parts = htmfile.split('_')
            if len(name_parts) < 2:
                print("skipping " + htmfile + " (no '_' in file name)")
                continue
            companyid = name_parts[0]
            pdfid = name_parts[1].split('.')[0]
            print("companyid : " + companyid + " ---> PDF ID " + pdfid)

            # Read the file once; the same list of lines serves every keyword
            # search and the lookup of the matched text below.
            with open(os.path.join(filepath, htmfile)) as fh:
                file_lines = fh.readlines()

            for keyword in keywordlist:
                for lineno, linecount in search(file_lines, keyword[1]):
                    # Slicing by index stays correct when several matches
                    # start on the same line or overlap.
                    result_lines = file_lines[lineno:lineno + linecount]
                    print(keyword[0])
                    print("Match found on line {0} (spawning {1} lines):\n > {2}".format(
                        lineno + 1, linecount, ' > '.join(result_lines).strip()))

                    cursor.execute(
                        """insert ignore into location (companyid, pdfid, keywordid, data, line, span) values (%s,%s,%s,%s,%s,%s)""",
                        (companyid, pdfid, keyword[0], result_lines[0], lineno + 1, linecount))

            # Persist the rows for this file before moving on to the next.
            db.commit()
    finally:
        db.close()
	import os, sys
	import re, MySQLdb

	# NOTE(review): from here on the file repeats the script above, with every
	# line indented by a single tab.  This statement re-opens the same
	# hard-coded connection as the one at the top of the file.
	db = MySQLdb.connect("localhost","testuser","test123","TESTDB" )

	#
	# MySQL schema this script works against:
	#
	# CREATE TABLE keyword (keywordid integer primary key auto_increment, keyword VARCHAR(255) NOT NULL);
	#
	# CREATE TABLE location (companyid integer, pdfid integer, keywordid integer references keyword(keywordid),
	# data TEXT, line integer, span tinyint, createdate timestamp default current_timestamp,
	# primary key (companyid, pdfid, keywordid, line));
	#
	# The composite primary key was chosen to eliminate duplicate rows for the
	# same keyword on the same line number.
	#

	# Cursor shared by every query issued in the __main__ section.
	cursor = db.cursor()

	# NOTE(review): second copy of iterwords(); an identical function is
	# defined earlier in this file.  The body indentation is restored here.
	def iterwords(fh):
	    """Yield ``(line_number, word)`` for every word found in *fh*.

	    *fh* is any iterable of text lines (an open file, a list of strings).
	    Line numbers are zero-based, exactly as produced by ``enumerate``.

	    Each whitespace-separated token has the characters ``, . : ( )``
	    removed and is lower-cased.  Tokens that are empty after this clean-up
	    (blank lines, bare punctuation) are skipped, so callers only ever see
	    real words.
	    """
	    # Compile once per call instead of going through re.sub() per word.
	    punctuation = re.compile(r'[,.:()]')
	    for number, line in enumerate(fh):
	        # str.split() with no argument splits on runs of whitespace and
	        # returns an empty list for blank lines.
	        for token in line.split():
	            word = punctuation.sub('', token).lower()
	            if word:
	                yield number, word



	# NOTE(review): second copy of search(); an identical function is defined
	# earlier in this file.  The body indentation is restored here.
	def search(fh, query):
	    """Find every occurrence of the phrase *query* in *fh*.

	    *fh* is any iterable of text lines; *query* is one or more words
	    separated by whitespace.  The query is normalised with ``iterwords``
	    so it is compared on the same terms as the text (lower-cased, with the
	    same punctuation removed).

	    Returns a tuple of ``(line, span)`` pairs, one per occurrence, in the
	    order they appear:

	    * ``line`` -- zero-based number of the line holding the first word of
	      the match;
	    * ``span`` -- number of lines from the first to the last matched
	      word, inclusive (1 when the whole phrase sits on one line).

	    An empty query yields an empty tuple.
	    """
	    terms = [word for _, word in iterwords([query]) if word]
	    if not terms:
	        return ()

	    size = len(terms)
	    matches = []
	    # Sliding window over the last `size` (line, word) pairs.  Checking
	    # the window after every word finds overlapping occurrences and never
	    # reads past the end of the input.
	    window = []
	    for entry in iterwords(fh):
	        window.append(entry)
	        if len(window) > size:
	            del window[0]
	        if len(window) == size and [word for _, word in window] == terms:
	            first_line = window[0][0]
	            last_line = window[-1][0]
	            matches.append((first_line, last_line - first_line + 1))

	    return tuple(matches)

	# NOTE(review): second copy of the __main__ section; the same logic
	# appears earlier in this file.  The body indentation is restored here.
	if __name__ == '__main__':
	    # Usage: search.py DIRECTORY
	    #
	    # Scans every "<companyid>_<pdfid>.txt" file in DIRECTORY for each
	    # keyword stored in the `keyword` table and records every hit in the
	    # `location` table (first matching line, 1-based line number, span).
	    try:
	        if len(sys.argv) < 2:
	            sys.exit("usage: search.py <directory>")
	        filepath = sys.argv[1]

	        cursor.execute("select keywordid, keyword from keyword")
	        keywordlist = cursor.fetchall()

	        for htmfile in os.listdir(filepath):
	            if not htmfile.endswith(".txt"):
	                continue

	            # File names are expected to look like
	            # "<companyid>_<pdfid>.txt".
	            name_parts = htmfile.split('_')
	            if len(name_parts) < 2:
	                print("skipping " + htmfile + " (no '_' in file name)")
	                continue
	            companyid = name_parts[0]
	            pdfid = name_parts[1].split('.')[0]
	            print("companyid : " + companyid + " ---> PDF ID " + pdfid)

	            # Read the file once; the same list of lines serves every
	            # keyword search and the lookup of the matched text below.
	            with open(os.path.join(filepath, htmfile)) as fh:
	                file_lines = fh.readlines()

	            for keyword in keywordlist:
	                for lineno, linecount in search(file_lines, keyword[1]):
	                    # Slicing by index stays correct when several matches
	                    # start on the same line or overlap.
	                    result_lines = file_lines[lineno:lineno + linecount]
	                    print(keyword[0])
	                    print("Match found on line {0} (spawning {1} lines):\n > {2}".format(
	                        lineno + 1, linecount, ' > '.join(result_lines).strip()))

	                    cursor.execute(
	                        """insert ignore into location (companyid, pdfid, keywordid, data, line, span) values (%s,%s,%s,%s,%s,%s)""",
	                        (companyid, pdfid, keyword[0], result_lines[0], lineno + 1, linecount))

	            # Persist the rows for this file before moving on to the next.
	            db.commit()
	    finally:
	        db.close()