Skip to content

Instantly share code, notes, and snippets.

@ThinkCode
Forked from GaretJax/search.py
Created August 1, 2011 21:26
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ThinkCode/1119047 to your computer and use it in GitHub Desktop.
Save ThinkCode/1119047 to your computer and use it in GitHub Desktop.
Search for Keyword in Text File and return line number
import os, sys
import re, MySQLdb
# Expected MySQL schema
# ---------------------
#
#   CREATE TABLE keyword (
#       keywordid integer primary key auto_increment,
#       keyword   VARCHAR(255) NOT NULL
#   );
#
#   CREATE TABLE location (
#       companyid  integer,
#       pdfid      integer,
#       keywordid  integer references keyword(keywordid),
#       data       TEXT,
#       line       integer,
#       span       tinyint,
#       createdate timestamp default current_timestamp,
#       primary key (companyid, pdfid, keywordid, line)
#   );
#
# The composite primary key on `location` was chosen so that a second hit for
# the same keyword on the same line of the same document is rejected as a
# duplicate.

# Module-level connection and cursor shared by the script section below.
db = MySQLdb.connect(
    host="localhost",
    user="testuser",
    passwd="test123",
    db="TESTDB",
)
cursor = db.cursor()
# Compiled once at import time instead of being handed to the `re` module as
# pattern strings for every single word of every file.
_WORD_SEPARATOR_RE = re.compile(r'\s+')
_PUNCTUATION_RE = re.compile(r'[,.:()]')


def iterwords(fh):
    """Yield ``(line_number, word)`` pairs for every token found in *fh*.

    *fh* is any iterable of text lines (an open file, a list of strings, ...).
    Line numbers are 0-based. Each line is stripped, split on runs of
    whitespace, and every token has the characters ``, . : ( )`` removed and
    is lower-cased before being yielded.

    A blank line yields a single empty-string token, and a token made only of
    the stripped punctuation also becomes an empty string; callers can rely on
    every line contributing at least one pair.
    """
    for number, line in enumerate(fh):
        for token in _WORD_SEPARATOR_RE.split(line.strip()):
            yield number, _PUNCTUATION_RE.sub('', token).lower()
def search(fh, query):
    """Find every occurrence of the phrase *query* in the lines of *fh*.

    *fh* is any iterable of text lines; it is tokenised with ``iterwords``.
    *query* is a whitespace-separated phrase. It is lower-cased and has the
    characters ``, . : ( )`` removed, mirroring what ``iterwords`` does to the
    words of the file, so that a keyword containing such punctuation can
    still match.

    Returns a tuple of ``(line, lines_count)`` pairs, one per occurrence, in
    file order. ``line`` is the 0-based line on which the phrase starts and
    ``lines_count`` is the number of lines from the first matched word to the
    last matched word, inclusive. A query with no usable keyword returns an
    empty tuple.
    """
    keywords = []
    for token in query.lower().split():
        token = re.sub(r'[,.:()]', '', token)
        if token:
            keywords.append(token)
    if not keywords:
        return ()

    size = len(keywords)
    window = []  # the last `size` (line, word) pairs seen so far
    matches = []
    # Compare a sliding window of words against the phrase. Unlike a
    # look-ahead with next(), this never reads past the end of the word
    # stream and never skips a word, so matches at the very end of the file
    # and overlapping or adjacent occurrences are all reported.
    for line, word in iterwords(fh):
        window.append((line, word))
        if len(window) > size:
            del window[0]
        if len(window) == size and [w for _, w in window] == keywords:
            first_line = window[0][0]
            last_line = window[-1][0]
            matches.append((first_line, last_line - first_line + 1))
    return tuple(matches)
if __name__ == '__main__':
    # Usage: <script> DIRECTORY
    #
    # Scans every "<companyid>_<pdfid>.txt" file in DIRECTORY for each keyword
    # stored in the `keyword` table, prints every hit, and records it in the
    # `location` table.
    filepath = sys.argv[1]
    try:
        cursor.execute("select keywordid, keyword from keyword")
        keywordlist = cursor.fetchall()
        for htmfile in os.listdir(filepath):
            if not htmfile.endswith(".txt"):
                continue
            # File names are expected to look like "<companyid>_<pdfid>.txt".
            companyid = htmfile.split('_')[0]
            pdfid = htmfile.split('_')[1].split('.')[0]
            print("companyid : " + companyid + " ---> PDF ID " + pdfid)

            # Read the file once and release the handle immediately; the
            # same list of lines feeds every keyword search and is indexed
            # directly to fetch the text of each hit.
            with open(os.path.join(filepath, htmfile)) as fh:
                lines = fh.readlines()

            for keyword in keywordlist:
                for lineno, linecount in search(lines, keyword[1]):
                    # Slicing by line number stays correct even when two hits
                    # start on the same line or overlap a previous hit.
                    result_lines = lines[lineno:lineno + linecount]
                    print(keyword[0])
                    print("Match found on line {0} (spawning {1} lines):\n > {2}".format(
                        lineno + 1, linecount, ' > '.join(result_lines).strip()))
                    # "insert ignore" lets the composite primary key drop
                    # repeated hits for the same keyword on the same line.
                    cursor.execute("""insert ignore into location (companyid, pdfid, keywordid, data, line, span) values (%s,%s,%s,%s,%s,%s)""", (companyid, pdfid, keyword[0], result_lines[0], lineno+1, linecount))
        # Without an explicit commit the inserts are discarded on close when
        # the table uses a transactional storage engine.
        db.commit()
    finally:
        db.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment