Created
April 24, 2012 02:33
-
-
Save darkf/2475601 to your computer and use it in GitHub Desktop.
Very basic text indexer/searcher in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, os, glob, re, pickle | |
# Require exactly one command-line argument: the root directory to index.
if len(sys.argv) != 2:
    # A parenthesised single-argument print is valid on Python 2 and 3.
    print("USAGE: %s DIR" % sys.argv[0])
    sys.exit(1)

# INDEX maps each word to the list of (file, number) tuples appended by the
# indexing loop later in this script.
INDEX = {}
# FILES collects the path of every file found beneath the root directory.
FILES = []
# os.walk yields (dirpath, dirnames, filenames); only the filenames are
# needed, each joined to its directory to form a usable path.
for path, _, filenames in os.walk(sys.argv[1]):
    FILES.extend(os.path.join(path, name) for name in filenames)
# Token separators: any whitespace plus common quote/punctuation characters.
# Compiled once at import time instead of on every call.
_TOKEN_SEPARATORS = re.compile(r"[\s'\".,!?;:]")


def tokenizeLine(s):
    """Split a line of text into lower-cased word tokens.

    The line is split on whitespace and on the characters ' " . , ! ? ; :
    and the empty fragments produced by adjacent separators are dropped.
    Characters outside that set (for example parentheses or hyphens) stay
    part of the token.

    Args:
        s: The text line to tokenize.

    Returns:
        A list of lower-cased, non-empty tokens in order of appearance.
    """
    return [token.lower() for token in _TOKEN_SEPARATORS.split(s) if token]
# Build the inverted index: each token is recorded together with the file it
# came from and the 1-based number of the line it appears on.
for path in FILES:
    # The context manager closes each input file once it has been read.
    with open(path, "r") as handle:
        # Enumerate the lines (not the tokens of a line) so that the stored
        # number really is the line number within the file.
        for lineno, line in enumerate(handle, 1):
            for word in tokenizeLine(line):
                INDEX.setdefault(word, []).append((path, lineno))

# Persist the index for the search script; leaving the with-block closes the
# file so the pickle is fully written to disk.
with open("index.p", "wb") as out:
    pickle.dump(INDEX, out)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, pickle, os | |
# Require exactly one command-line argument: the word to look up.
if len(sys.argv) != 2:
    # A parenthesised single-argument print is valid on Python 2 and 3.
    print("USAGE: %s WORD" % sys.argv[0])
    sys.exit(1)

# Lower-case the query so it compares equal to the lower-cased tokens that
# the indexer stores.
WORD = sys.argv[1].lower()
# Placeholder; rebound to the unpickled index before the lookup.
INDEX = None
def places(l):
    """Format index hits as a per-file listing of line numbers.

    Args:
        l: Iterable of (file, line number) tuples.

    Returns:
        A string made of one "<file>:" header line per file, each followed
        by one " line <n>" entry per distinct line number in ascending
        order. An empty iterable yields the empty string.
    """
    # Group the line numbers by file; a set drops repeated line numbers
    # without a linear membership scan for every hit.
    by_file = {}
    for path, lineno in l:
        by_file.setdefault(path, set()).add(lineno)

    # Collect the output pieces and join once instead of growing a string.
    parts = []
    # .items() is available on both Python 2 and 3 (iteritems is 2-only).
    for path, linenums in by_file.items():
        parts.append("%s:\n" % path)
        parts.extend(" line %d\n" % lineno for lineno in sorted(linenums))
    return "".join(parts)
# NOTE: unpickling can execute arbitrary code; only load an index.p that was
# produced by the companion indexer script, never one from an untrusted source.
with open("index.p", "rb") as index_file:
    # The context manager closes the index file once it has been read.
    INDEX = pickle.load(index_file)

# Report a miss and exit with a non-zero status when the word is not indexed.
if WORD not in INDEX:
    print("Word '%s' not found" % WORD)
    sys.exit(1)

# Otherwise print every file and line on which the word was recorded.
print("Word '%s' found in %s" % (WORD, places(INDEX[WORD])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment