Skip to content

Instantly share code, notes, and snippets.

@darkf
Created April 24, 2012 02:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save darkf/2475601 to your computer and use it in GitHub Desktop.
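Very basic text indexer/searcher in Python. Two scripts follow: the first builds a word index for a directory tree and pickles it to index.p; the second looks a word up in that index.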
import sys, os, glob, re, pickle
# Indexer entry: exactly one argument, the directory to index, is required.
if len(sys.argv) != 2:
    print("USAGE: %s DIR" % sys.argv[0])
    sys.exit(1)

# Maps a word to the list of (file path, line number) pairs recorded for it.
INDEX = {}

# Path of every file found anywhere beneath the requested directory.
FILES = []
for dirpath, _subdirs, filenames in os.walk(sys.argv[1]):
    for filename in filenames:
        FILES.append(os.path.join(dirpath, filename))
def tokenizeLine(s):
    """Split a line of text into lower-cased words.

    The line is cut at whitespace and at the punctuation characters
    ' " . , ! ? ; : and the empty fragments produced by adjacent
    separators are discarded.

    Returns a list of the remaining fragments, lower-cased, in order.
    """
    words = []
    for fragment in re.split(r"[\s'\".,!?;:]", s):
        if not fragment:
            continue
        words.append(fragment.lower())
    return words
# Build the inverted index: every word maps to each (file, line number)
# at which it occurs, then the finished index is pickled to index.p.
for path in FILES:
    # The context manager closes each file as soon as it has been read.
    with open(path, "r") as handle:
        # The line number must come from the position of the line in the
        # file (1-based), not from the position of the word in its line.
        for lineno, line in enumerate(handle, 1):
            for word in tokenizeLine(line):
                INDEX.setdefault(word, []).append((path, lineno))

# Close the output file explicitly so the pickle is fully flushed to disk.
with open("index.p", "wb") as out:
    pickle.dump(INDEX, out)
import sys, pickle, os
# Searcher entry: exactly one argument, the word to look up, is required.
if len(sys.argv) == 2:
    # The query is lower-cased before it is looked up.
    WORD = sys.argv[1].lower()
else:
    print("USAGE: %s WORD" % sys.argv[0])
    sys.exit(1)

# Placeholder; rebound to the unpickled index further down the script.
INDEX = None
def places(l):
    """Format index hits as a per-file listing of line numbers.

    l is an iterable of (file, lineno) pairs.

    Returns a string made of one "<file>:" header line per file, each
    followed by one " line <n>" entry per distinct line number in
    ascending order. Files are listed in sorted order so the output is
    deterministic. An empty input yields an empty string.
    """
    # A set per file removes duplicate line numbers without rescanning a list.
    lines_by_file = {}
    for path, lineno in l:
        lines_by_file.setdefault(path, set()).add(lineno)

    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for path in sorted(lines_by_file):
        parts.append("%s:\n" % path)
        parts.extend(" line %d\n" % n for n in sorted(lines_by_file[path]))
    return "".join(parts)
# Load the index written by the indexer script. The context manager closes
# the file handle once the pickle has been read.
# NOTE: pickle can execute arbitrary code while loading; only load an
# index.p that you generated yourself.
with open("index.p", "rb") as index_file:
    INDEX = pickle.load(index_file)

# Report a miss with a non-zero exit status.
if WORD not in INDEX:
    print("Word '%s' not found" % WORD)
    sys.exit(1)

print("Word '%s' found in %s" % (WORD, places(INDEX[WORD])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment