-
-
Save DM-/2475861 to your computer and use it in GitHub Desktop.
Very basic text indexer/searcher in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys, os, glob, re, pickle | |
# Command-line handling: exactly one argument is expected, the directory to index.
if len(sys.argv) != 2:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("USAGE: %s DIR" % sys.argv[0])
    sys.exit(1)

DIR = sys.argv[1]  # root directory to scan
INDEX = {}         # word -> list of (file, number) entries, filled by the indexing loop
FILES = []         # joined paths of every file name reported by os.walk under DIR

# os.walk yields (dirpath, dirnames, filenames); only the file names are collected.
for dirpath, _dirnames, filenames in os.walk(DIR):
    FILES.extend(os.path.join(dirpath, name) for name in filenames)
# Separators: any whitespace character or one of ' " . , ! ? ; :
# Compiled once because the tokenizer is called for every line of every file.
_TOKEN_SEPARATORS = re.compile(r"[\s'\".,!?;:]")


def tokenizeLine(s):
    """Split *s* into lower-cased words.

    The string is cut at every separator character (whitespace or one of
    ``' " . , ! ? ; :``); empty fragments produced by adjacent separators
    are discarded.

    Args:
        s: The text to tokenize.

    Returns:
        A list of the non-empty fragments of *s*, each lower-cased, in
        their original order.
    """
    return [token.lower() for token in _TOKEN_SEPARATORS.split(s) if token]
# Build the inverted index: each word maps to the (file, line number) pairs
# at which it occurs.
for path in FILES:
    with open(path, "r") as f:
        # Number the LINES (starting at 1). Enumerating the words of a line
        # instead would record each word's position within its line, not the
        # line it was found on.
        for lineno, line in enumerate(f, 1):
            for word in tokenizeLine(line):
                INDEX.setdefault(word, []).append((path, lineno))

# Persist the index to index.p in the current working directory.
with open("index.p", "wb") as f:
    pickle.dump(INDEX, f)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys, pickle, os | |
# Command-line handling: exactly one argument is expected, the word to look up.
if len(sys.argv) != 2:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("USAGE: %s WORD" % sys.argv[0])
    sys.exit(1)

# The lookup key is lower-cased before it is used against the index.
WORD = sys.argv[1].lower()
INDEX = None  # placeholder; rebound to the unpickled index once index.p is read
def places(l):
    """Format index entries as a per-file listing of line numbers.

    Args:
        l: An iterable of ``(file, lineno)`` pairs.

    Returns:
        A string with one section per distinct file. Each section is the
        file name followed by ``":\\n"``, then one ``" line N\\n"`` row for
        every distinct line number of that file in ascending order. Files
        appear in dictionary iteration order (first-seen order on
        Python 3.7+). An empty input yields an empty string.
    """
    # Group line numbers by file; a set removes duplicates without the
    # quadratic "already in list" scan.
    lines_by_file = {}
    for path, lineno in l:
        lines_by_file.setdefault(path, set()).add(lineno)

    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for path, linenos in lines_by_file.items():
        parts.append(path + ":\n")
        parts.extend(" line " + str(n) + "\n" for n in sorted(linenos))
    return "".join(parts)
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load an index.p that was produced by the companion indexer script.
with open("index.p", "rb") as f:
    INDEX = pickle.load(f)

# Use .get() so a word that was never indexed is reported as "not found";
# INDEX[WORD] would raise KeyError before the message could be printed.
hits = INDEX.get(WORD)
if not hits:
    print("Word '%s' not found" % WORD)
    sys.exit(1)
else:
    print("Word '%s' found in %s" % (WORD, places(hits)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment