Created
September 6, 2011 17:21
-
-
Save danielrichman/1198284 to your computer and use it in GitHub Desktop.
ZeusWhoosh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2011 (C) Daniel Richman. License: GNU GPL 3; see LICENSE | |
import os | |
import sys | |
import time | |
import re | |
import whoosh.index | |
import whoosh.fields | |
import whoosh.query | |
import whoosh.qparser | |
import whoosh.highlight | |
import logging | |
import itertools | |
import flask | |
logger = logging.getLogger("zeuswhoosh") | |
def unicode_please(string, encoding="utf8", errors="ignore"):
    """Return *string* as a text (unicode) object.

    Text objects are returned unchanged; anything else is decoded with
    *encoding*, using *errors* as the decode error handler (the default
    silently drops undecodable bytes).

    Works on both Python 2 (``unicode``) and Python 3 (``str``).

    Raises:
        TypeError: if *string* is neither text nor a bytes-like object.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already the unicode type

    if isinstance(string, text_type):
        return string
    return text_type(string, encoding, errors)
class ZeusWhoosh:
    """Whoosh full-text index over a directory of log files.

    *config* is a mapping with these keys:

    * ``index_dir`` -- directory holding the Whoosh index,
    * ``logs_dir`` -- directory that is scanned for log files,
    * ``log_match`` -- object whose ``match(filename)`` selects the files
      to index (e.g. a compiled regular expression),
    * ``update_check_period`` -- minimum age of the index, in seconds,
      before ``check_update()`` rescans the log directory.

    Searching needs an open searcher, which is managed by using the
    instance as a context manager::

        with zw:
            page = zw.search(u"some words")
    """

    def __init__(self, config):
        """Open the index (creating and filling it if it is missing)."""
        self._config = config
        self._create_schema()

        if not whoosh.index.exists_in(self._config["index_dir"]):
            self._create_index()
            self._update_index()
        else:
            self._open_index()
            self.check_update()

    def _create_schema(self):
        """Build the schema: unique stored filename, stored mtime, content."""
        w = whoosh.fields
        self._schema = w.Schema(filename=w.ID(unique=True, stored=True),
                                mtime=w.STORED,
                                content=w.TEXT(stored=True, chars=True,
                                               phrase=True))

    def _open_index(self):
        """Open the existing index in ``index_dir``."""
        logger.debug("Opening index")
        self._index = whoosh.index.open_dir(self._config["index_dir"])

    def check_update(self):
        """Update the index if it was last modified too long ago."""
        update_check_time = time.time() - self._index.last_modified()
        if update_check_time > self._config["update_check_period"]:
            self._update_index()
        else:
            logger.debug("Not updating index now.")

    def _create_index(self):
        """Create a new, empty index in ``index_dir``."""
        logger.debug("Creating index")
        index_dir = self._config["index_dir"]
        # The directory may already exist without containing an index
        # (e.g. it is empty); only create it when it is really missing.
        if not os.path.isdir(index_dir):
            os.mkdir(index_dir)
        self._index = whoosh.index.create_in(index_dir, self._schema)

    def _examine_index(self):
        """Return ``{filename: mtime}`` for every document in the index."""
        with self._index.searcher() as searcher:
            return dict((fields["filename"], fields["mtime"])
                        for fields in searcher.all_stored_fields())

    def _file_location(self, filename):
        """Return the path of *filename* inside ``logs_dir``."""
        return os.path.join(self._config["logs_dir"], filename)

    def _examine_directory(self):
        """Return ``{filename: mtime}`` for every matching file on disk."""
        files = {}
        for filename in os.listdir(self._config["logs_dir"]):
            if self._config["log_match"].match(filename):
                fpath = self._file_location(filename)
                files[filename] = os.path.getmtime(fpath)
        return files

    def _update_index(self):
        """Bring the index in line with the contents of ``logs_dir``.

        New files are added, files that disappeared are removed and files
        whose mtime differs from the indexed one are re-indexed.  If
        anything fails before the commit the writer is cancelled, so no
        partial update is written and the writer is not left open.
        """
        logger.info("Begin index update")

        indexed_files = self._examine_index()
        files = self._examine_directory()

        indexed_files_set = set(indexed_files)
        files_set = set(files)

        writer = self._writer = self._index.writer()
        try:
            try:
                # Add new files
                for filename in files_set - indexed_files_set:
                    mtime = files[filename]
                    logger.debug("Adding {0} {1}".format(filename, mtime))
                    self._index_file(filename, mtime)

                # Remove deleted files
                for filename in indexed_files_set - files_set:
                    logger.debug("Removing {0}".format(filename))
                    writer.delete_by_term('filename', filename)

                # Update modified files
                for filename in files_set & indexed_files_set:
                    if files[filename] != indexed_files[filename]:
                        mtime = files[filename]
                        logger.debug("Updating {0} {1}"
                                     .format(filename, mtime))
                        # add_document() does not replace an existing
                        # document, so drop the stale one first to avoid
                        # duplicate hits for the same file.
                        writer.delete_by_term('filename',
                                              unicode_please(filename))
                        self._index_file(filename, mtime)
            except Exception:
                writer.cancel()
                raise

            logger.debug("Committing index update")
            writer.commit()
        finally:
            del self._writer

        logger.info("Finish index update")

    def _file_contents(self, filename):
        """Return the contents of log file *filename* as text."""
        with open(self._file_location(filename)) as f:
            return unicode_please(f.read())

    def _index_file(self, filename, mtime):
        """Add *filename* to the index through the current writer.

        Only valid while ``_update_index()`` has ``self._writer`` open.
        """
        content = self._file_contents(filename)
        filename = unicode_please(filename)
        self._writer.add_document(filename=filename, mtime=mtime,
                                  content=content)

    def search(self, query_string, page=1):
        """Search the ``content`` field and return one page of results.

        Must be called inside ``with zw:`` because it uses the searcher
        opened by ``__enter__``.  Wildcard queries are disabled.  *page*
        is the 1-based page number; pages hold 20 hits.
        """
        query_string = unicode_please(query_string)

        parser = whoosh.qparser.QueryParser("content", self._schema)
        parser.remove_plugin_class(whoosh.qparser.WildcardPlugin)
        query = parser.parse(query_string)

        return self.searcher.search_page(query, page, pagelen=20,
                                         terms=True)

    def __enter__(self):
        """Open a searcher for ``search()`` and return this instance."""
        self.searcher = self._index.searcher()
        return self

    def __exit__(self, *args):
        """Close the searcher opened by ``__enter__``."""
        self.searcher.close()
class IRCHTMLFormatter(whoosh.highlight.HtmlFormatter):
    """HtmlFormatter variant that puts every fragment in its own div."""

    def __init__(self, *args, **kwargs):
        """Initialise the base formatter, then force ``between`` to a
        newline regardless of what the caller passed."""
        super(IRCHTMLFormatter, self).__init__(*args, **kwargs)
        self.between = "\n"

    def format_fragment(self, *args, **kwargs):
        """Return the base formatter's output wrapped in
        ``<div class='fragment'>``."""
        base = super(IRCHTMLFormatter, self)
        inner = base.format_fragment(*args, **kwargs)
        return "".join(("<div class='fragment'>", inner, "</div>"))
class LineFragmenter(whoosh.highlight.Fragmenter):
    """Fragmenter that cuts fragments on line boundaries.

    Each matched token is shown together with ``linesbefore`` whole lines
    before and ``linesafter`` whole lines after the line it is on.  A
    match whose context starts at or before the end of the current
    fragment is merged into it.

    ``charlimit`` is stored but not used by this class.
    """

    def __init__(self, linesbefore=2, linesafter=2, charlimit=None):
        self.linesbefore = linesbefore
        self.linesafter = linesafter
        self.charlimit = charlimit

    def must_retokenize(self):
        """Report that the text does not need to be re-tokenized."""
        return False

    def _matched_only(self, tokens):
        """Yield only the tokens flagged as matched."""
        for t in tokens:
            if t.matched:
                yield t

    def fragment_tokens(self, text, tokens):
        """Yield fragments for a full token stream.

        Only the matched tokens are relevant here, so this filters the
        stream and delegates to ``fragment_matches()``.
        """
        for frag in self.fragment_matches(text, self._matched_only(tokens)):
            yield frag

    def fragment_matches(self, text, tokens):
        """Yield one Fragment per group of matches with overlapping context.

        *tokens* are the matched tokens, expected in document order; each
        one is copied before it is stored in a fragment.
        """
        fragment_start = None
        fragment_end = None
        tokens_in = []

        for token in tokens:
            start = self._find_lines_before(text, token.startchar)
            end = self._find_lines_after(text, token.endchar)

            # This match's context begins after the current fragment ends:
            # emit the finished fragment and start a new one.
            if fragment_start is not None and start > fragment_end:
                yield whoosh.highlight.Fragment(text, tokens_in,
                                                fragment_start, fragment_end)
                fragment_start = None
                tokens_in = []

            if fragment_start is None:
                fragment_start = start

            fragment_end = end
            tokens_in.append(token.copy())

        if fragment_start is not None:
            yield whoosh.highlight.Fragment(text, tokens_in,
                                            fragment_start, fragment_end)

    def _find_lines_before(self, text, start):
        """Return the index where the context before *start* begins.

        That is the first character of the line ``linesbefore`` lines
        above the one containing *start*, or 0 when the text does not
        have that many lines before it.
        """
        pos = start
        for _ in range(self.linesbefore + 1):
            pos = text.rfind("\n", 0, pos)
            if pos == -1:
                # Ran out of lines: the context starts with the text.
                return 0
        return pos + 1

    def _find_lines_after(self, text, end):
        """Return the index where the context after *end* stops.

        That is the newline terminating the line ``linesafter`` lines
        below the one containing *end*, or ``len(text)`` when the text
        does not have that many lines after it.
        """
        # Start one short so that a newline sitting exactly at *end*
        # (match at the very end of its line) is counted, not skipped.
        pos = end - 1
        for _ in range(self.linesafter + 1):
            pos = text.find("\n", pos + 1)
            if pos == -1:
                # Ran out of lines: the context runs to the end of text.
                return len(text)
        return pos
if __name__ == "__main__":
    # Stand-alone mode: index ./zeusbot_logs into ./logs_index and serve a
    # single-page search form with Flask's built-in server.
    logging.basicConfig(level=logging.DEBUG)

    config = {
        "index_dir": "logs_index",
        "logs_dir": "zeusbot_logs",
        "log_match": re.compile("^highaltitude\\.log\\.[0-9]{8}$"),
        "update_check_period": 60
    }

    app = flask.Flask("zeuswhoosh")
    zw = ZeusWhoosh(config)

    @app.route("/", methods=["GET", "POST"])
    def search():
        """Render the search form and, if a query was posted, its results.

        Reads the form fields ``query`` (search string) and ``page``
        (1-based page number; anything missing, non-numeric or below 1 is
        treated as page 1).
        """
        zw.check_update()

        query = flask.request.form.get("query", "")

        # Doubled braces are literal braces: the template goes through
        # str.format() to insert the escaped query.
        response = """
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>Zeusbot Log Search (zeuswhoosh)</title>
<link href="http://habhub.org/simple.css" rel="stylesheet"
type="text/css" />
<style type='text/css'>
input {{ font-size: 150%; }}
input#query {{ width: 80%; border: 2px #000 solid; }}
form#search {{ text-align: center; margin-bottom: 2em; }}
div#resultscount {{ text-align: center; color: #00f;
font-size: 140%; margin-bottom: 1em; }}
form#prevpg {{ float: left; }}
form#nextpg {{ float: right; }}
form#prevpg input, form#nextpg input {{ font-size: 100%; }}
span.filename {{ font-size: 120%; color: #f00; }}
div.fragment {{ white-space: pre-wrap;
border: 2px #f88 dashed;
background-color: #fdd;
font-family: monospace;
margin-bottom: 1em; }}
strong.match {{ color: #00f; }}
</style>
</head>
<body>
<div id="title">Zeusbot Log Search (zeuswhoosh)</div>
<div id="page">
<form id="search" action="#" method="POST">
<input id="query" type="text" name="query"
value="{query}">
<input type="submit" value="Go">
</form>
""".format(query=flask.escape(query))

        if query:
            # The page number comes from the client: never trust it.
            try:
                p = max(1, int(flask.request.form.get("page", 1)))
            except ValueError:
                p = 1

            # The searcher must stay open while hits are read and
            # highlighted, so all result rendering happens inside `with`.
            with zw:
                page = zw.search(query, p)
                page.results.fragmenter = LineFragmenter()
                page.results.formatter = IRCHTMLFormatter()

                html = []

                if p != 1:
                    html.append("""<form id='prevpg' action="#" method="POST">
<input type="submit" value="<<">
<input type="hidden" name="page" value="{prev}">
<input type="hidden" name="query" value="{query}">
</form>""".format(prev=p-1, query=flask.escape(query)))

                # `<` rather than `!=`: with no results pagecount is 0 and
                # there is no next page to offer.
                if p < page.pagecount:
                    html.append("""<form id='nextpg' action="#" method="POST">
<input type="submit" value=">>">
<input type="hidden" name="page" value="{next}">
<input type="hidden" name="query" value="{query}">
</form>""".format(next=p+1, query=flask.escape(query)))

                html.append("<div id='resultscount'>")
                html.append("Found {n} result{s}! This is page {p} of {pages}"
                            .format(n=page.total, p=page.pagenum,
                                    s="s" if page.total != 1 else "",
                                    pages=page.pagecount))
                html.append("</div>")

                for hit in page:
                    html.append("<div class='file'>")
                    html.append("<span class='filename'>{0}</span>"
                                .format(flask.escape(hit["filename"])))
                    html.append(hit.highlights("content"))
                    html.append("</div>")

                response += "".join(html)

        response += """
</div>
</body>
</html>
"""

        return response

    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows arbitrary code execution -- do not expose this server
    # beyond localhost.
    app.run(debug=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment