Skip to content

Instantly share code, notes, and snippets.

@danielrichman
Created September 6, 2011 17:21
Show Gist options
  • Save danielrichman/1198284 to your computer and use it in GitHub Desktop.
Save danielrichman/1198284 to your computer and use it in GitHub Desktop.
ZeusWhoosh
# Copyright 2011 (C) Daniel Richman. License: GNU GPL 3; see LICENSE
import os
import sys
import time
import re
import whoosh.index
import whoosh.fields
import whoosh.query
import whoosh.qparser
import whoosh.highlight
import logging
import itertools
import flask
logger = logging.getLogger("zeuswhoosh")
def unicode_please(string, encoding="utf8", errors="ignore"):
    """Return *string* as a text (unicode) object, decoding it if needed.

    Text input is returned unchanged; anything else is decoded with
    *encoding*, using *errors* as the codec error policy (the default
    "ignore" silently drops undecodable bytes).

    Works on both Python 2 (``unicode``) and Python 3 (``str``).
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: the ``unicode`` builtin is gone

    if isinstance(string, text_type):
        return string
    return text_type(string, encoding, errors)
class ZeusWhoosh(object):
    """Whoosh full-text index over a directory of log files.

    *config* is a mapping with these keys (all read by this class):

    * ``index_dir`` -- directory holding the Whoosh index
    * ``logs_dir`` -- directory containing the log files
    * ``log_match`` -- object with a ``match(filename)`` method (e.g. a
      compiled regex) selecting which files in ``logs_dir`` are indexed
    * ``update_check_period`` -- minimum value of
      ``time.time() - index.last_modified()`` before ``check_update``
      rescans the logs directory

    Searching requires an open searcher; use the instance as a context
    manager around calls to ``search``::

        with zw:
            page = zw.search(u"some words")
    """

    def __init__(self, config):
        self._config = config
        self._create_schema()

        if not whoosh.index.exists_in(self._config["index_dir"]):
            self._create_index()
            self._update_index()
        else:
            self._open_index()
            self.check_update()

    def _create_schema(self):
        """Build the schema: unique stored filename, stored mtime, content."""
        w = whoosh.fields
        self._schema = w.Schema(filename=w.ID(unique=True, stored=True),
                                mtime=w.STORED,
                                content=w.TEXT(stored=True, chars=True,
                                               phrase=True))

    def _open_index(self):
        """Open the existing index in ``index_dir``."""
        logger.debug("Opening index")
        self._index = whoosh.index.open_dir(self._config["index_dir"])

    def check_update(self):
        """Re-scan the logs if the index has not been modified recently."""
        update_check_time = time.time() - self._index.last_modified()

        if update_check_time > self._config["update_check_period"]:
            self._update_index()
        else:
            logger.debug("Not updating index now.")

    def _create_index(self):
        """Create a new, empty index in ``index_dir``."""
        logger.debug("Creating index")
        index_dir = self._config["index_dir"]
        # The directory may already exist without containing an index
        # (e.g. an empty directory); only create it when it is missing.
        if not os.path.isdir(index_dir):
            os.mkdir(index_dir)
        self._index = whoosh.index.create_in(index_dir, self._schema)

    def _examine_index(self):
        """Return ``{filename: mtime}`` for every document in the index."""
        with self._index.searcher() as searcher:
            return dict((fields["filename"], fields["mtime"])
                        for fields in searcher.all_stored_fields())

    def _file_location(self, filename):
        """Return the path of *filename* inside ``logs_dir``."""
        return os.path.join(self._config["logs_dir"], filename)

    def _examine_directory(self):
        """Return ``{filename: mtime}`` for every matching log file."""
        files = {}
        for filename in os.listdir(self._config["logs_dir"]):
            if self._config["log_match"].match(filename):
                fpath = self._file_location(filename)
                files[filename] = os.path.getmtime(fpath)
        return files

    def _update_index(self):
        """Synchronise the index with the current contents of ``logs_dir``.

        New files are added, files that no longer exist are removed and
        files whose mtime differs from the indexed one are replaced.  If
        anything fails while staging the changes the writer is cancelled
        (so the index write lock is not left held) and the error is
        re-raised.
        """
        logger.info("Begin index update")

        indexed_files = self._examine_index()
        files = self._examine_directory()

        indexed_files_set = set(indexed_files)
        files_set = set(files)

        self._writer = self._index.writer()
        try:
            # Add new files
            for filename in files_set - indexed_files_set:
                mtime = files[filename]
                logger.debug("Adding {0} {1}".format(filename, mtime))
                self._index_file(filename, mtime)

            # Remove deleted files
            for filename in indexed_files_set - files_set:
                logger.debug("Removing {0}".format(filename))
                self._writer.delete_by_term('filename', filename)

            # Update modified files
            for filename in files_set & indexed_files_set:
                if files[filename] != indexed_files[filename]:
                    mtime = files[filename]
                    logger.debug("Updating {0} {1}".format(filename, mtime))
                    # Replace rather than add: add_document() would leave
                    # the stale copy of this file in the index.
                    self._index_file(filename, mtime, replace=True)
        except Exception:
            self._writer.cancel()
            raise
        else:
            logger.debug("Committing index update")
            self._writer.commit()
        finally:
            del self._writer

        logger.info("Finish index update")

    def _file_contents(self, filename):
        """Return the contents of log file *filename* as unicode text."""
        with open(self._file_location(filename)) as f:
            return unicode_please(f.read())

    def _index_file(self, filename, mtime, replace=False):
        """Stage *filename* in the currently open writer.

        With ``replace=True`` any document already indexed under the same
        (unique) filename is replaced instead of a duplicate being added.
        Must only be called while ``self._writer`` exists, i.e. from
        ``_update_index``.
        """
        content = self._file_contents(filename)
        filename = unicode_please(filename)
        if replace:
            store = self._writer.update_document
        else:
            store = self._writer.add_document
        store(filename=filename, mtime=mtime, content=content)

    def search(self, query_string, page=1):
        """Search the ``content`` field and return one page of results.

        *query_string* is parsed with the Whoosh query parser (wildcard
        syntax disabled); *page* is the 1-based page number and pages hold
        20 hits.  Uses ``self.searcher``, so it must be called inside a
        ``with`` block on this object.
        """
        query_string = unicode_please(query_string)

        parser = whoosh.qparser.QueryParser("content", self._schema)
        parser.remove_plugin_class(whoosh.qparser.WildcardPlugin)
        query = parser.parse(query_string)

        return self.searcher.search_page(query, page, pagelen=20,
                                         terms=True)

    def __enter__(self):
        """Open a searcher for use by ``search`` and return this object."""
        self.searcher = self._index.searcher()
        return self

    def __exit__(self, *args):
        """Close the searcher opened by ``__enter__``."""
        self.searcher.close()
class IRCHTMLFormatter(whoosh.highlight.HtmlFormatter):
    """HtmlFormatter variant that wraps every fragment in its own <div>.

    Fragments are separated by a newline instead of the base class's
    separator, and each one is enclosed in ``<div class='fragment'>``.
    """

    def __init__(self, *args, **kwargs):
        # Accept whatever the base formatter accepts, then force the
        # separator placed between fragments to a single newline.
        super(IRCHTMLFormatter, self).__init__(*args, **kwargs)
        self.between = "\n"

    def format_fragment(self, *args, **kwargs):
        """Return the base class's fragment markup wrapped in a div."""
        opening = "<div class='fragment'>"
        closing = "</div>"
        inner = super(IRCHTMLFormatter, self).format_fragment(*args,
                                                              **kwargs)
        return opening + inner + closing
class LineFragmenter(whoosh.highlight.Fragmenter):
    """Fragmenter that cuts fragments on line boundaries.

    Each matched token is shown together with up to *linesbefore* whole
    lines before and *linesafter* whole lines after the line it is on.
    Matches whose line ranges touch or overlap are merged into a single
    fragment.

    :param linesbefore: number of context lines to include before a match.
    :param linesafter: number of context lines to include after a match.
    :param charlimit: if set, matches ending beyond this character offset
        are ignored; ``None`` (the default) means no limit.
    """

    def __init__(self, linesbefore=2, linesafter=2, charlimit=None):
        self.linesbefore = linesbefore
        self.linesafter = linesafter
        self.charlimit = charlimit

    def must_retokenize(self):
        """Tell the highlighter that match positions alone are enough."""
        return False

    def _matched_only(self, tokens):
        """Yield only the tokens flagged as matching the query."""
        for t in tokens:
            if t.matched:
                yield t

    def fragment_tokens(self, text, tokens):
        """Fragment a full (retokenized) token stream.

        Fallback for when the highlighter hands over every token rather
        than just the matches: non-matching tokens are filtered out and
        the rest is delegated to ``fragment_matches``.
        """
        for frag in self.fragment_matches(text, self._matched_only(tokens)):
            yield frag

    def fragment_matches(self, text, tokens):
        """Yield ``Fragment`` objects covering the matched *tokens*.

        *tokens* must be in text order and carry ``startchar`` and
        ``endchar`` attributes.
        """
        charlimit = self.charlimit
        fragment_start = None
        fragment_end = None
        tokens_in = []

        for token in tokens:
            if charlimit and token.endchar > charlimit:
                break

            start = self._find_lines_before(text, token.startchar)
            end = self._find_lines_after(text, token.endchar)

            if fragment_start is None or start > fragment_end:
                # This match's context does not touch the fragment being
                # built: emit that one (if any) and start a new fragment.
                if fragment_start is not None:
                    yield whoosh.highlight.Fragment(text, tokens_in,
                                                    fragment_start,
                                                    fragment_end)
                tokens_in = []
                fragment_start = start

            # Either a fresh fragment or an extension of the current one.
            fragment_end = end
            tokens_in.append(token.copy())

        if fragment_start is not None:
            yield whoosh.highlight.Fragment(text, tokens_in,
                                            fragment_start, fragment_end)

    def _find_lines_before(self, text, start):
        """Return the offset where the context before *start* begins.

        That is the first character of the line ``linesbefore`` lines
        above the one containing *start*, or 0 when the text does not
        have that many preceding lines.
        """
        for _ in range(self.linesbefore + 1):
            newline = text.rfind("\n", 0, start)
            if newline == -1:
                # Ran out of lines: the context starts with the text.
                return 0
            start = newline
        return start + 1

    def _find_lines_after(self, text, end):
        """Return the offset where the context after *end* stops.

        That is the position of the newline terminating the line
        ``linesafter`` lines below the one containing *end*, or
        ``len(text)`` when the text does not have that many lines left.
        """
        # Search from ``end`` itself: a match that finishes exactly at the
        # end of its line has the terminating newline at text[end].
        newline = text.find("\n", end)
        for _ in range(self.linesafter):
            if newline == -1:
                break
            newline = text.find("\n", newline + 1)

        if newline == -1:
            return len(text)
        return newline
def parse_page_number(raw):
    """Return the 1-based page number encoded in form value *raw*.

    Missing, malformed or out-of-range (< 1) values fall back to page 1
    instead of raising.
    """
    try:
        number = int(raw)
    except (TypeError, ValueError):
        return 1
    return max(number, 1)


def format_results_summary(total, pagenum, pagecount):
    """Return the "Found N results!" line shown above the hits."""
    return u"Found {n} result{s}! This is page {p} of {pages}".format(
        n=total, p=pagenum, s=u"s" if total != 1 else u"", pages=pagecount)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    config = {
        "index_dir": "logs_index",
        "logs_dir": "zeusbot_logs",
        "log_match": re.compile("^highaltitude\\.log\\.[0-9]{8}$"),
        "update_check_period": 60
    }

    app = flask.Flask("zeuswhoosh")
    zw = ZeusWhoosh(config)

    @app.route("/", methods=["GET", "POST"])
    def search():
        """Render the search form and, if a query was posted, its results."""
        zw.check_update()

        query = flask.request.form.get("query", "")

        # Unicode templates: formatting user-supplied text into a byte
        # string fails on Python 2 as soon as the query is non-ASCII.
        response = u"""
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>Zeusbot Log Search (zeuswhoosh)</title>
<link href="http://habhub.org/simple.css" rel="stylesheet"
type="text/css" />
<style type='text/css'>
input {{ font-size: 150%; }}
input#query {{ width: 80%; border: 2px #000 solid; }}
form#search {{ text-align: center; margin-bottom: 2em; }}
div#resultscount {{ text-align: center; color: #00f;
font-size: 140%; margin-bottom: 1em; }}
form#prevpg {{ float: left; }}
form#nextpg {{ float: right; }}
form#prevpg input, form#nextpg input {{ font-size: 100%; }}
span.filename {{ font-size: 120%; color: #f00; }}
div.fragment {{ white-space: pre-wrap;
border: 2px #f88 dashed;
background-color: #fdd;
font-family: monospace;
margin-bottom: 1em; }}
strong.match {{ color: #00f; }}
</style>
</head>
<body>
<div id="title">Zeusbot Log Search (zeuswhoosh)</div>
<div id="page">
<form id="search" action="#" method="POST">
<input id="query" type="text" name="query"
value="{query}">
<input type="submit" value="Go">
</form>
""".format(query=flask.escape(query))

        if query:
            with zw:
                requested = parse_page_number(flask.request.form.get("page"))
                page = zw.search(query, requested)
                page.results.fragmenter = LineFragmenter()
                page.results.formatter = IRCHTMLFormatter()

                # Navigate relative to the page actually returned, which
                # may differ from the one requested.
                current = page.pagenum

                html = []

                if current > 1:
                    html.append(u"""<form id='prevpg' action="#" method="POST">
<input type="submit" value="<<">
<input type="hidden" name="page" value="{prev}">
<input type="hidden" name="query" value="{query}">
</form>""".format(prev=current - 1, query=flask.escape(query)))

                # "<" rather than "!=": no next page when nothing matched
                # (pagecount of 0).
                if current < page.pagecount:
                    html.append(u"""<form id='nextpg' action="#" method="POST">
<input type="submit" value=">>">
<input type="hidden" name="page" value="{next}">
<input type="hidden" name="query" value="{query}">
</form>""".format(next=current + 1, query=flask.escape(query)))

                html.append(u"<div id='resultscount'>")
                html.append(format_results_summary(page.total, page.pagenum,
                                                   page.pagecount))
                html.append(u"</div>")

                for hit in page:
                    html.append(u"<div class='file'>")
                    html.append(u"<span class='filename'>{0}</span>"
                                .format(flask.escape(hit["filename"])))
                    html.append(hit.highlights("content"))
                    html.append(u"</div>")

                response += u"".join(html)

        response += u"""
</div>
</body>
</html>
"""

        return response

    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows arbitrary code execution; never expose this server
    # beyond localhost in this mode.
    app.run(debug=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment