Created
April 9, 2014 11:46
-
-
Save lisovskyvlad/10259439 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
# coding: utf-8 | |
import codecs | |
import re | |
from string import punctuation | |
import collections | |
import sys | |
# Pre-compiled pattern matching runs of whitespace and/or ASCII punctuation;
# used to split raw text into candidate tokens.
punctuation_regexp = re.compile(r'[\s{}]+'.format(re.escape(punctuation)))
# Pre-compiled pattern matching digit runs; digits are stripped out of tokens.
digit_regexp = re.compile(r'\d+')
def get_tokens(text):
    """Extract the unique tokens of one document.

    Splits *text* on whitespace/punctuation runs, removes digit runs from
    each piece, and keeps the non-empty results.

    Returns a tuple ``(tokens, count, sum_length)``: the set of unique
    tokens, its size, and the total character length of all unique tokens.
    """
    tokens = set()
    for piece in punctuation_regexp.split(text):
        piece = digit_regexp.sub('', piece)
        if piece:
            tokens.add(piece)
    # Aggregate once after the set is complete -- the original recomputed
    # both the sum and the count on every loop iteration (accidentally
    # quadratic in the number of tokens).
    sum_length = sum(len(t) for t in tokens)
    return tokens, len(tokens), sum_length
def get_terms(tokens):
    """Lower-case the tokens of one document into its unique terms.

    Returns a tuple ``(terms, count, sum_length)``: the set of lower-cased
    terms, its size, and the total character length of all terms.
    """
    terms = {t.lower() for t in tokens}
    # Aggregate once after the set is built -- the original recomputed the
    # sum and count on every loop iteration (accidentally quadratic).
    sum_length = sum(len(t) for t in terms)
    return terms, len(terms), sum_length
def get_reverse_index(paragraphs):
    """Build an inverted index over *paragraphs*.

    Each paragraph is one document; its unique, lower-cased terms are
    extracted via get_tokens/get_terms. The index maps each term to a
    tuple ``(document_frequency, sorted_list_of_doc_ids)``.

    The result is a ``defaultdict`` whose default value ``(0, [])`` lets
    callers (parse_query) look up unseen terms without a KeyError.
    """
    # term -> set of doc ids containing it
    postings = collections.defaultdict(set)
    for doc_id, paragraph in enumerate(paragraphs):
        tokens, _, _ = get_tokens(paragraph)
        terms, _, _ = get_terms(tokens)
        for term in terms:
            postings[term].add(doc_id)
    # Keep the defaultdict default so missing terms yield (0, []).
    index_dict = collections.defaultdict(lambda: (0, list()))
    for term, doc_ids in postings.items():
        # Sort each postings list exactly once, instead of re-sorting on
        # every insertion as the original did (O(n^2 log n) per term).
        index_dict[term] = len(doc_ids), sorted(doc_ids)
    return index_dict
def parse_query(q, rindex, splited_text, count_of_searches): | |
total = set() | |
left, right, oper= False, False, False | |
splited_query = q.split() | |
if len(splited_query) <= 1: | |
total = set(rindex[splited_query[0].lower()][1]) | |
else: | |
for qitem in q.split(): | |
if qitem in '&|': | |
oper = qitem | |
elif not left: | |
left = rindex[qitem][1] | |
# print left | |
elif not right: | |
right = rindex[qitem][1] | |
# print right | |
if oper and left and right: | |
# print 'debug' | |
# print left, oper, right | |
if oper == '&': | |
total.update(set(left) & set(right)) | |
elif oper == '|': | |
total.update(set(left) | set(right)) | |
left = total | |
right = False | |
oper = False | |
# print 'total', total | |
# print snippets | |
print 'Слова для поиска:' | |
for qitem in q.split(): | |
if qitem not in '&|': | |
print qitem | |
count_of_searches_already = 0 | |
for parag in total: | |
for qitem in q.split(): | |
if qitem in '&|': | |
continue | |
paragraph = splited_text[parag] | |
try: | |
ind = paragraph.find(' '+qitem) | |
if ind is not -1: | |
print '%d: %s' % (parag, qitem) | |
print paragraph[ind-60: ind+60] | |
count_of_searches_already += 1 | |
if count_of_searches_already >= count_of_searches: | |
return total | |
print '' | |
except: | |
continue | |
return total | |
def split_by_div_text(text):
    """Split the lines of the .shtml file into paragraph strings.

    A line whose stripped content equals the centering ``<div>`` marker
    starts a new paragraph. Everything before the first marker is
    discarded, as is anything after the last marker.
    NOTE(review): the trailing chunk is never flushed into the result --
    presumably that drops closing-page markup, but confirm it is intended.
    """
    marker = "<div align=\"center\" >"
    paragraphs = []
    current = []
    markers_seen = 0
    for line in text:
        if line.strip() == marker:
            # Flush the accumulated paragraph, but only once we are past
            # the preamble that precedes the first marker.
            if markers_seen > 0:
                paragraphs.append('\n'.join(current))
            markers_seen += 1
            current = []
        else:
            current.append(line)
    return paragraphs
def main():
    """Entry point: build the inverted index for a fixed cp1251-encoded
    .shtml file and evaluate the boolean query given on the command line."""
    # The source document is hard-coded; it is windows-1251 encoded.
    with codecs.open('text_0080.shtml', 'r', 'cp1251') as text_file:
        data = text_file.readlines()
    splited_text = split_by_div_text(data)
    print 'Процесс постороение индекса'
    index_dict = get_reverse_index(splited_text)
    # NOTE(review): the query is read from argv[2] (not argv[1]) and the
    # result limit from argv[4] (not argv[3]) -- presumably flag-style
    # arguments occupy argv[1]/argv[3]; confirm the intended CLI shape.
    q = sys.argv[2].decode(sys.stdin.encoding)  # query (Python 2 bytes -> unicode)
    count_of_searches = 0
    if len(sys.argv) > 3:
        count_of_searches = int(sys.argv[4])
    else:
        # Effectively "no limit" when the argument is absent.
        count_of_searches = 10000
    parse_query(q, index_dict, splited_text, count_of_searches)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment