@lisovskyvlad
Created April 9, 2014 11:46
#! /usr/bin/python
# coding: utf-8
import codecs
import re
from string import punctuation
import collections
import sys
punctuation_regexp = re.compile(r'[\s{}]+'.format(re.escape(punctuation)))
digit_regexp = re.compile(r'\d+')
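# punctuation_regexp splits text on any run of whitespace or punctuation
# characters; digit_regexp strips digit sequences out of the resulting tokens.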

# collect the unique tokens of the current document
def get_tokens(text):
    tokens = set()
    for t in punctuation_regexp.split(text):
        t = digit_regexp.sub('', t)
        if t != '':
            tokens.add(t)
    sum_length = float(sum(len(t) for t in tokens))
    count = len(tokens)
    return tokens, count, sum_length
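
# Example (hypothetical input):
#   get_tokens(u'Hello, hello 42 world!')
#   -> (set([u'Hello', u'hello', u'world']), 3, 15.0)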

# collect the unique terms (lower-cased tokens) of the current document
def get_terms(tokens):
    terms = set()
    for t in tokens:
        terms.add(t.lower())
    sum_length = float(sum(len(t) for t in terms))
    count = len(terms)
    return terms, count, sum_length
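
# Lower-casing merges token variants into a single term, e.g. (illustrative):
#   get_terms(set([u'Hello', u'hello', u'world']))
#   -> (set([u'hello', u'world']), 2, 10.0)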

def get_reverse_index(paragraphs):
    pars_seq = set()
    count_tokens, count_terms = 0, 0
    sum_len_tokens, sum_len_terms = 0.0, 0.0
    for doc_id, paragraph in enumerate(paragraphs):
        # fetch tokens and the tokens' statistics
        tokens, token_len, sum_len_token = get_tokens(paragraph)
        count_tokens += token_len
        sum_len_tokens += sum_len_token
        # fetch terms and the terms' statistics
        terms, term_len, sum_len_term = get_terms(tokens)
        count_terms += term_len
        sum_len_terms += sum_len_term
        # collect (term, doc_id) pairs
        for term in terms:
            pars_seq.add((term, doc_id))
    index_dict = collections.defaultdict(lambda: (0, list()))
    for term, docid in pars_seq:
        freq, docids = index_dict[term]
        # document frequency and the sorted list of paragraph ids
        index_dict[term] = freq + 1, sorted(docids + [docid])
    # debug dump of the sorted index:
    # for t, (f, d) in sorted(index_dict.items()):
    #     print t, f, d
    return index_dict
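
# The index maps every term to (document frequency, sorted list of paragraph
# ids). Illustrative entry: index_dict[u'hello'] -> (2, [0, 5]) when 'hello'
# occurs in paragraphs 0 and 5.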

def parse_query(q, rindex, splited_text, count_of_searches):
    total = set()
    left, right, oper = False, False, False
    splited_query = q.split()
    if len(splited_query) <= 1:
        total = set(rindex[splited_query[0].lower()][1])
    else:
        for qitem in splited_query:
            if qitem in '&|':
                oper = qitem
            elif not left:
                left = rindex[qitem.lower()][1]  # terms are stored lower-cased
            elif not right:
                right = rindex[qitem.lower()][1]
            if oper and left and right:
                if oper == '&':
                    total.update(set(left) & set(right))
                elif oper == '|':
                    total.update(set(left) | set(right))
                left = total
                right = False
                oper = False
    # print snippets
    print 'Search words:'
    for qitem in splited_query:
        if qitem not in '&|':
            print qitem
    print
    count_of_searches_already = 0
    for parag in total:
        for qitem in splited_query:
            if qitem in '&|':
                continue
            paragraph = splited_text[parag]
            try:
                ind = paragraph.find(' ' + qitem)
                if ind != -1:  # str.find returns -1 when the word is absent
                    print '%d: %s' % (parag, qitem)
                    print paragraph[max(ind - 60, 0): ind + 60]
                    count_of_searches_already += 1
                    if count_of_searches_already >= count_of_searches:
                        return total
                    print ''
            except Exception:
                # skip snippets the console cannot render (e.g. encoding errors)
                continue
    return total
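
# Boolean operators are applied strictly left to right, with no precedence:
# for an illustrative query u'cat & dog | fox' the posting lists of 'cat'
# and 'dog' are intersected first, then the result is unioned with 'fox'.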

def split_by_div_text(text):
    paragraphs = []
    paragraph = []
    par_counter = -1
    for line in text:
        if line.strip() == "<div align=\"center\" >":
            if par_counter >= 0:
                paragraphs.append('\n'.join(paragraph))
            par_counter += 1
            paragraph = []
        else:
            paragraph.append(line)
    return paragraphs
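
# The source .shtml apparently separates sections with literal
# <div align="center" > lines; each chunk between two markers becomes one
# searchable "document", while text before the first marker is dropped.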

def main():
    with codecs.open('text_0080.shtml', 'r', 'cp1251') as text_file:
        data = text_file.readlines()
    splited_text = split_by_div_text(data)
    print 'Building the index'
    index_dict = get_reverse_index(splited_text)
    q = sys.argv[2].decode(sys.stdin.encoding)  # query
    if len(sys.argv) > 4:
        count_of_searches = int(sys.argv[4])
    else:
        count_of_searches = 10000
    parse_query(q, index_dict, splited_text, count_of_searches)


if __name__ == '__main__':
    main()
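
# Usage sketch (assumed from the argv indices above; the flag names are
# hypothetical, only the positions matter):
#   python search.py -q "word1 & word2" -n 20
# argv[2] carries the query and argv[4] the snippet limit; argv[1] and
# argv[3] are never inspected.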