Created
April 9, 2014 11:46
-
-
Save lisovskyvlad/10259439 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
# coding: utf-8 | |
import codecs | |
import re | |
from string import punctuation | |
import collections | |
import sys | |
# Pre-compiled pattern matching runs of whitespace and/or ASCII punctuation;
# used to split raw text into candidate tokens.
punctuation_regexp = re.compile(r'[\s{}]+'.format(re.escape(punctuation)))
# Pre-compiled pattern matching digit runs; digits are stripped out of tokens.
digit_regexp = re.compile(r'\d+')
def get_tokens(text):
    """Extract the unique tokens of one document.

    Splits *text* on whitespace/punctuation runs, removes digit runs from
    each piece, and keeps the non-empty results.

    Returns a tuple ``(tokens, count, sum_length)``: the set of unique
    tokens, its size, and the total character length of all unique tokens.
    """
    tokens = set()
    for piece in punctuation_regexp.split(text):
        piece = digit_regexp.sub('', piece)
        if piece:
            tokens.add(piece)
    # Aggregate once after the set is complete -- the original recomputed
    # both the sum and the count on every loop iteration (accidentally
    # quadratic in the number of tokens).
    sum_length = sum(len(t) for t in tokens)
    return tokens, len(tokens), sum_length
def get_terms(tokens):
    """Lower-case the tokens of one document into its unique terms.

    Returns a tuple ``(terms, count, sum_length)``: the set of lower-cased
    terms, its size, and the total character length of all terms.
    """
    terms = {t.lower() for t in tokens}
    # Aggregate once after the set is built -- the original recomputed the
    # sum and count on every loop iteration (accidentally quadratic).
    sum_length = sum(len(t) for t in terms)
    return terms, len(terms), sum_length
def get_reverse_index(paragraphs):
    """Build an inverted index over *paragraphs*.

    Each paragraph is one document; its unique, lower-cased terms are
    extracted via get_tokens/get_terms. The index maps each term to a
    tuple ``(document_frequency, sorted_list_of_doc_ids)``.

    The result is a ``defaultdict`` whose default value ``(0, [])`` lets
    callers (parse_query) look up unseen terms without a KeyError.
    """
    # term -> set of doc ids containing it
    postings = collections.defaultdict(set)
    for doc_id, paragraph in enumerate(paragraphs):
        tokens, _, _ = get_tokens(paragraph)
        terms, _, _ = get_terms(tokens)
        for term in terms:
            postings[term].add(doc_id)
    # Keep the defaultdict default so missing terms yield (0, []).
    index_dict = collections.defaultdict(lambda: (0, list()))
    for term, doc_ids in postings.items():
        # Sort each postings list exactly once, instead of re-sorting on
        # every insertion as the original did (O(n^2 log n) per term).
        index_dict[term] = len(doc_ids), sorted(doc_ids)
    return index_dict
def parse_query(q, rindex, splited_text, count_of_searches): | |
total = set() | |
left, right, oper= False, False, False | |
splited_query = q.split() | |
if len(splited_query) <= 1: | |
total = set(rindex[splited_query[0].lower()][1]) | |
else: | |
for qitem in q.split(): | |
if qitem in '&|': | |
oper = qitem | |
elif not left: | |
left = rindex[qitem][1] | |
# print left | |
elif not right: | |
right = rindex[qitem][1] | |
# print right | |
if oper and left and right: | |
# print 'debug' | |
# print left, oper, right | |
if oper == '&': | |
total.update(set(left) & set(right)) | |
elif oper == '|': | |
total.update(set(left) | set(right)) | |
left = total | |
right = False | |
oper = False | |
# print 'total', total | |
# print snippets | |
print 'Слова для поиска:' | |
for qitem in q.split(): | |
if qitem not in '&|': | |
print qitem | |
count_of_searches_already = 0 | |
for parag in total: | |
for qitem in q.split(): | |
if qitem in '&|': | |
continue | |
paragraph = splited_text[parag] | |
try: | |
ind = paragraph.find(' '+qitem) | |
if ind is not -1: | |
print '%d: %s' % (parag, qitem) | |
print paragraph[ind-60: ind+60] | |
count_of_searches_already += 1 | |
if count_of_searches_already >= count_of_searches: | |
return total | |
print '' | |
except: | |
continue | |
return total | |
def split_by_div_text(text):
    """Split the lines of the .shtml file into paragraph strings.

    A line whose stripped content equals the centering ``<div>`` marker
    starts a new paragraph. Everything before the first marker is
    discarded, as is anything after the last marker.
    NOTE(review): the trailing chunk is never flushed into the result --
    presumably that drops closing-page markup, but confirm it is intended.
    """
    marker = "<div align=\"center\" >"
    paragraphs = []
    current = []
    markers_seen = 0
    for line in text:
        if line.strip() == marker:
            # Flush the accumulated paragraph, but only once we are past
            # the preamble that precedes the first marker.
            if markers_seen > 0:
                paragraphs.append('\n'.join(current))
            markers_seen += 1
            current = []
        else:
            current.append(line)
    return paragraphs
def main():
    """Entry point: build the inverted index for a fixed cp1251-encoded
    .shtml file and evaluate the boolean query given on the command line."""
    # The source document is hard-coded; it is windows-1251 encoded.
    with codecs.open('text_0080.shtml', 'r', 'cp1251') as text_file:
        data = text_file.readlines()
    splited_text = split_by_div_text(data)
    print 'Процесс постороение индекса'
    index_dict = get_reverse_index(splited_text)
    # NOTE(review): the query is read from argv[2] (not argv[1]) and the
    # result limit from argv[4] (not argv[3]) -- presumably flag-style
    # arguments occupy argv[1]/argv[3]; confirm the intended CLI shape.
    q = sys.argv[2].decode(sys.stdin.encoding)  # query (Python 2 bytes -> unicode)
    count_of_searches = 0
    if len(sys.argv) > 3:
        count_of_searches = int(sys.argv[4])
    else:
        # Effectively "no limit" when the argument is absent.
        count_of_searches = 10000
    parse_query(q, index_dict, splited_text, count_of_searches)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment