# mdamien/query.py

## query.py
import collections
import math
import os
import string
import sys
import unicodedata

# One search hit, as built by search() for every file that matched:
#   file           -- file name as listed in the corpus directory
#   matches        -- stripped lines that contain the normalised query
#   n_lines        -- number of lines in the file
#   n_exact_match  -- count of space-separated words containing the query
#   n_exact_length -- total length of all stripped lines in the file
#   title_match    -- True when the query also occurs in the file name
Result = collections.namedtuple(
    'Result',
    'file matches n_lines n_exact_match n_exact_length title_match',
)


class _DropUnmapped(dict):
    """Translation table that deletes every code point it has no entry for."""

    def __missing__(self, key):
        # str.translate removes a character whose mapping is None. Unlike a
        # defaultdict, the miss is not stored, so the shared table never grows.
        return None


def _build_accent_table():
    """Build the table used by remove_accents(); called once at import."""
    spaces = (
        ' ',
        '\N{NO-BREAK SPACE}',
        '\N{EN SPACE}',
        '\N{EM SPACE}',
        '\N{THREE-PER-EM SPACE}',
        '\N{FOUR-PER-EM SPACE}',
        '\N{SIX-PER-EM SPACE}',
        '\N{FIGURE SPACE}',
        '\N{PUNCTUATION SPACE}',
        '\N{THIN SPACE}',
        '\N{HAIR SPACE}',
        '\N{ZERO WIDTH SPACE}',
        '\N{NARROW NO-BREAK SPACE}',
        '\N{MEDIUM MATHEMATICAL SPACE}',
        '\N{IDEOGRAPHIC SPACE}',
        '\N{IDEOGRAPHIC HALF FILL SPACE}',
        '\N{ZERO WIDTH NO-BREAK SPACE}',
        '\N{TAG SPACE}',
    )
    table = _DropUnmapped()
    # Every flavour of space collapses to a plain ASCII space.
    table.update((ord(ch), ' ') for ch in spaces)
    # ASCII letters are kept, folded to lower case; digits are kept as-is.
    table.update((ord(ch), ch.lower()) for ch in string.ascii_letters)
    table.update((ord(ch), ch) for ch in string.digits)
    # Ligatures have no Unicode decomposition, so spell them out explicitly.
    table.update({
        ord('\N{LATIN SMALL LIGATURE OE}'): 'oe',
        ord('\N{LATIN CAPITAL LIGATURE OE}'): 'oe',
        ord('\N{LATIN SMALL LETTER AE}'): 'ae',
        ord('\N{LATIN CAPITAL LETTER AE}'): 'ae',
    })
    return table


_ACCENT_TABLE = _build_accent_table()


def remove_accents(s):
    """Reduce *s* to lower-case ASCII letters, digits and plain spaces.

    The string is first put in NFD form so that an accented letter becomes
    its base letter followed by combining marks; the marks are then dropped
    together with every other character that is not an ASCII letter, a
    digit, one of the known space characters, or the ligatures oe / ae.
    Punctuation is therefore removed, not replaced by a space.

    Args:
        s: the text to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    return unicodedata.normalize('NFD', s).translate(_ACCENT_TABLE)

# Corpus directory that gets indexed at import time. 'data/poemes/' used to be
# assigned here as well but was immediately overridden, so only this value
# ever took effect.
DIR = 'data/pcdwiki/'


def _load_files(directory):
    """Read every regular file of *directory* into memory.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        A list of (file name, list of lines) pairs; lines keep their
        trailing newline, exactly as file iteration yields them.
    """
    loaded = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # Sub-directories cannot be opened as text; skip them.
        if not os.path.isfile(path):
            continue
        # NOTE(review): files are decoded with the platform default encoding,
        # as before -- confirm whether the corpus should be read as UTF-8.
        with open(path) as handle:
            loaded.append((name, list(handle)))
    return loaded


FILES = _load_files(DIR)
print('index built')

def search(q):
    """Rank the indexed files by how well their lines match the query *q*.

    The query, each line and each file name are lower-cased and passed
    through remove_accents() before comparison; matching is plain substring
    containment. Only files with at least one matching line are kept.
    Progress messages and a breakdown of the five best results are printed
    to stdout.

    Args:
        q: the raw query string.

    Returns:
        A list of Result tuples, highest score first.
    """
    needle = remove_accents(q.lower())
    found = []

    print('searching..')
    for name, lines in FILES:
        hits = []
        word_hits = 0
        total_chars = 0
        for raw in lines:
            stripped = raw.strip()
            normalized = remove_accents(stripped.lower())
            # Count the space-separated words that contain the query.
            word_hits += sum(
                1 for token in normalized.split(' ') if needle in token)
            if needle in normalized:
                hits.append(stripped)
            total_chars += len(stripped)
        if not hits:
            continue
        found.append(Result(
            file=name,
            matches=hits,
            n_lines=len(lines),
            n_exact_match=word_hits,
            n_exact_length=total_chars,
            title_match=needle in remove_accents(name.lower()),
        ))
    print('search finished')

    # Ranking formulas tried earlier, kept for reference:
    #   density:   len(r.matches) / r.n_lines
    #   weighting: len(r.matches) * (1 + 0.2 / r.n_lines)
    #   extra term: len(r.matches) / r.n_lines * math.log(len(r.matches)) * 2

    def relevance(r):
        """Combine match count, match density and file length into a score."""
        n_hits = len(r.matches)
        base = math.log(n_hits) * 0.5 \
            + math.log(1 + n_hits / r.n_lines) * 4 \
            + math.log(r.n_lines) * 0.2
        # A hit in the file name boosts the whole score.
        return base * 1.6 if r.title_match else base

    sorted_results = sorted(found, key=relevance, reverse=True)

    print(len(sorted_results), 'results')

    # Show how the score of each of the five best results is made up.
    for r in sorted_results[:5]:
        n_hits = len(r.matches)
        density = n_hits / r.n_lines
        print()
        print(r.file)
        print('density:', density)
        print('  - log density:', math.log(1 + density))
        print('  - composite density', density * math.log(n_hits))
        print('length:', r.n_lines)
        print('  - log length:', math.log(r.n_lines))
        print('matches:', n_hits)
        print('  - log matches:', math.log(n_hits))
        print('n_exact_match:', r.n_exact_match)
        print('n_exact_length:', r.n_exact_length)
        print('score:', relevance(r))

    return sorted_results

if __name__ == '__main__':
    # The query is the first command-line argument. Exit with a usage
    # message rather than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} QUERY'.format(sys.argv[0]))
    search(sys.argv[1])
	import sys, os, collections, string, math

	# One search hit, as built by search() for every file that matched:
	#   file           -- file name as listed in the corpus directory
	#   matches        -- stripped lines that contain the normalised query
	#   n_lines        -- number of lines in the file
	#   n_exact_match  -- count of space-separated words containing the query
	#   n_exact_length -- total length of all stripped lines in the file
	#   title_match    -- True when the query also occurs in the file name
	Result = collections.namedtuple(
	    'Result',
	    'file matches n_lines n_exact_match n_exact_length title_match',
	)


	class _DropUnmapped(dict):
	    """Translation table that deletes every code point it has no entry for."""

	    def __missing__(self, key):
	        # str.translate removes a character whose mapping is None. Unlike a
	        # defaultdict, the miss is not stored, so the shared table never grows.
	        return None


	def _build_accent_table():
	    """Build the table used by remove_accents(); called once at import."""
	    spaces = (
	        ' ',
	        '\N{NO-BREAK SPACE}',
	        '\N{EN SPACE}',
	        '\N{EM SPACE}',
	        '\N{THREE-PER-EM SPACE}',
	        '\N{FOUR-PER-EM SPACE}',
	        '\N{SIX-PER-EM SPACE}',
	        '\N{FIGURE SPACE}',
	        '\N{PUNCTUATION SPACE}',
	        '\N{THIN SPACE}',
	        '\N{HAIR SPACE}',
	        '\N{ZERO WIDTH SPACE}',
	        '\N{NARROW NO-BREAK SPACE}',
	        '\N{MEDIUM MATHEMATICAL SPACE}',
	        '\N{IDEOGRAPHIC SPACE}',
	        '\N{IDEOGRAPHIC HALF FILL SPACE}',
	        '\N{ZERO WIDTH NO-BREAK SPACE}',
	        '\N{TAG SPACE}',
	    )
	    table = _DropUnmapped()
	    # Every flavour of space collapses to a plain ASCII space.
	    table.update((ord(ch), ' ') for ch in spaces)
	    # ASCII letters are kept, folded to lower case; digits are kept as-is.
	    table.update((ord(ch), ch.lower()) for ch in string.ascii_letters)
	    table.update((ord(ch), ch) for ch in string.digits)
	    # Ligatures have no Unicode decomposition, so spell them out explicitly.
	    table.update({
	        ord('\N{LATIN SMALL LIGATURE OE}'): 'oe',
	        ord('\N{LATIN CAPITAL LIGATURE OE}'): 'oe',
	        ord('\N{LATIN SMALL LETTER AE}'): 'ae',
	        ord('\N{LATIN CAPITAL LETTER AE}'): 'ae',
	    })
	    return table


	_ACCENT_TABLE = _build_accent_table()


	def remove_accents(s):
	    """Reduce *s* to lower-case ASCII letters, digits and plain spaces.

	    The string is first put in NFD form so that an accented letter becomes
	    its base letter followed by combining marks; the marks are then dropped
	    together with every other character that is not an ASCII letter, a
	    digit, one of the known space characters, or the ligatures oe / ae.
	    Punctuation is therefore removed, not replaced by a space.

	    Args:
	        s: the text to normalise.

	    Returns:
	        The normalised string (possibly empty).
	    """
	    return unicodedata.normalize('NFD', s).translate(_ACCENT_TABLE)

	# Corpus directory that gets indexed at import time. 'data/poemes/' used to be
	# assigned here as well but was immediately overridden, so only this value
	# ever took effect.
	DIR = 'data/pcdwiki/'


	def _load_files(directory):
	    """Read every regular file of *directory* into memory.

	    Args:
	        directory: path of the directory to scan (not recursive).

	    Returns:
	        A list of (file name, list of lines) pairs; lines keep their
	        trailing newline, exactly as file iteration yields them.
	    """
	    loaded = []
	    for name in os.listdir(directory):
	        path = os.path.join(directory, name)
	        # Sub-directories cannot be opened as text; skip them.
	        if not os.path.isfile(path):
	            continue
	        # NOTE(review): files are decoded with the platform default encoding,
	        # as before -- confirm whether the corpus should be read as UTF-8.
	        with open(path) as handle:
	            loaded.append((name, list(handle)))
	    return loaded


	FILES = _load_files(DIR)
	print('index built')

	def search(q):
	    """Rank the indexed files by how well their lines match the query *q*.

	    The query, each line and each file name are lower-cased and passed
	    through remove_accents() before comparison; matching is plain substring
	    containment. Only files with at least one matching line are kept.
	    Progress messages and a breakdown of the five best results are printed
	    to stdout.

	    Args:
	        q: the raw query string.

	    Returns:
	        A list of Result tuples, highest score first.
	    """
	    needle = remove_accents(q.lower())
	    found = []

	    print('searching..')
	    for name, lines in FILES:
	        hits = []
	        word_hits = 0
	        total_chars = 0
	        for raw in lines:
	            stripped = raw.strip()
	            normalized = remove_accents(stripped.lower())
	            # Count the space-separated words that contain the query.
	            word_hits += sum(
	                1 for token in normalized.split(' ') if needle in token)
	            if needle in normalized:
	                hits.append(stripped)
	            total_chars += len(stripped)
	        if not hits:
	            continue
	        found.append(Result(
	            file=name,
	            matches=hits,
	            n_lines=len(lines),
	            n_exact_match=word_hits,
	            n_exact_length=total_chars,
	            title_match=needle in remove_accents(name.lower()),
	        ))
	    print('search finished')

	    # Ranking formulas tried earlier, kept for reference:
	    #   density:   len(r.matches) / r.n_lines
	    #   weighting: len(r.matches) * (1 + 0.2 / r.n_lines)
	    #   extra term: len(r.matches) / r.n_lines * math.log(len(r.matches)) * 2

	    def relevance(r):
	        """Combine match count, match density and file length into a score."""
	        n_hits = len(r.matches)
	        base = math.log(n_hits) * 0.5 \
	            + math.log(1 + n_hits / r.n_lines) * 4 \
	            + math.log(r.n_lines) * 0.2
	        # A hit in the file name boosts the whole score.
	        return base * 1.6 if r.title_match else base

	    sorted_results = sorted(found, key=relevance, reverse=True)

	    print(len(sorted_results), 'results')

	    # Show how the score of each of the five best results is made up.
	    for r in sorted_results[:5]:
	        n_hits = len(r.matches)
	        density = n_hits / r.n_lines
	        print()
	        print(r.file)
	        print('density:', density)
	        print(' - log density:', math.log(1 + density))
	        print(' - composite density', density * math.log(n_hits))
	        print('length:', r.n_lines)
	        print(' - log length:', math.log(r.n_lines))
	        print('matches:', n_hits)
	        print(' - log matches:', math.log(n_hits))
	        print('n_exact_match:', r.n_exact_match)
	        print('n_exact_length:', r.n_exact_length)
	        print('score:', relevance(r))

	    return sorted_results

	if __name__ == '__main__':
	    # The query is the first command-line argument. Exit with a usage
	    # message rather than an IndexError traceback when it is missing.
	    if len(sys.argv) < 2:
	        sys.exit('usage: {} QUERY'.format(sys.argv[0]))
	    search(sys.argv[1])