Skip to content

Instantly share code, notes, and snippets.

@mdamien
Created September 19, 2017 11:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdamien/fc5d366534a68c6eb6ddb894e9201b10 to your computer and use it in GitHub Desktop.
Save mdamien/fc5d366534a68c6eb6ddb894e9201b10 to your computer and use it in GitHub Desktop.
import collections
import math
import os
import string
import sys
import unicodedata
# Per-file outcome of one search() call.
#   file           -- name of the file inside the corpus directory
#   matches        -- stripped lines whose normalized text contains the query
#   n_lines        -- number of lines scanned in the file
#   n_exact_match  -- number of space-separated words containing the query
#   n_exact_length -- summed length of the stripped lines of the file
#   title_match    -- True when the normalized file name contains the query
Result = collections.namedtuple(
    'Result',
    'file matches n_lines n_exact_match n_exact_length title_match',
)
# Characters that are all folded to a plain ASCII space.
_SPACE_CHARS = (
    ' ',
    '\N{NO-BREAK SPACE}',
    '\N{EN SPACE}',
    '\N{EM SPACE}',
    '\N{THREE-PER-EM SPACE}',
    '\N{FOUR-PER-EM SPACE}',
    '\N{SIX-PER-EM SPACE}',
    '\N{FIGURE SPACE}',
    '\N{PUNCTUATION SPACE}',
    '\N{THIN SPACE}',
    '\N{HAIR SPACE}',
    '\N{ZERO WIDTH SPACE}',
    '\N{NARROW NO-BREAK SPACE}',
    '\N{MEDIUM MATHEMATICAL SPACE}',
    '\N{IDEOGRAPHIC SPACE}',
    '\N{IDEOGRAPHIC HALF FILL SPACE}',
    '\N{ZERO WIDTH NO-BREAK SPACE}',
    '\N{TAG SPACE}',
)


class _DropUnknown(dict):
    """Translation table that deletes every code point it does not list."""

    def __missing__(self, key):
        # str.translate() removes characters mapped to None.  Unlike a
        # defaultdict, the missing key is not stored, so the table never grows.
        return None


def _build_translation_table():
    """Return the table of characters kept by remove_accents()."""
    table = _DropUnknown((ord(ch), ' ') for ch in _SPACE_CHARS)
    table.update(zip(map(ord, string.ascii_uppercase), string.ascii_lowercase))
    table.update(zip(map(ord, string.ascii_lowercase), string.ascii_lowercase))
    table.update(zip(map(ord, string.digits), string.digits))
    return table


# Built once: remove_accents() is called for every line of every file.
_TRANSLATION_TABLE = _build_translation_table()


def remove_accents(s):
    """Return *s* reduced to lowercase ASCII letters, digits and spaces.

    The string is first decomposed (Unicode NFD) so that an accented letter
    becomes its base letter followed by combining marks; the marks are then
    dropped together with every other character that is not an ASCII letter,
    an ASCII digit or one of the space-like characters in ``_SPACE_CHARS``.
    Uppercase ASCII letters are lowercased and space-like characters become a
    plain space.

    Decomposing first matters: without it only the accented letters that are
    explicitly listed would survive, and any other one (e.g. 'è' or 'ç')
    would be deleted outright instead of being folded to its base letter.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed.translate(_TRANSLATION_TABLE)
# Directory holding the corpus to search.  'data/poemes/' is the other corpus
# this script has been pointed at; change DIR to switch.
DIR = 'data/pcdwiki/'


def _load_corpus(directory):
    """Return a list of (file name, list of lines) for the files in *directory*.

    Lines keep their trailing newline.  Entries that are not regular files
    (e.g. sub-directories) are skipped, and every file is closed as soon as
    it has been read.
    """
    corpus = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        # Explicit encoding so that accented text is decoded the same way on
        # every platform instead of depending on the locale.
        with open(path, encoding='utf-8') as handle:
            corpus.append((name, handle.readlines()))
    return corpus


# In-memory index, built once at import time.
FILES = _load_corpus(DIR)
print('index built')
def _score(r):
    """Return the ranking score of a Result; a higher score ranks first.

    The score combines the log of the number of matching lines, the log of
    the match density (matching lines / total lines) and the log of the file
    length, and is multiplied by 1.6 when the file name matched the query.
    Requires at least one match and one line (log of zero is undefined).
    """
    # Ranking keys tried earlier, kept for reference:
    #   density:     -len(r.matches) / r.n_lines
    #   ponderation: -len(r.matches) * (1 + 0.2 / r.n_lines)
    n_matches = len(r.matches)
    doc_score = (
        math.log(n_matches) * 0.5
        + math.log(1 + n_matches / r.n_lines) * 4
        + math.log(r.n_lines) * 0.2
    )
    if r.title_match:
        return doc_score * 1.6
    return doc_score


def search(q, files=None):
    """Search the corpus for *q* and return the matching files, best first.

    The query and every line are normalized with ``remove_accents`` (defined
    elsewhere in this file) before a plain substring comparison.  One
    ``Result`` is produced per file that has at least one matching line; the
    results are sorted by decreasing ``_score``.  Progress messages and the
    scoring details of the five best results are printed to stdout.

    Parameters:
        q: the query string.
        files: iterable of ``(file name, lines)`` pairs to search; defaults
            to the module-level ``FILES`` index.

    Returns:
        The list of ``Result`` tuples, best match first.  The list is empty
        when nothing matches or when the query is empty after normalization.
    """
    if files is None:
        files = FILES
    q = remove_accents(q.lower())
    results = []
    print('searching..')
    # An empty needle is a substring of every string: without this guard a
    # query made only of punctuation would report every line of every file.
    corpus = files if q else ()
    for file, content in corpus:
        matches = []
        n_lines = 0
        n_exact_match = 0
        n_exact_length = 0
        title_match = q in remove_accents(file.lower())
        for line in content:
            line = line.strip()
            clean_line = remove_accents(line.lower())
            # Counts the words that contain the query (substring test).
            n_exact_match += sum(q in word for word in clean_line.split(' '))
            if q in clean_line:
                matches.append(line)
            n_lines += 1
            n_exact_length += len(line)
        if matches:
            results.append(Result(
                file=file,
                matches=matches,
                n_lines=n_lines,
                n_exact_match=n_exact_match,
                n_exact_length=n_exact_length,
                title_match=title_match,
            ))
    print('search finished')

    # reverse=True keeps the original order of equally scored results, exactly
    # like sorting on the negated score would.
    sorted_results = sorted(results, key=_score, reverse=True)
    print(len(sorted_results), 'results')
    for r in sorted_results[:5]:
        n_matches = len(r.matches)
        density = n_matches / r.n_lines
        print()
        print(r.file)
        print('density:', density)
        print(' - log density:', math.log(1 + density))
        print(' - composite density', density * math.log(n_matches))
        print('length:', r.n_lines)
        print(' - log length:', math.log(r.n_lines))
        print('matches:', n_matches)
        print(' - log matches:', math.log(n_matches))
        print('n_exact_match:', r.n_exact_match)
        print('n_exact_length:', r.n_exact_length)
        print('score:', _score(r))
    return sorted_results
# Command-line entry point: search the corpus for the first argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit('usage: {} QUERY'.format(sys.argv[0]))
    search(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment