Created
November 19, 2015 23:00
-
-
Save jgsogo/61975297ff6ca3f01a71 to your computer and use it in GitHub Desktop.
Naïve search engine implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Naïve search engine implementation done as example for @lingwars meeting | |
print("============================") | |
print("=== Mi motor de búsqueda ===") | |
print("============================") | |
print("\n") | |
# Creamos nuestra base de datos | |
docs = [ | |
"los reyes de España fueron a la sede de el BBVA en Madrid", # Es intencionado el "de+el" | |
"Isabel y Fernando reyes de España", | |
"el Real Madrid gana un partido al Manchester", | |
"Fernando Torres no juega en el Real Madrid", | |
] | |
# Algunos datos sobre el "corpus" | |
## Número de documentos | |
n_docs = len(docs) | |
print(" - Tengo %d documentos" % n_docs) | |
for doc, i in zip(docs, xrange(len(docs))): | |
print(" + doc#%d: %s" % (i, docs[i])) | |
print("") | |
## Vamos a por las palabras ("yo soy yo" son 3 palabras) | |
def get_words(doc): | |
words = doc.split() | |
return words | |
tokens = [get_words(doc) for doc in docs] | |
n_tokens = sum(len(t) for t in tokens) | |
print(" - Tengo %d palabras en total" % n_tokens) | |
for i in xrange(len(docs)): | |
print(" + doc#%d tiene %d palabras: %s" % (i, len(tokens[i]), tokens[i])) | |
print("") | |
## Y ahora a por las formas ("yo soy yo" tiene dos formas: "yo" (2), "soy" (1)) | |
types = [set(t) for t in tokens] | |
all_types = list(set([w for d in tokens for w in d])) | |
n_types = len(all_types) | |
print(" - Tengo %d formas en total" % (n_types)) | |
print(" + doc#0 tiene %d formas: %s" % (len(types[0]), types[0])) | |
print("") | |
## Pero queremos almacenar frecuencias | |
def count_freqs(doc, types): | |
words = get_words(doc) | |
freqs = [0 for t in types] | |
for w in words: | |
try: | |
freqs[types.index(w)] += 1 | |
except ValueError: | |
pass | |
return freqs | |
counts = [] | |
for doc in docs: | |
freqs = count_freqs(doc, all_types) | |
counts.append(freqs) | |
for t,idx in zip(all_types, xrange(len(all_types))): | |
aux_str = "" | |
for i in xrange(len(docs)): | |
aux_str += "\t%d" % counts[i][idx] | |
print("%20s\t%s" % (t, aux_str)) | |
## Implementemos un algoritmo de comparación de documentos | |
def compare(doc1, doc2, types): | |
import math | |
freq1 = count_freqs(doc1, types) | |
freq2 = count_freqs(doc2, types) | |
suma = 0 | |
for v1, v2 in zip(freq1, freq2): | |
suma += v1*v2 | |
mod1 = sum([v*v for v in freq1]) | |
mod2 = sum([v*v for v in freq2]) | |
return suma/(math.sqrt(mod1)*math.sqrt(mod2)) | |
## Busquemos pues | |
q = raw_input("Introduzca las palabras clave que quiere buscar: ") | |
print("-> tu query es: %s" % q) | |
words = get_words(q) | |
print(" + words: %s" % words) | |
freqs = count_freqs(q, all_types) | |
print(" + freqs: %s" % freqs) | |
points = [] | |
for doc, i in zip(docs, xrange(len(docs))): | |
val = compare(q, doc, all_types) | |
print(" > doc#%d: %f" % (i, val)) | |
points.append(val) | |
winner = points.index(max(points)) | |
print("Mejor resultado: doc#%d" % winner) | |
print(">> %s" % docs[winner]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment