Skip to content

Instantly share code, notes, and snippets.

@jgsogo
Created November 19, 2015 23:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jgsogo/61975297ff6ca3f01a71 to your computer and use it in GitHub Desktop.
Save jgsogo/61975297ff6ca3f01a71 to your computer and use it in GitHub Desktop.
Naïve search engine implementation
# -*- coding: utf-8 -*-
# Naïve search engine implementation done as example for @lingwars meeting
print("============================")
print("=== Mi motor de búsqueda ===")
print("============================")
print("\n")
# Creamos nuestra base de datos
docs = [
"los reyes de España fueron a la sede de el BBVA en Madrid", # Es intencionado el "de+el"
"Isabel y Fernando reyes de España",
"el Real Madrid gana un partido al Manchester",
"Fernando Torres no juega en el Real Madrid",
]
# Algunos datos sobre el "corpus"
## Número de documentos
n_docs = len(docs)
print(" - Tengo %d documentos" % n_docs)
for doc, i in zip(docs, xrange(len(docs))):
print(" + doc#%d: %s" % (i, docs[i]))
print("")
## Vamos a por las palabras ("yo soy yo" son 3 palabras)
def get_words(doc):
words = doc.split()
return words
tokens = [get_words(doc) for doc in docs]
n_tokens = sum(len(t) for t in tokens)
print(" - Tengo %d palabras en total" % n_tokens)
for i in xrange(len(docs)):
print(" + doc#%d tiene %d palabras: %s" % (i, len(tokens[i]), tokens[i]))
print("")
## Y ahora a por las formas ("yo soy yo" tiene dos formas: "yo" (2), "soy" (1))
types = [set(t) for t in tokens]
all_types = list(set([w for d in tokens for w in d]))
n_types = len(all_types)
print(" - Tengo %d formas en total" % (n_types))
print(" + doc#0 tiene %d formas: %s" % (len(types[0]), types[0]))
print("")
## Pero queremos almacenar frecuencias
def count_freqs(doc, types):
words = get_words(doc)
freqs = [0 for t in types]
for w in words:
try:
freqs[types.index(w)] += 1
except ValueError:
pass
return freqs
counts = []
for doc in docs:
freqs = count_freqs(doc, all_types)
counts.append(freqs)
for t,idx in zip(all_types, xrange(len(all_types))):
aux_str = ""
for i in xrange(len(docs)):
aux_str += "\t%d" % counts[i][idx]
print("%20s\t%s" % (t, aux_str))
## Implementemos un algoritmo de comparación de documentos
def compare(doc1, doc2, types):
import math
freq1 = count_freqs(doc1, types)
freq2 = count_freqs(doc2, types)
suma = 0
for v1, v2 in zip(freq1, freq2):
suma += v1*v2
mod1 = sum([v*v for v in freq1])
mod2 = sum([v*v for v in freq2])
return suma/(math.sqrt(mod1)*math.sqrt(mod2))
## Busquemos pues
q = raw_input("Introduzca las palabras clave que quiere buscar: ")
print("-> tu query es: %s" % q)
words = get_words(q)
print(" + words: %s" % words)
freqs = count_freqs(q, all_types)
print(" + freqs: %s" % freqs)
points = []
for doc, i in zip(docs, xrange(len(docs))):
val = compare(q, doc, all_types)
print(" > doc#%d: %f" % (i, val))
points.append(val)
winner = points.index(max(points))
print("Mejor resultado: doc#%d" % winner)
print(">> %s" % docs[winner])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment