Skip to content

Instantly share code, notes, and snippets.

@yohanboniface
Last active December 2, 2018 11:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yohanboniface/d7d9472c0dac7eb5955e71bdeeb92453 to your computer and use it in GitHub Desktop.
Save yohanboniface/d7d9472c0dac7eb5955e71bdeeb92453 to your computer and use it in GitHub Desktop.
import re
import time
from lxml import etree
from pymongo import MongoClient
from minicli import cli, run
# MongoClient() is called without arguments, so the driver's default
# connection settings are used.
client = MongoClient()
# Handles on the "mydb" database and its "disorders" collection; the CLI
# commands in this file read from and write to `collection`.
db = client.mydb
collection = db.disorders
def trigrams(value, n=3):
    """Return the overlapping n-character slices of *value*.

    The value is first prefixed with "^" so that the leading slice is
    distinguishable from the same characters found in the middle of a word.
    Slices advance one character at a time; when the prefixed value is
    shorter than *n* the result is an empty list.
    """
    anchored = "^{}".format(value)
    # Number of full-width windows that fit in the anchored string.
    window_count = len(anchored) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(anchored[start:start + n])
    return grams
# Ordered (pattern, replacement) pairs applied one after the other by
# clean_text(); the order matters because each rule sees the output of the
# previous one.
_PHONETIC_RULES = (
    # "c" followed by any character other than i, e, y or w becomes "k".
    ("c(?=[^ieyw])", "k"),
    # A "c" at the very end of the text becomes "k".
    ("c$", "k"),
    # "s" between two vowels (y included) becomes "z".
    ("(?<=[aeiouy])s(?=[aeiouy])", "z"),
    ("ph", "f"),
    # Drop an "h" preceded by any character other than "s" or "c".
    ("(?<=[^sc])h", ""),
    # Drop a leading "h" when at least one character follows it.
    ("^h(?=.)+", ""),
    # "y" preceded by a non-digit character becomes "i".
    ("(?<=[^0-9])y", "i"),
    # Drop a non-digit character immediately followed by the same character.
    ("(\\D)(?=\\1)", ""),
)


def clean_text(text):
    """Return a lowercased, phonetically simplified version of *text*.

    Every non-word character is replaced by a space, runs of spaces are
    squeezed to a single one, and the result is lowercased and stripped.
    The substitution rules of ``_PHONETIC_RULES`` are then applied in order.
    """
    spaced = re.sub(r"[^\w]", " ", text)
    squeezed = re.sub(" {2,}", " ", spaced)
    cleaned = squeezed.lower().strip()
    for regex, replacement in _PHONETIC_RULES:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned
@cli
def create_index():
    """Create an index on the "trigrams" field of the disorders collection."""
    collection.create_index("trigrams")
@cli(name="import")
def import_():
    """Import dataset in MongoDB.

    Reads "en_product1.xml" from the current directory and stores one
    document per Disorder element, holding its name, the trigrams of the
    cleaned name and its OrphaNumber (as an integer "_id"). Documents are
    inserted in batches to limit the number of round trips.
    """
    batch_size = 100
    # From http://www.orphadata.org/data/xml/en_product1.xml
    root = etree.parse("en_product1.xml")
    batch = []
    # Use an explicit relative path: on an ElementTree, the absolute form
    # "//Disorder" is rewritten to ".//Disorder" and triggers a FutureWarning.
    for disorder in root.iterfind(".//Disorder"):
        name = disorder.find("Name").text
        batch.append(
            {
                "name": name,
                "trigrams": trigrams(clean_text(name)),
                "_id": int(disorder.find("OrphaNumber").text),
            }
        )
        if len(batch) == batch_size:
            collection.insert_many(batch)
            batch = []
    # Flush the last, possibly partial, batch (insert_many rejects an empty
    # list, hence the guard).
    if batch:
        collection.insert_many(batch)
def _search_pipeline(query_trigrams):
    """Return the aggregation pipeline scoring documents for a query.

    *query_trigrams* must be a non-empty list of distinct trigrams, because
    its length is used as a divisor when computing the score.
    """
    return [
        # Only consider documents sharing at least one trigram with the query.
        {"$match": {"trigrams": {"$in": query_trigrams}}},
        {
            "$project": {
                "name": "$name",
                "found": {
                    "$size": {
                        # Dedupe the matched trigrams.
                        "$setUnion": [
                            {
                                # Extract the trigram that matched.
                                "$filter": {
                                    "input": "$trigrams",
                                    "as": "item",
                                    "cond": {"$in": ["$$item", query_trigrams]},
                                }
                            }
                        ]
                    }
                },
                # Count distinct trigrams, like "found" does, so that a
                # document repeating a trigram can still reach a ratio of 1.
                "length": {"$size": {"$setUnion": ["$trigrams"]}},
            }
        },
        {
            "$project": {
                # Mean of the two coverage ratios.
                "score": {
                    "$divide": [
                        {
                            "$add": [
                                # How much the query matched THAT document.
                                {"$divide": ["$found", "$length"]},
                                # How much THAT document match the query
                                {"$divide": ["$found", len(query_trigrams)]},
                            ]
                        },
                        2,
                    ]
                },
                "name": "$name",
            }
        },
        {"$match": {"score": {"$gt": 0.5}}},
        {"$sort": {"score": -1}},
        {"$limit": 3},
    ]


@cli
def search(text):
    """Run a search using the complete algorithm.
    Usage: python mongocomplte.py search cistic
    """
    query_trigrams = trigrams(clean_text(text))
    # The score counts distinct matched trigrams, so the query side must be
    # measured in distinct trigrams too (order is preserved).
    distinct_trigrams = list(dict.fromkeys(query_trigrams))
    start = time.perf_counter()
    if distinct_trigrams:
        # Force evaluation for timing.
        docs = list(collection.aggregate(_search_pipeline(distinct_trigrams)))
    else:
        # A query too short to yield any trigram cannot match anything, and
        # would put a division by zero in the pipeline.
        docs = []
    duration = time.perf_counter() - start
    print(f"Searching {query_trigrams}")
    for doc in docs:
        print(doc)
    print(f"Duration: {duration:.5f}")
# When executed as a script, hand control to minicli's run(), which is
# expected to invoke the @cli command named on the command line
# (e.g. "search cistic").
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment