import re
import time
from lxml import etree
from pymongo import MongoClient
from minicli import cli, run
# Shared MongoDB handles, created once when the module is imported.
# MongoClient() is called without arguments, so the driver's default
# connection settings apply.
client = MongoClient()
# Subscript access selects the "mydb" database and its "disorders"
# collection by name.
db = client["mydb"]
collection = db["disorders"]
def trigrams(value, n=3):
    """Return the overlapping n-grams of *value*, anchored at its start.

    A ``^`` marker is prepended before slicing, so the first n-gram records
    that it sits at the beginning of the string.

    Args:
        value: The text to split.
        n: Length of each n-gram; defaults to 3 (trigrams).

    Returns:
        A list of every consecutive substring of length ``n`` taken from the
        marked string, in order. The list is empty when the marked string is
        shorter than ``n``.
    """
    marked = f"^{value}"
    # One window per start offset; the last window ends on the last character.
    return [marked[i : i + n] for i in range(len(marked) - n + 1)]
def clean_text(text):
    """Normalize *text* into a rough phonetic form for fuzzy matching.

    The text is first stripped of punctuation, lowercased and trimmed, then
    rewritten by an ordered list of regex substitutions so that common
    spelling variants collapse to the same string.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    # Replace every non-word character with a space, then collapse runs of
    # spaces, lowercase and trim.
    text = re.sub(r"[^\w]", " ", text)
    text = re.sub(" {2,}", " ", text).lower().strip()
    # Order matters: each rule is applied to the output of the previous one.
    rules = (
        # "c" followed by any character other than i/e/y/w becomes "k".
        ("c(?=[^ieyw])", "k"),
        # A "c" at the very end of the text becomes "k".
        ("c$", "k"),
        # "s" between two vowels (y included) becomes "z".
        ("(?<=[aeiouy])s(?=[aeiouy])", "z"),
        ("ph", "f"),
        # Drop "h" when preceded by a character other than "s" or "c".
        ("(?<=[^sc])h", ""),
        # Drop a leading "h" when at least one character follows it.
        ("^h(?=.)+", ""),
        # "y" preceded by a non-digit character becomes "i".
        ("(?<=[^0-9])y", "i"),
        ("(\\D)(?=\\1)", ""),  # Remove duplicate letters.
    )
    for pattern, repl in rules:
        text = re.sub(pattern, repl, text)
    return text
# NOTE(review): only the signature of create_index is visible here; its body
# (and any decorator) is missing from this view, so its behavior cannot be
# documented from this chunk. Confirm against the full file.
def create_index():
# NOTE(review): this block looks incomplete in this view. The statement that
# opens the per-disorder dict, the call(s) that write `docs` to MongoDB and the
# body of the final `if docs:` are not visible, and indentation has been lost.
# The comments below describe only the lines that are shown.
def import_():
"""Import dataset in MongoDB."""
# From
# Parse the XML dataset file with lxml.
root = etree.parse("en_product1.xml")
# Accumulator for the documents built from the XML.
docs = []
# Visit every <Disorder> element found anywhere in the tree.
for disorder in root.iterfind("//Disorder"):
name = disorder.find("Name").text
# Fields of the document built for this disorder: the display name, the
# trigrams of its normalized name, and the OrphaNumber (as an int) used as
# the MongoDB `_id`.
"name": name,
"trigrams": trigrams(clean_text(name)),
"_id": int(disorder.find("OrphaNumber").text),
# Once 100 documents have accumulated the list is reset; presumably the batch
# is written to the collection just before the reset — confirm in full file.
if len(docs) == 100:
docs = []
# Handles whatever is left in `docs` after the loop; the action taken is not
# visible here.
if docs:
# NOTE(review): this block looks incomplete in this view. The docstring's
# closing delimiter, the bracket-only lines of the aggregation pipeline, the
# divisor of the outer `$divide` and the body of the result loop are not
# visible, and indentation has been lost. The comments below describe only the
# lines that are shown.
def search(text):
"""Run a search using the complete algorithm.
Usage: python search cistic
# Rebind `text` to the trigrams of the normalized query.
text = trigrams(clean_text(text))
start = time.perf_counter()
docs = collection.aggregate(
# Keep only documents that share at least one trigram with the query.
{"$match": {"trigrams": {"$in": text}}},
# First projection: count the distinct matched trigrams (`found`) and the
# document's total number of trigrams (`length`).
"$project": {
"name": "$name",
"found": {
"$size": {
# Dedupe the matched trigrams.
"$setUnion": [
# Extract the trigram that matched.
"$filter": {
"input": "$trigrams",
"as": "item",
"cond": {"$in": ["$$item", text]},
"length": {"$size": "$trigrams"},
# Second projection: combine the two match ratios into a single `score`.
"$project": {
"score": {
"$divide": [
"$add": [
# How much the query matched THAT document.
{"$divide": ["$found", "$length"]},
# How much THAT document matches the query.
{"$divide": ["$found", len(text)]},
"name": "$name",
# Keep scores above 0.5, best first, at most three results.
{"$match": {"score": {"$gt": 0.5}}},
{"$sort": {"score": -1}},
{"$limit": 3},
# Force evaluation for timing.
docs = list(docs)
duration = time.perf_counter() - start
# `text` was rebound above, so this prints the trigram list rather than the
# raw query string.
print(f"Searching {text}")
for doc in docs:
print(f"Duration: {duration:.5f}")
# Script entry point. NOTE(review): the statement guarded by this check is not
# visible in this view; `run` is imported from minicli at the top of the file,
# so it is presumably called here — confirm against the full file.
if __name__ == "__main__":
