# yohanboniface/mongocomplete.py

## mongocomplete.py
import re
import time

from lxml import etree
from pymongo import MongoClient
from minicli import cli, run

# Module-wide MongoDB handles. MongoClient() is called without arguments, so
# the driver's default connection settings apply.
client = MongoClient()
db = client["mydb"]
collection = db["disorders"]


def trigrams(value, n=3):
    """Split ``value`` into overlapping character n-grams.

    A ``^`` marker is prepended first, so the leading n-gram records where
    the text begins. When the marked text is shorter than ``n`` the result
    is an empty list.
    """
    marked = f"^{value}"
    last_start = len(marked) - n
    return [marked[start : start + n] for start in range(last_start + 1)]


def clean_text(text):
    """Normalise ``text`` into a rough phonetic key.

    Non-word characters become spaces, runs of spaces are collapsed, and the
    result is lower-cased and stripped. A fixed list of regex substitutions
    is then applied in order, each one working on the previous one's output.
    """
    spaced = re.sub(r"[^\w]", " ", text)
    normalized = re.sub(" {2,}", " ", spaced).lower().strip()
    substitutions = [
        # "c" followed by anything other than i, e, y or w becomes "k".
        ("c(?=[^ieyw])", "k"),
        # A trailing "c" becomes "k" as well.
        ("c$", "k"),
        # "s" between two vowels becomes "z".
        ("(?<=[aeiouy])s(?=[aeiouy])", "z"),
        ("ph", "f"),
        # Drop an "h" preceded by any character other than "s" or "c"...
        ("(?<=[^sc])h", ""),
        # ...and a leading "h" that is followed by another character.
        ("^h(?=.)+", ""),
        # "y" preceded by a non-digit character becomes "i".
        ("(?<=[^0-9])y", "i"),
        # Drop a non-digit character that is immediately repeated.
        ("(\\D)(?=\\1)", ""),
    ]
    result = normalized
    for regex, replacement in substitutions:
        result = re.sub(regex, replacement, result)
    return result


@cli
def create_index():
    """Create an index on the ``trigrams`` field of the disorders collection."""
    indexed_field = "trigrams"
    collection.create_index(indexed_field)


@cli(name="import")
def import_():
    """Load the disorders from ``en_product1.xml`` into MongoDB.

    Each ``Disorder`` element becomes one document holding its name, the
    trigrams of its cleaned name, and its Orpha number as ``_id``. Documents
    are written in batches of 100.
    """
    # Dataset source: http://www.orphadata.org/data/xml/en_product1.xml
    tree = etree.parse("en_product1.xml")
    batch = []
    for node in tree.iterfind("//Disorder"):
        label = node.find("Name").text
        grams = trigrams(clean_text(label))
        orpha_number = int(node.find("OrphaNumber").text)
        batch.append({"name": label, "trigrams": grams, "_id": orpha_number})
        if len(batch) < 100:
            continue
        # The batch is full: write it out and start a new one.
        collection.insert_many(batch)
        batch = []
    # Flush whatever is left over from the last, partial batch.
    if batch:
        collection.insert_many(batch)


def _search_pipeline(grams):
    """Build the aggregation pipeline that ranks documents against ``grams``.

    ``grams`` is the list of query trigrams. The pipeline keeps documents
    sharing at least one trigram with the query, scores them, drops scores
    of 0.5 or less and returns the three best matches.
    """
    return [
        # Keep only documents sharing at least one trigram with the query.
        {"$match": {"trigrams": {"$in": grams}}},
        {
            "$project": {
                "name": "$name",
                # Count of distinct trigrams present in both the document and
                # the query; $setIntersection already removes duplicates.
                "found": {"$size": {"$setIntersection": ["$trigrams", grams]}},
                "length": {"$size": "$trigrams"},
            }
        },
        {
            "$project": {
                # Mean of the two coverage ratios below.
                "score": {
                    "$divide": [
                        {
                            "$add": [
                                # Shared trigrams relative to the document's
                                # trigram count.
                                {"$divide": ["$found", "$length"]},
                                # Shared trigrams relative to the query's
                                # trigram count.
                                {"$divide": ["$found", len(grams)]},
                            ]
                        },
                        2,
                    ]
                },
                "name": "$name",
            }
        },
        {"$match": {"score": {"$gt": 0.5}}},
        {"$sort": {"score": -1}},
        {"$limit": 3},
    ]


@cli
def search(text):
    """Run a search using the complete algorithm.

    The query is cleaned and split into trigrams, matched against the stored
    documents, and the matching documents are printed along with the time
    the query took.

    Usage: python mongocomplete.py search cistic
    """
    grams = trigrams(clean_text(text))
    start = time.perf_counter()
    # list() forces evaluation of the cursor so the timing covers the query.
    docs = list(collection.aggregate(_search_pipeline(grams)))
    duration = time.perf_counter() - start
    print(f"Searching {grams}")
    for doc in docs:
        print(doc)
    print(f"Duration: {duration:.5f}")


# Call minicli's run() only when this file is executed as a script.
if __name__ == "__main__":
    run()
	import re
	import time

	from lxml import etree
	from pymongo import MongoClient
	from minicli import cli, run

	# Module-wide MongoDB handles. MongoClient() is called without arguments, so
	# the driver's default connection settings apply.
	client = MongoClient()
	db = client["mydb"]
	collection = db["disorders"]


	def trigrams(value, n=3):
		"""Split ``value`` into overlapping character n-grams.

		A ``^`` marker is prepended first, so the leading n-gram records where
		the text begins. When the marked text is shorter than ``n`` the result
		is an empty list.
		"""
		marked = f"^{value}"
		last_start = len(marked) - n
		return [marked[start : start + n] for start in range(last_start + 1)]


	def clean_text(text):
		"""Normalise ``text`` into a rough phonetic key.

		Non-word characters become spaces, runs of spaces are collapsed, and the
		result is lower-cased and stripped. A fixed list of regex substitutions
		is then applied in order, each one working on the previous one's output.
		"""
		text = re.sub(" {2,}", " ", re.sub(r"[^\w]", " ", text)).lower().strip()
		rules = (
			# "c" followed by anything other than i, e, y or w becomes "k".
			("c(?=[^ieyw])", "k"),
			# A trailing "c" becomes "k" as well.
			("c$", "k"),
			# "s" between two vowels becomes "z".
			("(?<=[aeiouy])s(?=[aeiouy])", "z"),
			("ph", "f"),
			# Drop an "h" preceded by any character other than "s" or "c"...
			("(?<=[^sc])h", ""),
			# ...and a leading "h" that is followed by another character.
			("^h(?=.)+", ""),
			# "y" preceded by a non-digit character becomes "i".
			("(?<=[^0-9])y", "i"),
			# Drop a non-digit character that is immediately repeated.
			("(\\D)(?=\\1)", ""),
		)
		for pattern, repl in rules:
			text = re.sub(pattern, repl, text)
		return text


	@cli
	def create_index():
		"""Create an index on the ``trigrams`` field of the disorders collection."""
		collection.create_index("trigrams")


	@cli(name="import")
	def import_():
		"""Load the disorders from ``en_product1.xml`` into MongoDB.

		Each ``Disorder`` element becomes one document holding its name, the
		trigrams of its cleaned name, and its Orpha number as ``_id``. Documents
		are written in batches of 100.
		"""
		# Dataset source: http://www.orphadata.org/data/xml/en_product1.xml
		root = etree.parse("en_product1.xml")
		docs = []
		for disorder in root.iterfind("//Disorder"):
			name = disorder.find("Name").text
			docs.append(
				{
					"name": name,
					"trigrams": trigrams(clean_text(name)),
					"_id": int(disorder.find("OrphaNumber").text),
				}
			)
			# Write out a full batch and start a new one.
			if len(docs) == 100:
				collection.insert_many(docs)
				docs = []
		# Flush whatever is left over from the last, partial batch.
		if docs:
			collection.insert_many(docs)


	@cli
	def search(text):
		"""Run a search using the complete algorithm.

		The query is cleaned and split into trigrams, matched against the stored
		documents, and the matching documents are printed along with the time
		the query took.

		Usage: python mongocomplete.py search cistic
		"""
		grams = trigrams(clean_text(text))
		pipeline = [
			# Keep only documents sharing at least one trigram with the query.
			{"$match": {"trigrams": {"$in": grams}}},
			{
				"$project": {
					"name": "$name",
					# Count of distinct trigrams present in both the document
					# and the query; $setIntersection removes duplicates.
					"found": {"$size": {"$setIntersection": ["$trigrams", grams]}},
					"length": {"$size": "$trigrams"},
				}
			},
			{
				"$project": {
					# Mean of the two coverage ratios below.
					"score": {
						"$divide": [
							{
								"$add": [
									# Shared trigrams relative to the
									# document's trigram count.
									{"$divide": ["$found", "$length"]},
									# Shared trigrams relative to the
									# query's trigram count.
									{"$divide": ["$found", len(grams)]},
								]
							},
							2,
						]
					},
					"name": "$name",
				}
			},
			{"$match": {"score": {"$gt": 0.5}}},
			{"$sort": {"score": -1}},
			{"$limit": 3},
		]
		start = time.perf_counter()
		# list() forces evaluation of the cursor so the timing covers the query.
		docs = list(collection.aggregate(pipeline))
		duration = time.perf_counter() - start
		print(f"Searching {grams}")
		for doc in docs:
			print(doc)
		print(f"Duration: {duration:.5f}")


	# Call minicli's run() only when this file is executed as a script.
	if __name__ == "__main__":
		run()