Last active
August 4, 2019 18:44
-
-
Save bsod90/8c272ee9c8f14cf99b420cabcc7ef53a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Connect to the semantic-search service and run the query
con = get_victor_connection()
response = con.FindTopKProblems(
    victor_pb2.TopKProblemsRequest(
        query=search_query,
        # In our implementation -1 means "return all matches"
        k=-1,
    ),
)

# Because we're using Cosine Similarity to find the closest vectors,
# the resulting distance will always be in the range from -1 to 1.
# This allows us to easily define a confidence threshold and
# consider anything above this threshold to be a
# "High Confidence Result"
CONFIDENCE_THRESHOLD = 0.65

# Custom scoring functions we're going to pass to ElasticSearch.
# More information here: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
scoring_functions = [
    {
        # One function per match, applied only to that match's document
        "filter": {"term": {"document_id": match.id}},
        # This function will re-shape elastic search results in a way
        # that those that scored well in similarity search will move higher.
        # For a distance in [-1, 1] the weight is document_count ** distance,
        # i.e. between 1 / document_count and document_count.
        "weight": math.pow(document_count, match.distance),
    }
    for match in response.matches
]

# Ids of the matches whose similarity is strictly above the threshold
high_confidence_docs = [
    match.id
    for match in response.matches
    if match.distance > CONFIDENCE_THRESHOLD
]

elastic_query = {
    "bool": {
        "should": [
            {
                "match": {
                    "title": search_query,
                },
            },
            #
            # You can include any extra fields that you want to
            # query for your documents here
            # ...
            #
            #
            # The "terms" part of the query would ensure that anything that
            # has been found by Victor will always be returned by ElasticSearch
            {
                "terms": {
                    "document_id": high_confidence_docs,
                    # You can add extra boost to high confidence results
                    "boost": 10,
                },
            },
        ],
    },
}

params = {
    "query": {
        "function_score": {
            "query": elastic_query,
            "functions": scoring_functions,
        },
    },
}

# We use https://github.com/elastic/elasticsearch-dsl-py
# It's a python wrapper around the native ES DSL
from elasticsearch_dsl import Search
from our_project.search import Document

search = Search.from_dict(params)
search = search.index(Document.get_index_name())
# Keep the response instead of discarding it so the re-ranked hits can be
# read afterwards.
results = search.execute()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment