Skip to content

Instantly share code, notes, and snippets.

@markharwood
Last active September 5, 2019 05:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markharwood/b769aca14890414799f76820cf364a4f to your computer and use it in GitHub Desktop.
Save markharwood/b769aca14890414799f76820cf364a4f to your computer and use it in GitHub Desktop.
Find brand names in a product catalogue that potentially have alternative meanings.
from elasticsearch.client import Elasticsearch
import json
# Elasticsearch client built with the library's default connection settings.
es = Elasticsearch()

# ==== Configure your data's names here:
# Field used for the brand terms/cardinality aggregations and the exact
# `term` match on a product's brand.
structuredFieldnameForBrand = "brand.keyword"
# Index that is searched for products.
indexNameForProducts = "myproducts"
# Free-text field that is phrase-matched against each brand name.
unstructuredFieldNamePossiblyMentioningBrands = "name"

# Load existing ruleset - start with a blank json file with just {} in it
rulesFile = "/my_rules/categorysnaps.json"
# A brand only gets a new rule when its uniqueness ratio is >= this value ...
minBrandUniqueness = 0.98
# ... and strictly more than this many products are tagged with it.
minNumProductsTaggedWithBrand = 10

# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(rulesFile) as f:
    filterRulesByField = json.load(f)
# Aggregation-only request ("size": 0 returns no hits): counts the distinct
# brand values and lists up to 5000 of the most frequent ones.
topBrands = {
    "size": 0,
    "aggs": {
        "numUniqueBrands": {
            "cardinality": {"field": structuredFieldnameForBrand},
        },
        "topBrands": {
            "terms": {"field": structuredFieldnameForBrand, "size": 5000},
        },
    },
}
# Rules already recorded for the brand field. When the ruleset has no entry
# for this field yet, an empty dict is registered in it, so rules added to
# `existingRules` later are part of `filterRulesByField` as well.
existingRules = filterRulesByField.setdefault(structuredFieldnameForBrand, {})
# Fetch the most frequent brand values; each one is scored below as a
# candidate rule.
results = es.search(index=indexNameForProducts, body=topBrands)
brandResults = results["aggregations"]["topBrands"]["buckets"]

# NOTE: every print below passes ONE pre-formatted string in parentheses so
# the script runs on both Python 2 and Python 3. The format strings reproduce
# the spacing the original Python 2 print statements emitted.
for bucket in brandResults:
    brandName = bucket["key"]

    # Phrase match of the brand name in the free-text field. Used both as a
    # query clause and, negated, in the "structured_only" filter.
    brandMentionedInText = {
        "match_phrase": {
            unstructuredFieldNamePossiblyMentioningBrands: {
                "query": brandName
            }
        }
    }

    # Products that either mention the brand name in text or are tagged with
    # it, grouped by the brand they are actually tagged with.
    q = {
        "size": 0,
        "query": {
            "bool": {
                "should": [
                    brandMentionedInText,
                    {
                        "term": {
                            structuredFieldnameForBrand: brandName
                        }
                    }
                ]
            }
        },
        "aggs": {
            "brands": {
                "terms": {
                    "field": structuredFieldnameForBrand
                },
                "aggregations": {
                    # One example product per brand bucket, for the report.
                    "product_names": {
                        "top_hits": {
                            "size": 1,
                            "_source": unstructuredFieldNamePossiblyMentioningBrands
                        }
                    },
                    # Products in the bucket whose text does NOT contain the
                    # brand name.
                    "structured_only": {
                        "filter": {
                            "bool": {
                                "must_not": [brandMentionedInText]
                            }
                        }
                    }
                }
            }
        }
    }
    results = es.search(index=indexNameForProducts, body=q)
    brands = results["aggregations"]["brands"]["buckets"]

    # Guard: with no buckets the code below would fail on brands[0], and
    # there is nothing to score anyway.
    if not brands:
        print("no products found for brand: %s" % (brandName,))
        continue

    numProductsTaggedWithBrand = 0
    numOtherBrandedProductsMentioningBrand = 0
    queryExpansions = 0
    if len(brands) > 1:
        # The name also matches products tagged with other brands.
        print("ambiguous brand name: %s" % (brandName,))
        for brand in brands:
            print("\t%s ( %s products)" % (brand["key"], brand["doc_count"]))
            if brand["key"] == brandName:
                numProductsTaggedWithBrand = brand["doc_count"]
                queryExpansions = brand["structured_only"]["doc_count"]
            else:
                numOtherBrandedProductsMentioningBrand += brand["doc_count"]
            # NOTE(review): original indentation was lost; the sample product
            # is assumed to be printed for every bucket, not only for the
            # "other brand" buckets - confirm.
            for product in brand["product_names"]["hits"]["hits"]:
                print("\t\t%s" % (
                    product["_source"][unstructuredFieldNamePossiblyMentioningBrands],
                ))
    else:
        print("unambiguous brand name: %s ( %s products)" % (
            brandName, brands[0]["doc_count"],
        ))
        numProductsTaggedWithBrand += brands[0]["doc_count"]
        queryExpansions = brands[0]["structured_only"]["doc_count"]
        for product in brands[0]["product_names"]["hits"]["hits"]:
            print("\t\t%s" % (
                product["_source"][unstructuredFieldNamePossiblyMentioningBrands],
            ))

    # Share of matching products that really carry this brand tag.
    # float() keeps this a true division on Python 2 as well.
    brandnameUniqueness = float(numProductsTaggedWithBrand) / float(
        numProductsTaggedWithBrand + numOtherBrandedProductsMentioningBrand
    )
    if (brandnameUniqueness >= minBrandUniqueness
            and numProductsTaggedWithBrand > minNumProductsTaggedWithBrand):
        # add new rule if not already there
        if brandName not in existingRules:
            existingRules[brandName] = {
                "patterns": [brandName.lower()]
            }
    elif brandName in existingRules:
        # A rule exists but the brand no longer meets the thresholds; the
        # rule is left in place and only reported.
        print("!!!!Warning!!! existing rule for [ %s ] now scoring only  %s  uniqueness" % (
            brandName, brandnameUniqueness,
        ))
    # NOTE(review): assumed to run once per brand at loop level (indentation
    # was lost in the original) - confirm.
    print(
        "\t===== brandnameUniqueness= %s"
        " Facet snapping would score true positives= %s"
        " max num false negatives= %s"
        " query expansions from adding structured brandname filter =  %s" % (
            brandnameUniqueness,
            numProductsTaggedWithBrand,
            numOtherBrandedProductsMentioningBrand,
            queryExpansions,
        )
    )
# Persist the (possibly extended) ruleset back to the file it was loaded from.
# Text mode is required: json.dumps returns str, and writing str to a file
# opened with 'wb' raises TypeError on Python 3.
with open(rulesFile, 'w') as f:
    f.write(json.dumps(filterRulesByField, indent=2) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment