# markharwood/AmbiguousBrandNameFinder.py Secret

## AmbiguousBrandNameFinder.py
from elasticsearch.client import Elasticsearch
import json

# Client used for every search below; created with the library's defaults.
es = Elasticsearch()


# ==== Configure your data's names here:
# Structured brand field: target of the term query and terms aggregations.
structuredFieldnameForBrand = "brand.keyword"
# Index that holds the product documents.
indexNameForProducts = "myproducts"
# Free-text field that is phrase-searched for brand names.
unstructuredFieldNamePossiblyMentioningBrands = "name"
# Load existing ruleset - start with a blank json file with just {} in it
rulesFile = "/my_rules/categorysnaps.json"
# A brand only gets a rule when at least this fraction of the products
# matching its name are actually tagged with it ...
minBrandUniqueness=0.98
# ... and when strictly more than this many products carry the brand tag.
minNumProductsTaggedWithBrand=10

# Read the ruleset inside a context manager so the file handle is closed
# promptly instead of being left for the garbage collector.
with open(rulesFile) as rulesHandle:
    filterRulesByField = json.load(rulesHandle)


# Request body for the initial brand survey. No hits are returned ("size": 0);
# only two aggregations over the structured brand field are computed:
#   numUniqueBrands - cardinality (distinct-value count) of the field
#   topBrands       - terms buckets, at most 5000 of them
topBrands = {
    "size": 0,
    "aggs": {
        "numUniqueBrands": {
            "cardinality": {"field": structuredFieldnameForBrand}
        },
        "topBrands": {
            "terms": {"field": structuredFieldnameForBrand, "size": 5000}
        },
    },
}
# Rules are grouped by structured field name. Reuse the group already stored
# for the brand field, or register a fresh empty one, so that entries added
# to existingRules later are part of filterRulesByField.
existingRules = filterRulesByField.setdefault(structuredFieldnameForBrand, {})


def _buildBrandQuery(brandName):
    """Return the search body used to measure how ambiguous `brandName` is.

    The query matches products that mention `brandName` as a phrase in the
    unstructured field or that are tagged with it in the structured brand
    field. Matches are bucketed by structured brand, and every bucket carries:

    * ``product_names``   - one sample hit, restricted to the unstructured field
    * ``structured_only`` - count of the bucket's products that do NOT mention
      the brand name in the unstructured field
    """
    # Used twice: as a query clause and, negated, in the structured_only filter.
    mentionsBrand = {
        "match_phrase": {
            unstructuredFieldNamePossiblyMentioningBrands: {
                "query": brandName
            }
        }
    }
    return {
        "size": 0,
        "query": {
            "bool": {
                "should": [
                    mentionsBrand,
                    {"term": {structuredFieldnameForBrand: brandName}},
                ]
            }
        },
        "aggs": {
            "brands": {
                "terms": {"field": structuredFieldnameForBrand},
                "aggregations": {
                    "product_names": {
                        "top_hits": {
                            "size": 1,
                            "_source": unstructuredFieldNamePossiblyMentioningBrands,
                        }
                    },
                    "structured_only": {
                        "filter": {"bool": {"must_not": [mentionsBrand]}}
                    },
                },
            }
        },
    }


# Fetch the most common structured brand values, then probe each one in turn.
# Each print() call below takes a single pre-formatted string, so the output
# is the same whether print is a statement (Python 2) or a function.
results = es.search(index=indexNameForProducts, body=topBrands)
brandResults = results["aggregations"]["topBrands"]["buckets"]
for bucket in brandResults:
    brandName = bucket["key"]
    results = es.search(index=indexNameForProducts, body=_buildBrandQuery(brandName))
    brandsAgg = results["aggregations"]["brands"]
    brands = brandsAgg["buckets"]
    if not brands:
        # Nothing matched, so there is no bucket to report on and the
        # uniqueness ratio would be 0/0. Skip rather than crash.
        print("no products found for brand name: %s" % (brandName,))
        continue

    numProductsTaggedWithBrand = 0
    # The terms aggregation only returns its top buckets. Products under the
    # brands it cut off are reported as sum_other_doc_count; they matched the
    # query too, so they count as other-branded products mentioning the name.
    numOtherBrandedProductsMentioningBrand = brandsAgg.get("sum_other_doc_count", 0)
    queryExpansions = 0
    if len(brands) > 1:
        print("ambiguous brand name: %s" % (brandName,))
        for brand in brands:
            print("\t%s ( %s products)" % (brand["key"], brand["doc_count"]))
            if brand["key"] == brandName:
                numProductsTaggedWithBrand = brand["doc_count"]
                queryExpansions = brand["structured_only"]["doc_count"]
            else:
                numOtherBrandedProductsMentioningBrand += brand["doc_count"]
            for product in brand["product_names"]["hits"]["hits"]:
                print("\t\t%s" % (product["_source"][unstructuredFieldNamePossiblyMentioningBrands],))
    else:
        # A single bucket: every matching product carries the same brand tag.
        onlyBrand = brands[0]
        print("unambiguous brand name: %s ( %s products)" % (brandName, onlyBrand["doc_count"]))
        numProductsTaggedWithBrand += onlyBrand["doc_count"]
        queryExpansions = onlyBrand["structured_only"]["doc_count"]

        for product in onlyBrand["product_names"]["hits"]["hits"]:
            print("\t\t%s" % (product["_source"][unstructuredFieldNamePossiblyMentioningBrands],))

    # Share of the matching products that really are tagged with this brand.
    # The denominator is non-zero because at least one bucket exists here.
    brandnameUniqueness = float(numProductsTaggedWithBrand) / float(
        numProductsTaggedWithBrand + numOtherBrandedProductsMentioningBrand)

    if brandnameUniqueness >= minBrandUniqueness and numProductsTaggedWithBrand > minNumProductsTaggedWithBrand:
        # add new rule if not already there
        if brandName not in existingRules:
            existingRules[brandName] = {
                "patterns": [brandName.lower()]
            }
    elif brandName in existingRules:
        # The rule is kept; this only flags that it no longer meets the bar.
        print("!!!!Warning!!! existing rule for [ %s ] now scoring only  %s  uniqueness"
              % (brandName, brandnameUniqueness))

    print("\t===== brandnameUniqueness= %s Facet snapping would score true positives= %s"
          " max num false negatives= %s"
          " query expansions from adding structured brandname filter =  %s"
          % (brandnameUniqueness, numProductsTaggedWithBrand,
             numOtherBrandedProductsMentioningBrand, queryExpansions))


# Persist the ruleset (existing rules plus any added by this run) back to the
# rules file. The file is opened in binary mode, so the JSON text is encoded
# explicitly; this writes the same bytes on Python 2 and Python 3.
with open(rulesFile, 'wb') as f:
    f.write((json.dumps(filterRulesByField, indent=2) + '\n').encode('utf-8'))