Skip to content

Instantly share code, notes, and snippets.

@markharwood
Created April 3, 2014 09:17
Show Gist options
  • Save markharwood/9951177 to your computer and use it in GitHub Desktop.
Save markharwood/9951177 to your computer and use it in GitHub Desktop.
Precision/Recall measures for a query using aggs
//================================
// Here is a script for gathering the precision/recall stats for a query (see http://en.wikipedia.org/wiki/Precision_and_recall).
// A candidate classifier query's effectiveness is determined by counting its hits on pre-classified content.
// If we compute the F-measure, we can potentially use it as the fitness function for a genetic algorithm that mutates our query
// (introducing phrases, minShouldMatch clauses, etc.) to move us towards our goal of balancing precision and recall in our classifier.
//=================================
// Our candidate query for classifying documents in a category.
var candidateQuery = {
  "terms": {
    "body": ["vs", "shr", "cts", "net", "revs", "note", "loss", "mths", "shrs", "avg", "profit"]
  }
};

// Our filter criteria for identifying documents in our target category.
var categoryTest = { "term": { "topics": "earn" } };

// The complement of categoryTest: documents that are NOT in the target category.
// "must_not" is the documented bool clause name; the camelCase "mustNot" spelling
// used previously is not documented (NOTE(review): understood to be rejected by
// later Elasticsearch releases - confirm against the target version).
var nonCategoryTest = { "bool": { "must_not": categoryTest } };

// The name of our category field (used for summarising false positives).
var categoryField = "topics";

// The search request. Top-level aggregations are evaluated over the documents matched by
// candidateQuery; the "globals" aggregation is a "global" aggregation, which per the
// Elasticsearch documentation is evaluated over all documents regardless of the query.
//
//   true positives  = truePositive
//   false positives = falsePositive
//   false negatives = globals.requiredHits   - truePositive
//   true negatives  = globals.requiredMisses - falsePositive
var queryJson = {
  "query": candidateQuery,
  "aggs": {
    "globals": {
      "global": {},
      "aggs": {
        // The count of docs in this bucket minus that of the "truePositive" bucket
        // gives our false negative figure.
        "requiredHits": {
          "filter": categoryTest
        },
        // The count of docs in this bucket minus that of the "falsePositive" bucket
        // gives our true negative figure.
        "requiredMisses": {
          "filter": nonCategoryTest
        }
      }
    },
    "truePositive": {
      "filter": categoryTest,
      "aggs": {
        // For documents with >1 classification this will summarise any "also-connected" categories.
        "relatedCategories": {
          "terms": { "field": categoryField }
        }
      }
    },
    "falsePositive": {
      "filter": nonCategoryTest,
      "aggs": {
        // This will list all of the category values we got a false positive on -
        // identifies categories that are "close".
        "fpCollateral": {
          "terms": { "field": categoryField }
        }
      }
    }
  }
};
// Hand the assembled search request back to whatever runs this script.
// NOTE(review): a bare top-level `return` is only legal inside a function body, so this
// presumably relies on the host wrapping the script in a function - confirm.
return queryJson;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment