Skip to content

Instantly share code, notes, and snippets.

@markharwood
Created January 29, 2014 14:29
Show Gist options
  • Save markharwood/8689136 to your computer and use it in GitHub Desktop.
Save markharwood/8689136 to your computer and use it in GitHub Desktop.
/**
 * Groups a related set of terms, typically from the results of some auto-expansion,
 * and provides a shared ("balanced") docFreq for the set in order to avoid Lucene's
 * IDF ranking favouring the rarest interpretation, which is often a poor choice for
 * auto-expanded terms, e.g. the terms produced by a fuzzy query or by trying
 * alternative fields.
 *
 * <p>The balanced values are computed lazily on the first call to
 * {@link #getBalancedDf(IndexReader, Term)} and cached. The cache is discarded
 * whenever a new term is added, but it is NOT keyed by reader: once computed, the
 * values obtained from the first reader are reused for every later call.
 */
class CommonIDFContext {
    // Average docFreq over the terms with a non-zero docFreq; -1 until computed.
    int commonDf = -1;
    // Lazily built term -> balanced docFreq cache; null means "not computed yet".
    Map<Term, Integer> balancedDfs;
    // The related terms whose docFreqs are balanced against each other.
    List<Term> commonTerms = new ArrayList<Term>();

    /**
     * Adds a term to the set of related terms.
     *
     * <p>Any previously computed balanced docFreqs are discarded, because the
     * average and the popularity ranking both depend on the full set of terms.
     *
     * @param unbalancedQueryTerm the term to include in the related set
     */
    public void add(Term unbalancedQueryTerm) {
        commonTerms.add(unbalancedQueryTerm);
        // Invalidate the cache so the next lookup takes the new term into account.
        balancedDfs = null;
        commonDf = -1;
    }

    /**
     * Gets a "balanced" docFreq for a term by using the averaged docFreq across the
     * related terms. Rather than a completely level playing field returning the
     * average for all related terms, a small bias is added for the more popular
     * terms. Lucene naturally favours the rarest terms, so this "popularity" bias is
     * (somewhat perversely) manifested as smaller docFreqs. This allows a balanced
     * docFreq that plays nicely with other sets of terms but, all other things being
     * equal, still ranks the terms in this set correctly in relation to each other
     * (i.e. the most likely interpretation comes first).
     *
     * @param reader the reader used to look up the raw docFreq of each related term
     *               (only consulted when the cache needs to be built)
     * @param queryTerm the term whose balanced docFreq is wanted
     * @return the balanced docFreq, or 0 if the term was never added or has a raw
     *         docFreq of 0
     * @throws IOException if the reader fails while looking up a docFreq; in that
     *                     case nothing is cached and the next call retries
     */
    public int getBalancedDf(IndexReader reader, Term queryTerm) throws IOException {
        if (balancedDfs == null) {
            // Publish the cache only once it has been fully built, so an
            // IOException part-way through cannot leave an empty map behind.
            balancedDfs = computeBalancedDfs(reader);
        }
        Integer result = balancedDfs.get(queryTerm);
        if (result == null) {
            return 0;
        }
        return result;
    }

    // Computes the balanced docFreq of every related term with a non-zero raw docFreq
    // and records the plain average in commonDf.
    private Map<Term, Integer> computeBalancedDfs(IndexReader reader) throws IOException {
        long totalDocFreq = 0;
        int totalNumNonZeroDfTerms = 0;
        int[] dfs = new int[commonTerms.size()];
        for (int i = 0; i < dfs.length; i++) {
            dfs[i] = reader.docFreq(commonTerms.get(i));
            if (dfs[i] > 0) {
                totalDocFreq += dfs[i];
                totalNumNonZeroDfTerms++;
            }
        }

        // Average the DF only over the terms that actually occur.
        if (totalNumNonZeroDfTerms == 0) {
            commonDf = 0;
        } else {
            commonDf = (int) (totalDocFreq / totalNumNonZeroDfTerms);
        }

        // Rather than share a single DF for all terms, introduce a little bias
        // towards the more popular interpretation: start from the average and add 1
        // for every other term that is strictly more popular.
        Map<Term, Integer> balanced = new HashMap<Term, Integer>();
        for (int i = 0; i < dfs.length; i++) {
            if (dfs[i] == 0) {
                // Terms that do not occur get no entry (lookups report 0 for them).
                continue;
            }
            int numOfOtherTermsMorePopular = 0;
            for (int j = 0; j < dfs.length; j++) {
                // dfs[j] > dfs[i] > 0 already implies dfs[j] is non-zero.
                if (dfs[j] > dfs[i]) {
                    numOfOtherTermsMorePopular++;
                }
            }
            // Adjust from the average DF - adding docs effectively penalizes a term
            // for which there are more popular alternatives.
            balanced.put(commonTerms.get(i), commonDf + numOfOtherTermsMorePopular);
        }
        return balanced;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment