Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Slight tweak of the Solr SpellCheckComponent to make its output quieter when you only care about the collations
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of the quiet spell-check component artifact. -->
  <groupId>org.healthonnet.solr</groupId>
  <artifactId>quiet-spellcheck-component</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- Pin the source encoding so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Solr and Lucene are all pinned to the same release (3.6.2). -->
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>3.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>3.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-core</artifactId>
      <version>3.6.2</version>
    </dependency>
    <!-- Test-only dependency. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compile for Java 1.5 source and bytecode level. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.0</version>
        <configuration>
          <source>1.5</source>
          <target>1.5</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
package org.healthonnet.solr;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SpellCheckComponent;
import org.apache.solr.spelling.SolrSpellChecker;
import org.apache.solr.spelling.SpellingOptions;
import org.apache.solr.spelling.SpellingResult;
/**
* Wrapper around the SpellCheckComponent that gives a more quiet output. It
* only gives collations; does not give any suggestions. This is useful when you
* set spellcheck.count to a large number, but you really only want to check the
* collations. It avoids writing a ton of output and therefore cuts down on I/O.
*
* @author nolan
*
*/
public class QuietSpellCheckComponent extends SpellCheckComponent {
@Override
@SuppressWarnings("unchecked")
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
return;
}
boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
String q = params.get(SPELLCHECK_Q);
SolrSpellChecker spellChecker = getSpellChecker(params);
Collection<Token> tokens = null;
if (q != null) {
// we have a spell check param, tokenize it with the query analyzer
// applicable for this spellchecker
tokens = getTokens(q, spellChecker.getQueryAnalyzer());
} else {
q = rb.getQueryString();
if (q == null) {
q = params.get(CommonParams.Q);
}
tokens = queryConverter.convert(q);
}
if (tokens != null && tokens.isEmpty() == false) {
if (spellChecker != null) {
int count = params.getInt(SPELLCHECK_COUNT, 1);
boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR, DEFAULT_ONLY_MORE_POPULAR);
boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
NamedList response = new SimpleOrderedMap();
IndexReader reader = rb.req.getSearcher().getReader();
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
SpellingOptions options = new SpellingOptions(tokens, reader,
count, onlyMorePopular, extendedResults, accuracy,
customParams);
SpellingResult spellingResult = spellChecker.getSuggestions(options);
if (spellingResult != null) {
NamedList suggestions = toNamedList(shardRequest,
spellingResult, q, extendedResults, collate);
if (collate) {
addCollationsToResponse(params, spellingResult, rb, q, suggestions);
}
// clear the spellingResult after making collations from it.
// This is the only change I'm making compared to
// SpellCheckComponent!
for (int i = suggestions.size() - 1; i >= 0; i--) {
if ("collation".equals(suggestions.getName(i))) {
Object val = suggestions.getVal(i);
if (val instanceof NamedList
&& ((NamedList) val).size() > 0
&& "collationQuery".equals(((NamedList) val).getName(0))) {
continue;
}
}
// otherwise it's a non-collation; delete it
suggestions.remove(i);
}
response.add("suggestions", suggestions);
rb.rsp.add("spellcheck", response);
}
} else {
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
"Specified dictionary does not exist.");
}
}
}
private Collection<Token> getTokens(String q, Analyzer analyzer)
throws IOException {
Collection<Token> result = new ArrayList<Token>();
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
ts.reset();
// TODO: support custom attributes
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = ts
.addAttribute(PositionIncrementAttribute.class);
while (ts.incrementToken()) {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
token.setType(typeAtt.type());
token.setFlags(flagsAtt.getFlags());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
ts.end();
ts.close();
return result;
}
private String getDictionaryName(SolrParams params) {
String dictName = params.get(SPELLCHECK_DICT);
if (dictName == null) {
dictName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME;
}
return dictName;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment