Instantly share code, notes, and snippets.

Embed
What would you like to do?
Solr search component for language detection
/**
 * Solr search component for language detection. When a request carries no {@code language}
 * parameter, the language of the query string ({@code q}) is detected and, if the detection is
 * reliable and the language is supported, set as the {@code language} request parameter.
 *
 * <p>It has the following configuration parameters (configured in solrconfig.xml):
 * <ul>
 * <li>language (may be given several times): the supported languages</li>
 * <li>languageDetectionProbabilityThreshold: the minimum detection probability above which the
 * language detection is considered reliable (default is 0.90)</li>
 * <li>minLength (optional): queries whose length is less than or equal to this value are not
 * analysed, because detection on short queries is not reliable</li>
 * </ul>
 */
public class LanguageDetectionComponent extends SearchComponent {

    /** Name of the request parameter that carries the (given or detected) language. */
    public static final String LANGUAGE_REQUEST_PARAM_NAME = "language";

    /** Name of the solrconfig.xml parameter holding the minimum query length. */
    public static final String MIN_LENGTH_PARAM_IN_SOLR_CONFIG = "minLength";

    private static final String LANGUAGE_PARAM_NAME_IN_SOLR_CONFIG = "language";
    private static final float DEFAULT_LANGUAGE_DETECTION_PROBABILITY_THRESHOLD = 0.90F;
    private static final String LANGUAGE_DETECTION_PROBABILITY_THRESHOLD_PARAM_NAME_IN_SOLR_CONFIG =
            "languageDetectionProbabilityThreshold";

    // NOTE(review): the two directory names below are not referenced by the visible code;
    // presumably they are meant for getLanguagesProfilesDir() -- confirm before removing.
    private static final String LANGUAGE_PROFILES_ZXCONF_SUBDIR_NAME = "languagesProfiles";
    private static final String PRODUCTSEARCH_SOLR_ZXCONF_SUBDIR_NAME = "productsearch-solr";

    private static final Logger LOGGER = Logger.getLogger(LanguageDetectionComponent.class.getName());

    /** Languages that may be set on the request; filled in {@link #init(NamedList)}. */
    private Set<String> supportedLanguages = new HashSet<String>();

    /** A detection is only accepted when its probability is strictly greater than this value. */
    private float languageDetectionProbabilityThreshold = DEFAULT_LANGUAGE_DETECTION_PROBABILITY_THRESHOLD;

    /** Optional minimum query length; {@code null} when not configured (no length check). */
    private Integer minLength;

    /**
     * Reads the component configuration and loads the language detector profiles.
     *
     * @param args the configuration of this component as given in solrconfig.xml
     * @throws IllegalStateException if the language profiles cannot be loaded
     */
    @Override
    public void init(NamedList args) {
        super.init(args);
        SolrParams initParams = SolrParams.toSolrParams(args);

        // getParams returns null when no "language" entry is configured; guard against it so a
        // missing entry does not surface as a NullPointerException.
        String[] configuredLanguages = initParams.getParams(LANGUAGE_PARAM_NAME_IN_SOLR_CONFIG);
        supportedLanguages = new HashSet<String>();
        if (configuredLanguages != null) {
            supportedLanguages.addAll(Arrays.asList(configuredLanguages));
        }
        if (supportedLanguages.isEmpty()) {
            LOGGER.warning("No '" + LANGUAGE_PARAM_NAME_IN_SOLR_CONFIG
                    + "' entries configured; no detected language will ever be set on requests.");
        }

        // Use the two-argument overload so the documented default applies when the parameter is
        // absent (the one-argument overload returns null, which fails on unboxing).
        languageDetectionProbabilityThreshold = initParams.getFloat(
                LANGUAGE_DETECTION_PROBABILITY_THRESHOLD_PARAM_NAME_IN_SOLR_CONFIG,
                DEFAULT_LANGUAGE_DETECTION_PROBABILITY_THRESHOLD);
        minLength = initParams.getInt(MIN_LENGTH_PARAM_IN_SOLR_CONFIG);

        // initialize the language detector
        // language profiles can't be loaded from the classpath due to issue
        // https://code.google.com/p/language-detection/issues/detail?id=9
        // NOTE(review): init() may run more than once per JVM (e.g. several cores or a core
        // reload); verify that DetectorFactory.loadProfile tolerates repeated calls.
        File languagesProfilesDir = getLanguagesProfilesDir();
        try {
            DetectorFactory.loadProfile(languagesProfilesDir);
        } catch (LangDetectException e) {
            throw new IllegalStateException("Error during initialization of the language detector factory " +
                    "with the language detector profiles in " + languagesProfilesDir.getAbsolutePath() + ".", e);
        }
        // do further initialization according to the parameters given in solrconfig.xml
    }

    /**
     * Returns the directory that contains the language profiles.
     *
     * @return the language profiles directory
     * @throws UnsupportedOperationException always, until the deployment-specific lookup is implemented
     */
    private File getLanguagesProfilesDir() {
        // TODO: return the directory with the language profiles for https://code.google.com/p/language-detection/.
        // This directory can't be put in the classpath due to https://code.google.com/p/language-detection/issues/detail?id=9.
        throw new UnsupportedOperationException(
                "getLanguagesProfilesDir() is not implemented: return the directory with the language profiles.");
    }

    /**
     * Detects the language of the query and sets it as request parameter
     * {@link #LANGUAGE_REQUEST_PARAM_NAME}. Nothing is changed when the request already has a
     * language, has no query, the query is too short, the detection is not reliable enough, or
     * the detected language is not supported.
     *
     * @param rb the response builder giving access to the current request
     * @throws IOException declared by the overridden method; not thrown by this implementation itself
     */
    @Override
    public void prepare(ResponseBuilder rb) throws IOException {
        SolrParams origParams = rb.req.getParams();
        if (origParams.get(LANGUAGE_REQUEST_PARAM_NAME) != null) {
            // the caller chose a language explicitly; never override it
            return;
        }

        final String query = origParams.get("q");
        if (query == null) {
            // nothing to analyse; appending null to the detector would fail
            return;
        }
        // don't do language detection for short queries as it is not reliable
        if (minLength != null && query.length() <= minLength) {
            return;
        }

        // do language detection from the query string
        try {
            final Detector languageDetector = DetectorFactory.create();
            languageDetector.append(query);
            final List<Language> probabilities = languageDetector.getProbabilities();
            if (probabilities.isEmpty()) {
                return;
            }

            // only the first entry of the returned list is considered
            final Language mostLikelyLanguageWithProbability = probabilities.get(0);
            final double probabilityOfMostLikelyLanguage = mostLikelyLanguageWithProbability.prob;
            final String mostLikelyLanguage = mostLikelyLanguageWithProbability.lang;

            if (probabilityOfMostLikelyLanguage > languageDetectionProbabilityThreshold
                    && supportedLanguages.contains(mostLikelyLanguage)) {
                LOGGER.finest("language " + mostLikelyLanguage + " detected for '"
                        + query + "' with probability " + probabilityOfMostLikelyLanguage);
                // the request parameters are replaced by a modifiable copy that carries the language
                ModifiableSolrParams modifiableSolrParams = new ModifiableSolrParams(origParams);
                modifiableSolrParams.set(LANGUAGE_REQUEST_PARAM_NAME, mostLikelyLanguage);
                rb.req.setParams(modifiableSolrParams);
            }
        } catch (LangDetectException e) {
            // best effort: a failed detection must not fail the search, the request simply stays without language
            LOGGER.warning("Error during language detection. No language is set. || action=\"LANG_DETECTIONS_ERROR\" query='"
                    + query + "' error=" + e.getMessage());
        }
    }

    /**
     * Does nothing because this component only modifies the request in {@link #prepare(ResponseBuilder)}.
     */
    @Override
    public void process(ResponseBuilder rb) throws IOException {
        // Does nothing because we only modify the request.
    }

    /** Returns a short human-readable description of this component. */
    @Override
    public String getDescription() {
        return "Language detection search component.";
    }

    /** Returns the canonical class name of this component as its source. */
    @Override
    public String getSource() {
        return LanguageDetectionComponent.class.getCanonicalName();
    }
}
<!-- Registers LanguageDetectionComponent under the name "language-detector". -->
<searchComponent name="language-detector" class="com.zanox.service.productsearch.solr.LanguageDetectionComponent">
<!-- Supported languages: a detected language is only set on the request when it is listed here. -->
<str name="language">en</str>
<str name="language">de</str>
<str name="language">es</str>
<str name="language">fr</str>
<str name="language">it</str>
<!-- Queries whose length is less than or equal to this value are not analysed. -->
<int name="minLength">10</int>
<!-- not yet supported: <int name="maxLength">200</int> -->
<!-- A detection is only accepted when its probability is strictly greater than this value. -->
<float name="languageDetectionProbabilityThreshold">0.90</float>
</searchComponent>
<!-- Example search handler that lists the language-detector component in its first-components. -->
<requestHandler name="/mySearchHandler" class="solr.SearchHandler">
...
<!-- language-detector is listed ahead of query-parser-selector and search-field-setter;
     presumably those components read the "language" request parameter it sets (confirm). -->
<arr name="first-components">
<str>language-detector</str>
<str>query-parser-selector</str>
<str>search-field-setter</str>
</arr>
</requestHandler>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment