Created
January 13, 2014 09:26
-
-
Save vsdev1/8397079 to your computer and use it in GitHub Desktop.
Solr search component for language detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Solr search component for language detection. It has the following configuration parameters (are configured in solrconfig.xml): | |
* <ul> | |
* <li>list of parameters language: supported languages</li> | |
* <li>languageDetectionProbabilityThreshold: the minimum detection probability for which the language detection is considered reliable (default is 0.09)</li> | |
* </ul> | |
*/ | |
public class LanguageDetectionComponent extends SearchComponent { | |
public static final String LANGUAGE_REQUEST_PARAM_NAME = "language"; | |
private static final String LANGUAGE_PARAM_NAME_IN_SOLR_CONFIG = "language"; | |
private static final float DEFAULT_LANGUAGE_DETECTION_PROBABILITY_THRESHOLD = 0.90F; | |
private static final String LANGUAGE_DETECTION_PROBABILITY_THRESHOLD_PARAM_NAME_IN_SOLR_CONFIG = "languageDetectionProbabilityThreshold"; | |
private static final String LANGUAGE_PROFILES_ZXCONF_SUBDIR_NAME = "languagesProfiles"; | |
private static final String PRODUCTSEARCH_SOLR_ZXCONF_SUBDIR_NAME = "productsearch-solr"; | |
private static final Logger LOGGER = Logger.getLogger(LanguageDetectionComponent.class.getName()); | |
public static final String MIN_LENGTH_PARAM_IN_SOLR_CONFIG = "minLength"; | |
private Set<String> supportedLanguages = null; | |
private float languageDetectionProbabilityThreshold = DEFAULT_LANGUAGE_DETECTION_PROBABILITY_THRESHOLD; | |
private Integer minLength; | |
@Override | |
public void init(NamedList args) { | |
super.init(args); | |
SolrParams initParams = SolrParams.toSolrParams(args); | |
supportedLanguages = new HashSet<String>(Arrays.asList(initParams.getParams(LANGUAGE_PARAM_NAME_IN_SOLR_CONFIG))); | |
languageDetectionProbabilityThreshold = initParams.getFloat(LANGUAGE_DETECTION_PROBABILITY_THRESHOLD_PARAM_NAME_IN_SOLR_CONFIG); | |
minLength = initParams.getInt(MIN_LENGTH_PARAM_IN_SOLR_CONFIG); | |
// initialize the language detector | |
// language profiles can't be load from the classpath due to issue https://code.google.com/p/language-detection/issues/detail?id=9 | |
File languagesProfilesDir = getLanguagesProfilesDir(); | |
try { | |
DetectorFactory.loadProfile(languagesProfilesDir); | |
} catch (LangDetectException e) { | |
throw new IllegalStateException("Error during initialization of the language detector factory " + | |
"with the language detector profiles in " + languagesProfilesDir.getAbsolutePath() + ".", e); | |
} | |
// do further initialization according to the parameters given in solrconfig.xml | |
} | |
private File getLanguagesProfilesDir() { | |
// TODO: return the directory with the language profiles for https://code.google.com/p/language-detection/. | |
// This directory can't be put in the classpath due to https://code.google.com/p/language-detection/issues/detail?id=9. | |
} | |
@Override | |
public void prepare(ResponseBuilder rb) throws IOException { | |
SolrParams origParams = rb.req.getParams(); | |
if (origParams.get(LANGUAGE_REQUEST_PARAM_NAME) == null) { | |
final String query = origParams.get("q"); | |
// don't do language detection for short queries as it is not reliable | |
if (minLength != null && query != null && query.length() <= minLength) { | |
return; | |
} | |
// do language detection from the query string | |
try { | |
final Detector languageDetector = DetectorFactory.create(); | |
languageDetector.append(query); | |
final List<Language> probabilities = languageDetector.getProbabilities(); | |
if (!probabilities.isEmpty()) { | |
final Language mostLikelyLanguageWithProbability = probabilities.get(0); | |
double probabilityOfMostLikelyLanguage = mostLikelyLanguageWithProbability.prob; | |
final String mostLikelyLanguage = mostLikelyLanguageWithProbability.lang; | |
if (probabilityOfMostLikelyLanguage > languageDetectionProbabilityThreshold | |
&& this.supportedLanguages.contains(mostLikelyLanguage)) { | |
LOGGER.finest("language " + mostLikelyLanguage + " detected for '" | |
+ query + "' with probability " + probabilityOfMostLikelyLanguage); | |
ModifiableSolrParams modifiableSolrParams = new ModifiableSolrParams(origParams); | |
modifiableSolrParams.set(LANGUAGE_REQUEST_PARAM_NAME, mostLikelyLanguage); | |
// set the language as request parameter | |
rb.req.setParams(modifiableSolrParams); | |
} | |
} | |
} catch (LangDetectException e) { | |
LOGGER.warning("Error during language detection. No language is set. || action=\"LANG_DETECTIONS_ERROR\" query='" + query + " error=" + e.getMessage()); | |
} | |
} | |
} | |
@Override | |
public void process(ResponseBuilder rb) throws IOException { | |
// Does nothing because we only modify the request. | |
} | |
@Override | |
public String getDescription() { | |
return "Language detection search component."; | |
} | |
@Override | |
public String getSource() { | |
return LanguageDetectionComponent.class.getCanonicalName(); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Language detection component: sets the "language" request parameter from the query text when the request does not already contain one. -->
<searchComponent name="language-detector" class="com.zanox.service.productsearch.solr.LanguageDetectionComponent"> | |
<!-- Supported languages (repeatable): a detected language is only applied if it is listed here. -->
<str name="language">en</str> | |
<str name="language">de</str> | |
<str name="language">es</str> | |
<str name="language">fr</str> | |
<str name="language">it</str> | |
<!-- Queries whose length is less than or equal to this value are not analysed (detection on short text is unreliable). -->
<int name="minLength">10</int> | |
<!-- not yet supported: <int name="maxLength">200</int> --> | |
<!-- The probability of the most likely language must be greater than this value for the language to be applied. -->
<float name="languageDetectionProbabilityThreshold">0.90</float> | |
</searchComponent> | |
<!-- Example request handler that uses the language detection component. -->
<requestHandler name="/mySearchHandler" class="solr.SearchHandler"> | |
... | |
<!-- language-detector is listed first so that the "language" request parameter it sets is available to the components after it. -->
<!-- NOTE(review): query-parser-selector and search-field-setter are not defined in this snippet; presumably they read the "language" parameter - confirm. -->
<arr name="first-components"> | |
<str>language-detector</str> | |
<str>query-parser-selector</str> | |
<str>search-field-setter</str> | |
</arr> | |
</requestHandler> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment