Created
June 16, 2011 18:00
-
-
Save jrochkind/1029828 to your computer and use it in GitHub Desktop.
definition of Solr 'text' field, with stemming, we use in our Blacklight app
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- This is for a Solr 1.4 installation; when we upgrade to 3.1 this will change substantially, using the built-in ICU filters instead of the custom unicode filters here currently, and probably using new improved tokenization and stemming filters. -->
<!-- I don't recall why we've split the filters for indexing vs. query or if we really needed to do that. -->
<!--
  Stemmed general-purpose "text" field type.

  Two analyzers are declared, one applied at index time and one at query
  time. Both run the same chain, in this order:
    1. whitespace tokenization
    2. custom Unicode normalization (schema.UnicodeNormalizationFilterFactory)
    3. custom CJK filter (schema.CJKFilterFactory, bigrams disabled)
    4. word-delimiter splitting
    5. lowercasing
    6. English Snowball stemming (terms listed in protwords.txt are protected)
    7. duplicate-token removal

  The only difference between the active index and query chains is in the
  WordDelimiterFilterFactory settings: the index analyzer sets
  catenateWords="1" and catenateNumbers="1", the query analyzer sets both
  to "0".
  NOTE(review): presumably this is so joined forms are indexed alongside
  the split parts while queries only produce the parts; confirm before
  merging the two analyzers into one.

  The "schema.*" filter classes are not part of stock Solr; they must be
  available on Solr's classpath for this field type to load.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- bob's umich unicode normalization filter -->
    <filter class="schema.UnicodeNormalizationFilterFactory"
            version="icu4j"
            composed="false"
            remove_diacritics="true"
            remove_modifiers="true"
            fold="true"/>

    <!-- this comes from bob/UVA's code, tokenizes CJK data into simply
         one char per token. Does it work? Unclear. Would really like
         to see the source. This code lives in UnicodeNormalizeFilter.jar
         I think. -->
    <filter class="schema.CJKFilterFactory" bigrams="false"/>

    <!-- stop-word removal is currently disabled:
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> -->

    <!-- index side: emit the split parts AND the catenated word/number forms -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>

    <!-- lowercasing is placed after the word-delimiter filter, which has
         splitOnCaseChange enabled and therefore needs to see original case -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- bob's umich unicode normalization filter (same settings as at index time) -->
    <filter class="schema.UnicodeNormalizationFilterFactory"
            version="icu4j"
            composed="false"
            remove_diacritics="true"
            remove_modifiers="true"
            fold="true"/>

    <!-- this comes from bob/UVA's code, tokenizes CJK data into simply
         one char per token. Does it work? Unclear. Would really like
         to see the source. This code lives in UnicodeNormalizeFilter.jar
         I think. -->
    <filter class="schema.CJKFilterFactory" bigrams="false"/>

    <!-- not currently using synonyms
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> -->

    <!-- stop-word removal is currently disabled:
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> -->

    <!-- query side: emit only the split parts, no catenated forms -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>

    <!-- lowercasing is placed after the word-delimiter filter, which has
         splitOnCaseChange enabled and therefore needs to see original case -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

</fieldType>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment