Skip to content

Instantly share code, notes, and snippets.

@jrochkind
Created June 16, 2011 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jrochkind/1029828 to your computer and use it in GitHub Desktop.
Save jrochkind/1029828 to your computer and use it in GitHub Desktop.
definition of Solr 'text' field, with stemming, we use in our Blacklight app
<!-- This is for a Solr 1.4 installation; when we upgrade to 3.1 this will change substantially, using the built-in ICU filters instead of the custom unicode filters here currently, and probably using new improved tokenization and stemming filters. -->
<!-- I don't recall why we've split the filters for indexing vs. query or if we really needed to do that. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">

  <!-- Analysis chain applied when documents are indexed. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Custom Unicode normalization filter (Bob's, from umich). -->
    <filter class="schema.UnicodeNormalizationFilterFactory"
            version="icu4j"
            composed="false"
            remove_diacritics="true"
            remove_modifiers="true"
            fold="true"/>

    <!-- Custom filter from Bob/UVA's code; it is supposed to tokenize CJK
         data into one character per token. Whether it actually works is
         unclear, and we would really like to see the source. The class
         probably lives in UnicodeNormalizeFilter.jar. -->
    <filter class="schema.CJKFilterFactory" bigrams="false"/>

    <!-- Stop-word filtering is currently disabled:
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    -->

    <!-- Index side only: catenateWords and catenateNumbers are switched on
         here, whereas the query analyzer below sets both to 0. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>

    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- English Snowball stemming; protwords.txt is passed as the
         protected-words list. -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>

    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Analysis chain applied to query text. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Same custom Unicode normalization filter, with the same settings,
         as in the index analyzer. -->
    <filter class="schema.UnicodeNormalizationFilterFactory"
            version="icu4j"
            composed="false"
            remove_diacritics="true"
            remove_modifiers="true"
            fold="true"/>

    <!-- Custom filter from Bob/UVA's code; it is supposed to tokenize CJK
         data into one character per token. Whether it actually works is
         unclear, and we would really like to see the source. The class
         probably lives in UnicodeNormalizeFilter.jar. -->
    <filter class="schema.CJKFilterFactory" bigrams="false"/>

    <!-- Synonym expansion is not currently in use:
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    -->

    <!-- Stop-word filtering is currently disabled:
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    -->

    <!-- Query side: catenateWords and catenateNumbers are both 0, unlike
         the index analyzer above. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>

    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- English Snowball stemming; protwords.txt is passed as the
         protected-words list. -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>

    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

</fieldType>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment