Created
March 25, 2015 17:44
-
-
Save yannvery/2f5c1b84c96fa463aebf to your computer and use it in GitHub Desktop.
Default Solr field type for lemmatization and stemming
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <!--
    General-purpose text field tuned for French content (analysis chain
    originally configured by Gaël).
    The single <analyzer> below has no "type" attribute, so the same chain
    is applied both when indexing and when querying.
    External resources referenced by the filters (elisionwords.txt,
    stopwords.txt, synonyms.txt, protwords.txt) must be present in the
    core's conf directory.
  -->
  <analyzer>
    <!-- Strip any HTML markup before tokenizing. -->
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <!-- Split the input on whitespace only. -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Trim leading and trailing punctuation from each token. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^(\p{Punct}*)(.*?)(\p{Punct}*)$" replacement="$2"/>

    <!-- Drop tokens shorter than 3 or longer than 100 characters
         (empty tokens left by the previous step, and oversized words). -->
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>

    <!-- Lowercase everything. -->
    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- Remove French elisions (l', qu', ...) listed in elisionwords.txt. -->
    <filter class="solr.ElisionFilterFactory" articles="elisionwords.txt"/>

    <!-- Split compound words; the 0/1 values here are integer flags,
         which is what this factory expects. -->
    <filter class="solr.WordDelimiterFilterFactory"
            splitOnCaseChange="1"
            splitOnNumerics="1"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="1"
            preserveOriginal="1"/>

    <!-- Remove stop words. ignoreCase is a boolean argument: it must be
         spelled "true" because "1" is parsed as false. -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>

    <!-- Expand synonyms from synonyms.txt. -->
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>

    <!-- French stemming (plurals, ...); words in protwords.txt are left as-is. -->
    <filter class="solr.SnowballPorterFilterFactory" language="French" protected="protwords.txt"/>

    <!-- Fold accents, cedillas, the oe ligature, ... to plain ASCII. -->
    <filter class="solr.ASCIIFoldingFilterFactory"/>

    <!-- Remove duplicate tokens that may have been produced at the same position. -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment