Created
March 1, 2017 23:07
-
-
Save openAccess/9e76aa7fa6135be419968b1372c86957 to your computer and use it in GitHub Desktop.
PLOS Solr Schema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/schema/types/comment() | |
SourceOptions | |
XPath1/ParseXSL 1.0 | |
<!-- attribute "name" is the name of this schema and is only used for display purposes. | |
Applications should change this to reflect the nature of the search collection. | |
version="x.y" is Solr's version number for the schema syntax and semantics. It should | |
not normally be changed by applications. | |
1.3: removed optional field compress feature | |
1.4: default auto-phrase (QueryParser feature) to off | |
1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...) | |
--> | |
<schema name="plos" version="1.5"> | |
<types> | |
<!-- Untokenized string type. Primitive types such as this one get
     omitNorms="true" by default under schema version 1.5. -->
<fieldType name="string"
           class="solr.StrField"
           sortMissingLast="true"/>
<!-- Default numeric and date types, declared with precisionStep="0".
     For faster range queries, consider a Trie type with a non-zero
     precisionStep (see "tdate" below). -->
<fieldType name="int"
           class="solr.TrieIntField"
           precisionStep="0"
           positionIncrementGap="0"/>
<fieldType name="date"
           class="solr.TrieDateField"
           precisionStep="0"
           positionIncrementGap="0"/>
<!-- Trie based date type (precisionStep="6") for faster date range
     queries and date faceting. -->
<fieldType name="tdate"
           class="solr.TrieDateField"
           precisionStep="6"
           positionIncrementGap="0"/>
<!-- General purpose stemmed text type.
     Both analyzers split on whitespace, lowercase, fold to ASCII, drop
     tokens shorter than 3 or longer than 100 characters, remove the
     stopwords listed in stopwords.txt, split tokens on non-alphanumeric
     delimiters with WordDelimiterFilter (no catenation, no split on
     case change or on letter/digit boundaries), and finally apply the
     Porter stemmer.
     Terms listed in protwords.txt are protected from stemming. The
     Porter stemmer factory has no "protected" argument of its own (that
     belonged to the deprecated EnglishPorterFilterFactory), so the
     protection is applied by a KeywordMarkerFilterFactory placed
     directly in front of the stemmer.
     The synonym filter is currently disabled at query time. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="0"
            splitOnCaseChange="0"/>
    <!-- Position filter disabled. Note JO: 16743
         filter class="solr.PositionFilterFactory"/ -->
    <filter class="solr.TrimFilterFactory"/>
    <!-- EnglishPorterFilterFactory is deprecated; it is replaced by
         KeywordMarkerFilterFactory (protected words) followed by
         PorterStemFilterFactory. -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <!-- Synonym filter disabled; we may want to turn this back on eventually.
         filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/ -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="0"
            splitOnCaseChange="0"/>
    <!-- Position filter disabled. Note JO: 16743
         filter class="solr.PositionFilterFactory"/ -->
    <filter class="solr.TrimFilterFactory"/>
    <!-- Must mirror the index analyzer so protected words are left
         unstemmed on both sides. -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Lightly processed text type (no ASCII folding, stopword removal or
     stemming), used for term and spelling suggestions. The index and
     query analyzers are configured identically. -->
<fieldType name="text_noprocess"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="1"
            splitOnCaseChange="0"/>
    <filter class="solr.TrimFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="1"
            splitOnCaseChange="0"/>
    <filter class="solr.TrimFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Whole-value, case insensitive matching: the entire input is kept as
     a single token, lowercased and trimmed. One untyped analyzer is
     declared; Solr uses it for both indexing and querying. -->
<fieldType name="keyword_lowercase"
           class="solr.TextField"
           positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Text type specialized for proper name search. Tokens are lowercased
     and ASCII folded, WordDelimiterFilter keeps the original token next
     to its parts (preserveOriginal="1"), and the words listed in
     author_stopwords.txt are removed. No stemming is applied. -->
<fieldType name="text_name"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            preserveOriginal="1"
            splitOnCaseChange="0"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="author_stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            preserveOriginal="1"
            splitOnCaseChange="0"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="author_stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>
<!-- General unstemmed text type that indexes each token both normally
     and reversed (via ReversedWildcardFilterFactory) so that leading
     wildcard queries can be answered more efficiently. -->
<!-- autoGeneratePhraseQueries defaults to "false" from Solr 3.x on,
     whereas earlier versions defaulted to "true". It is therefore set
     explicitly here to keep the earlier phrase query behaviour with
     the whitespace tokenizer. -->
<fieldType name="text_rev"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="25"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <!-- Index side also catenates word and number parts. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="0"/>
    <filter class="solr.ReversedWildcardFilterFactory"
            withOriginal="true"
            maxPosAsterisk="3"
            maxPosQuestion="2"
            maxFractionAsterisk="0.33"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="25"/>
    <!-- Synonym filter disabled; we'll want to turn this back on.
         filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/ -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <!-- Query side does not catenate and does not reverse tokens. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0"/>
  </analyzer>
</fieldType>
<!-- Splits subject paths on the '/' character and lowercases each part.
     Subjects arrive as a slash separated hierarchy; this type flattens
     it into individual terms. The tokenizer pattern "//*" is a '/'
     followed by any number of further '/' characters, so a run of
     slashes counts as one separator. Synonyms from synonyms.txt are
     applied at query time only. -->
<fieldType name="text_splitOnSlashLowercase"
           class="solr.TextField"
           positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="false"
            tokenizerFactory="solr.KeywordTokenizerFactory"/>
  </analyzer>
</fieldType>
<!-- Case preserving variant of the slash splitting type: subject paths
     are split on '/' (pattern "//*", i.e. one or more slashes), each
     part is trimmed, and parts shorter than 2 or longer than 5000
     characters are dropped. One untyped analyzer is declared; Solr
     uses it for both indexing and querying. -->
<fieldType name="text_splitOnSlash"
           class="solr.TextField"
           positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
  </analyzer>
</fieldType>
<!-- Fields of this type are neither stored nor indexed by default, so
     any data added to them is ignored outright. The element is spelled
     "fieldType" to match the other type declarations in this schema. -->
<fieldType name="ignored"
           class="solr.StrField"
           stored="false"
           indexed="false"
           multiValued="true"/>
<!-- N-gram type for partial matching: standard tokenization, lowercase,
     ASCII folding, then every token is expanded into n-grams of 3 to 20
     characters. The same chain runs at index and query time.
     Type-level defaults are stored="false", indexed="true" and
     multiValued="true"; individual fields may override them. -->
<fieldType name="trigram"
           class="solr.TextField"
           stored="false"
           indexed="true"
           multiValued="true"
           positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="20"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="20"/>
  </analyzer>
</fieldType>
<!-- Prefix search type for subjects: the whole value is kept as one
     token, lowercased, and expanded into edge n-grams of at most 30
     characters. minGramSize is not set, so the factory default applies.
     A single untyped analyzer is declared, so the same chain is used
     for indexing and querying. -->
<fieldType name="text_subject_search" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EdgeNGramFilterFactory" maxGramSize="30"/>
  </analyzer>
</fieldType>
</types> | |
<fields> | |
<!-- Identifiers. "id" is the only field marked required. -->
<field name="id"     type="string"  indexed="true" stored="true" required="true"/>
<!-- NOTE(review): "doi" uses the trigram type and does not override that
     type's multiValued="true" default; confirm this is intended. -->
<field name="doi"    type="trigram" indexed="true" stored="true"/>
<field name="pmid"   type="string"  indexed="true" stored="true"/>
<field name="pmcid"  type="string"  indexed="true" stored="true"/>
<field name="eissn"  type="string"  indexed="true" stored="true" multiValued="false"/>
<field name="pissn"  type="string"  indexed="true" stored="true" multiValued="false"/>
<!-- Titles: searchable text, a stored-only display copy, and an n-gram
     copy that is indexed but not stored. -->
<field name="title"           type="text"    indexed="true"  stored="true"  multiValued="false"/>
<field name="title_display"   type="string"  indexed="false" stored="true"  multiValued="false"/>
<field name="title_ngram"     type="trigram" indexed="true"  stored="false" multiValued="false"/>
<field name="alternate_title" type="text"    indexed="true"  stored="true"  multiValued="true"/>
<!-- Subjects, split on '/' for search (lowercased) and for faceting
     (case preserved). -->
<field name="subject"       type="text_splitOnSlashLowercase" indexed="true" stored="true" multiValued="true"/>
<field name="subject_facet" type="text_splitOnSlash"          indexed="true" stored="true" multiValued="true"/>
<!-- The unsplit subject is also kept so the hierarchy is preserved if
     it is needed later. -->
<field name="subject_hierarchy" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="subject_level_1"   type="string" indexed="true" stored="true" multiValued="true"/>
<!-- Bibliographic numbers and locators. -->
<field name="pagecount"    type="int"               indexed="true" stored="true"/>
<field name="volume"       type="int"               indexed="true" stored="true"/>
<field name="issue"        type="int"               indexed="true" stored="true"/>
<field name="elocation_id" type="keyword_lowercase" indexed="true" stored="true"/>
<field name="publisher"    type="text_name"         indexed="true" stored="true"/>
<!-- Journal identification. -->
<field name="journal"              type="keyword_lowercase" indexed="true" stored="true"/>
<field name="journal_name"         type="string"            indexed="true" stored="true"/>
<field name="journal_key"          type="string"            indexed="true" stored="true"/>
<field name="journal_eissn"        type="string"            indexed="true" stored="true"/>
<field name="journal_id_pmc"       type="string"            indexed="true" stored="true"/>
<field name="journal_id_publisher" type="string"            indexed="true" stored="true"/>
<field name="journal_id_nlm_ta"    type="string"            indexed="true" stored="true"/>
<!-- Article dates, using the tdate type for range queries. -->
<field name="publication_date" type="tdate" indexed="true" stored="true"/>
<field name="received_date"    type="tdate" indexed="true" stored="true"/>
<field name="accepted_date"    type="tdate" indexed="true" stored="true"/>
<!-- Abstract: searchable text, n-gram copy (not stored) and a
     stored-only display value. -->
<field name="abstract"                 type="text"    indexed="true"  stored="true"  multiValued="true"/>
<field name="abstract_ngram"           type="trigram" indexed="true"  stored="false" multiValued="true"/>
<field name="abstract_primary_display" type="string"  indexed="false" stored="true"  multiValued="true"/>
<!-- Authors and editors: name search, facet and stored-only display
     variants. -->
<field name="author"         type="text_name" indexed="true"  stored="true" multiValued="true"/>
<field name="author_facet"   type="string"    indexed="true"  stored="true" multiValued="true"/>
<field name="author_display" type="string"    indexed="false" stored="true" multiValued="true"/>
<field name="editor"         type="text_name" indexed="true"  stored="true" multiValued="true"/>
<field name="editor_facet"   type="string"    indexed="true"  stored="true" multiValued="true"/>
<field name="editor_display" type="string"    indexed="false" stored="true" multiValued="true"/>
<field name="author_without_collab_display" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="author_collab_only_display"    type="string" indexed="false" stored="true" multiValued="true"/>
<!-- The two following fields are copied into the affiliate fields. -->
<field name="author_affiliate" type="text"   indexed="true" stored="true" multiValued="true"/>
<field name="editor_affiliate" type="text"   indexed="true" stored="true" multiValued="true"/>
<field name="affiliate"        type="text"   indexed="true" stored="true" multiValued="true"/>
<field name="affiliate_facet"  type="string" indexed="true" stored="true" multiValued="true"/>
<!-- Notes and statements. -->
<field name="author_notes"         type="text" indexed="true" stored="true"/>
<field name="competing_interest"   type="text" indexed="true" stored="true"/>
<field name="data_availability"    type="text" indexed="true" stored="true"/>
<field name="financial_disclosure" type="text" indexed="true" stored="true"/>
<!-- Article type, references, copyright and figure/table data. -->
<field name="article_type"         type="keyword_lowercase" indexed="true"  stored="true" multiValued="false"/>
<field name="article_type_facet"   type="string"            indexed="true"  stored="true" multiValued="false"/>
<field name="reference"            type="text"              indexed="true"  stored="true" multiValued="true"/>
<field name="copyright"            type="text"              indexed="true"  stored="true"/>
<field name="figure_table_doi"     type="string"            indexed="true"  stored="true" multiValued="true"/>
<field name="figure_table_caption" type="text"              indexed="true"  stored="true" multiValued="true"/>
<field name="striking_image"       type="string"            indexed="false" stored="true" multiValued="false"/>
<!-- Open question: should ngram and rev variants be added for all of
     these fields as well? -->
<!-- Document parts -->
<field name="introduction"           type="text" indexed="true" stored="true" multiValued="true"/>
<field name="results_and_discussion" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="materials_and_methods"  type="text" indexed="true" stored="true" multiValued="true"/>
<field name="supporting_information" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="conclusions"            type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="false" termVectors="true"/>
<!-- Catch-all field containing all other searchable text fields, plus
     its reversed, n-gram and lightly processed variants. -->
<field name="everything"           type="text"           indexed="true" stored="true"  multiValued="false" termVectors="true"/>
<field name="everything_rev"       type="text_rev"       indexed="true" stored="false" multiValued="false"/>
<field name="everything_ngram"     type="trigram"        indexed="true" stored="false" multiValued="false"/>
<field name="everything_noprocess" type="text_noprocess" indexed="true" stored="true"  multiValued="false"/>
<!-- Body variants: tokens indexed both normally and reversed for
     efficient leading wildcard queries, and an n-gram copy. -->
<field name="body_rev"   type="text_rev" indexed="true" stored="false" multiValued="false"/>
<field name="body_ngram" type="trigram"  indexed="true" stored="false" multiValued="false"/>
<!-- Records when the document was indexed (defaults to NOW). -->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- ALM (article level metrics) counters; each one defaults to 0. -->
<field name="counter_total_all"       type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="counter_total_month"     type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_scopusCiteCount"     type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_citeulikeCount"      type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_connoteaCount"       type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_mendeleyCount"       type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_twitterCount"        type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_facebookCount"       type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<field name="alm_pmc_usage_total_all" type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<!-- NOTE(review): this is the only ALM counter with stored="false";
     confirm that is deliberate. -->
<field name="alm_webOfScienceCount"   type="int" indexed="true" stored="false" default="0" multiValued="false"/>
<field name="alm_crossRefCount"       type="int" indexed="true" stored="true"  default="0" multiValued="false"/>
<!-- Partial document fields -->
<field name="doc_partial_parent_id" type="string" indexed="true" stored="true"  multiValued="false"/>
<field name="doc_type"              type="string" indexed="true" stored="true"  multiValued="false"/>
<field name="doc_partial_type"      type="string" indexed="true" stored="false" multiValued="false"/>
<field name="doc_partial_body"      type="text"   indexed="true" stored="true"  multiValued="true" termVectors="true"/>
<!-- Academic editor document fields -->
<!-- NOTE(review): "ae_name" is a plain string while "ae_name_facet" is
     the tokenized text_name type, the reverse of the author/editor
     facet fields; confirm this is intended. -->
<field name="ae_name"          type="string"              indexed="true"  stored="true" multiValued="false"/>
<field name="ae_name_facet"    type="text_name"           indexed="true"  stored="true" multiValued="false"/>
<field name="ae_last_name"     type="keyword_lowercase"   indexed="true"  stored="true" multiValued="false"/>
<field name="ae_institute"     type="string"              indexed="false" stored="true" multiValued="true"/>
<field name="ae_country"       type="string"              indexed="false" stored="true" multiValued="true"/>
<field name="ae_subject"       type="text_subject_search" indexed="true"  stored="true" multiValued="true"/>
<field name="ae_subject_facet" type="keyword_lowercase"   indexed="true"  stored="true" multiValued="true"/>
<field name="trial_registration" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- Amendment fields -->
<field name="correction"            type="string" indexed="true" stored="true" multiValued="true"/>
<field name="expression_of_concern" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="retraction"            type="string" indexed="true" stored="true" multiValued="false"/>
<!-- Publication stage and revision date -->
<field name="publication_stage" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="revision_date"     type="tdate"  indexed="true" stored="true"/>
</fields> | |
<!-- Names the field used to determine and enforce document uniqueness.
     That field is treated as required unless it is explicitly marked
     with required="false". -->
<uniqueKey>id</uniqueKey>
<!-- copyField directives copy one field into another when a document is
     added to the index. They are used either to index the same content
     in a different way, or to gather several fields into one field for
     easier and faster searching. -->

<!-- Abstract: n-gram copy -->
<copyField source="abstract" dest="abstract_ngram"/>

<!-- Author and editor affiliations are merged into the shared
     affiliate search and facet fields -->
<copyField source="author_affiliate" dest="affiliate"/>
<copyField source="author_affiliate" dest="affiliate_facet"/>
<copyField source="editor_affiliate" dest="affiliate"/>
<copyField source="editor_affiliate" dest="affiliate_facet"/>

<!-- Subject: unsplit hierarchy copy and facet copy -->
<copyField source="subject" dest="subject_hierarchy"/>
<copyField source="subject" dest="subject_facet"/>

<!-- Facet copies -->
<copyField source="author" dest="author_facet"/>
<copyField source="editor" dest="editor_facet"/>
<copyField source="article_type" dest="article_type_facet"/>

<!-- Body and title: reversed and n-gram copies -->
<copyField source="body" dest="body_rev"/>
<copyField source="body" dest="body_ngram"/>
<copyField source="title" dest="title_ngram"/>

<!-- Catch-all field: reversed, n-gram and lightly processed copies -->
<copyField source="everything" dest="everything_rev"/>
<copyField source="everything" dest="everything_ngram"/>
<copyField source="everything" dest="everything_noprocess"/>

<!-- Academic editor: facet copies -->
<copyField source="ae_subject" dest="ae_subject_facet"/>
<copyField source="ae_name" dest="ae_name_facet"/>
</schema> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment