Skip to content

Instantly share code, notes, and snippets.

@openAccess
Created March 1, 2017 23:07
Show Gist options
  • Save openAccess/9e76aa7fa6135be419968b1372c86957 to your computer and use it in GitHub Desktop.
Save openAccess/9e76aa7fa6135be419968b1372c86957 to your computer and use it in GitHub Desktop.
PLOS Solr Schema
/schema/types/comment()
SourceOptions
XPath1/ParseXSL 1.0
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="x.y" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.3: removed optional field compress feature
1.4: default auto-phrase (QueryParser feature) to off
1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...)
-->
<schema name="plos" version="1.5">
<types>
<!-- omitNorms defaults to "true" for primitive field types -->
<!-- Untokenized string type; sortMissingLast="true" is set (documents lacking the field are meant to sort after those that have it). -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
<!--
Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
NOTE: only "int" is declared below; no tint/tfloat/tlong/tdouble types are defined in this schema.
-->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
<!-- Plain date type: same class as "tdate" below, but declared with precisionStep="0". Used by the "timestamp" field. -->
<fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
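<!-- A general-purpose stemmed text field. Tokens shorter than 3 characters are dropped,
then WordDelimiterFilter splits the remaining tokens on non-alphanumeric characters,
so that "host-pathogen" is split into the parts "host" and "pathogen" (before stemming).
Splitting on case change and on letter/digit boundaries is turned off
(splitOnCaseChange="0", splitOnNumerics="0") and nothing is catenated.
Stopwords are customized by an external file and stemming is enabled; the synonym
filter is currently commented out.
-->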
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <!-- Index-time chain: whitespace tokens are lowercased, ASCII-folded, limited to
       3..100 characters, stop-filtered, split on delimiters, trimmed, and then
       Porter-stemmed. Filters run in the order listed. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <!-- Case-change and letter/digit splitting are both switched off; nothing is catenated. -->
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="0"
            splitOnCaseChange="0"/>
    <!-- Disabling position filter. Note JO: 16743
         filter class="solr.PositionFilterFactory" / -->
    <filter class="solr.TrimFilterFactory"/>
    <!-- EnglishPorterFilterFactory is deprecated and was replaced by PorterStemFilterFactory.
         Unlike the old factory, PorterStemFilterFactory takes no "protected" argument, so the
         protected-word list is applied by a KeywordMarkerFilterFactory placed ahead of the stemmer.
         If protwords.txt lists any words, reindex after deploying this so that indexed and
         query-time terms agree. -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <!-- Query-time chain: mirrors the index-time chain. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <!-- Disabling synonym filter, we may want to turn this back on eventually -->
    <!-- filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/ -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="0"
            splitOnCaseChange="0"/>
    <!-- Disabling position filter. Note JO: 16743
         filter class="solr.PositionFilterFactory" / -->
    <filter class="solr.TrimFilterFactory"/>
    <!-- Protected words are marked before stemming, exactly as in the index-time chain. -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>
<!--
A text field that has little processing on it to use for term and spelling suggestions
-->
<fieldType name="text_noprocess" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <!-- A single analyzer without a type attribute serves both index and query time,
       so both sides always run the same chain: lowercase, keep tokens of 3..100
       characters, split on delimiters (including letter/digit boundaries), trim.
       No stop words, folding, or stemming are applied. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
    <filter class="solr.WordDelimiterFilterFactory"
            stemEnglishPossessive="1"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnNumerics="1"
            splitOnCaseChange="0"/>
    <filter class="solr.TrimFilterFactory"/>
  </analyzer>
</fieldType>
<!--
Treats the whole field value as a single token (KeywordTokenizer), lowercased and
trimmed, for case-insensitive matching on the complete value.
-->
<fieldType name="keyword_lowercase" class="solr.TextField" positionIncrementGap="100">
  <!-- One analyzer with no type attribute, used at both index and query time:
       the entire value is kept as one token, then lowercased and trimmed. -->
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
  </analyzer>
</fieldType>
<!-- A text field specialized for exact proper name search. -->
<fieldType name="text_name" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <!-- One analyzer with no type attribute, used at both index and query time.
       Names are lowercased and ASCII-folded, split on delimiters while the
       original token is kept (preserveOriginal="1"), and filtered against a
       name-specific stop list (author_stopwords.txt). No stemming is applied. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.WordDelimiterFilterFactory" preserveOriginal="1" splitOnCaseChange="0"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="author_stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>
<!-- A general unstemmed text field that indexes tokens normally and also
reversed (via ReversedWildcardFilterFactory), to enable more efficient
leading wildcard queries. -->
<!-- "autoGeneratePhraseQueries" defaults to "false" in 3.x.
In earlier versions, autoGeneratePhraseQueries defaulted to "true".
So the value is set explicitly here in order for fields tokenized with
WhitespaceTokenizerFactory to work as they did before. -->
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <!-- Index time: tokens of 3..25 characters are stop-filtered, split on delimiters
       with word and number parts also catenated, and then emitted both as-is and
       reversed (withOriginal="true"). -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="25"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="0"/>
    <filter class="solr.ReversedWildcardFilterFactory"
            withOriginal="true"
            maxPosAsterisk="3"
            maxPosQuestion="2"
            maxFractionAsterisk="0.33"/>
  </analyzer>
  <!-- Query time: same front of the chain, but catenation is off and no
       reversed-wildcard filter is configured here. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="3" max="25"/>
    <!-- Disabling synonym filter, we'll want to turn this back on -->
    <!-- filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/ -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0"/>
  </analyzer>
</fieldType>
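<!--
Splits text on the '/' character and lowercases each part. Subjects come in with a
defined hierarchy whose levels are separated by '/'; this type turns every level into
its own term, so the hierarchy itself is not kept. Parts shorter than 2 characters
are dropped.
-->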
<fieldType name="text_splitOnSlashLowercase" class="solr.TextField" positionIncrementGap="100">
  <!-- Index time: tokenize with the pattern "//*" (a '/' followed by any further
       '/' characters), then trim, lowercase, and keep parts of 2..5000 characters. -->
  <analyzer type="index">
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
  </analyzer>
  <!-- Query time: the same chain, followed by synonym mapping from synonyms.txt.
       Synonym entries are parsed with KeywordTokenizerFactory and expand="false". -->
  <analyzer type="query">
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="false"
            tokenizerFactory="solr.KeywordTokenizerFactory"/>
  </analyzer>
</fieldType>
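<!--
Splits text on the '/' character without changing case. Subjects come in with a
defined hierarchy whose levels are separated by '/'; this type turns every level into
its own term, so the hierarchy itself is not kept. Unlike text_splitOnSlashLowercase,
no lowercasing and no query-time synonym mapping are applied.
-->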
<fieldType name="text_splitOnSlash" class="solr.TextField" positionIncrementGap="100">
  <!-- One analyzer with no type attribute, used at both index and query time:
       tokenize with the pattern "//*" (a '/' followed by any further '/'
       characters), trim each part, and keep parts of 2..5000 characters.
       Case is left untouched. -->
  <analyzer>
    <tokenizer class="solr.PatternTokenizerFactory" pattern="//*"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LengthFilterFactory" min="2" max="5000"/>
  </analyzer>
</fieldType>
<!-- Since fields of this type are by default neither stored nor indexed,
any data added to them will be ignored outright.
The element is spelled "fieldType" to match every other type declaration in
this schema (XML names are case-sensitive). -->
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField"/>
<!-- N-gram text type: by default indexed, not stored, and multi-valued (fields may
     override these per declaration). The element is spelled "fieldType" to match the
     other type declarations in this schema. -->
<fieldType name="trigram" stored="false" indexed="true" multiValued="true" class="solr.TextField" positionIncrementGap="100">
  <!-- The index-time and query-time chains were duplicates, so a single analyzer
       with no type attribute now serves both. Tokens are lowercased, ASCII-folded,
       and expanded into n-grams of 3 to 20 characters. -->
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="20"/>
  </analyzer>
</fieldType>
<!-- Prefix-style matching on a whole subject value. The element is spelled
     "fieldType" to match the other type declarations in this schema. -->
<fieldType name="text_subject_search" class="solr.TextField">
  <!-- A single analyzer with no type attribute, so the same chain runs at index
       and query time: the entire value is one token, lowercased, then expanded
       into leading-edge n-grams of up to 30 characters. -->
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- minGramSize="1" spells out the factory default this declaration previously
         relied on; later Solr releases require the attribute to be present. -->
    <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="30"/>
  </analyzer>
</fieldType>
</types>
<fields>
<!-- Identifiers. "id" is the schema's uniqueKey and the only field declared with required="true". -->
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<!-- NOTE(review): "doi" uses the trigram type, which sets multiValued="true" at the type level
     and is not overridden here; confirm that a multi-valued, n-grammed DOI is intended. -->
<field name="doi" type="trigram" indexed="true" stored="true"/>
<field name="pmid" type="string" indexed="true" stored="true"/>
<field name="pmcid" type="string" indexed="true" stored="true"/>
<field name="eissn" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="pissn" type="string" indexed="true" stored="true" multiValued="false"/>
<!-- Title: searchable text, a stored-only display copy, and an n-gram copy filled by copyField from "title". -->
<field name="title" type="text" indexed="true" stored="true" multiValued="false"/>
<field name="title_display" type="string" indexed="false" stored="true" multiValued="false"/>
<field name="title_ngram" type="trigram" indexed="true" stored="false" multiValued="false"/>
<field name="alternate_title" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- Subjects: "subject" is split on '/' and lowercased; "subject_facet" is split on '/' with case kept.
     Both "subject_facet" and "subject_hierarchy" are also copyField destinations of "subject". -->
<field name="subject" type="text_splitOnSlashLowercase" indexed="true" stored="true" multiValued="true"/>
<field name="subject_facet" type="text_splitOnSlash" indexed="true" stored="true" multiValued="true"/>
<!-- Also store the subject to preserve the hierarchy if needed later -->
<field name="subject_hierarchy" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="subject_level_1" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- Bibliographic numbers and location -->
<field name="pagecount" type="int" indexed="true" stored="true"/>
<field name="volume" type="int" indexed="true" stored="true"/>
<field name="issue" type="int" indexed="true" stored="true"/>
<field name="elocation_id" type="keyword_lowercase" indexed="true" stored="true"/>
<!-- Publisher and journal metadata. "journal" is matched case-insensitively on the whole value
     (keyword_lowercase); the remaining journal_* fields are exact strings. -->
<field name="publisher" type="text_name" indexed="true" stored="true"/>
<field name="journal" type="keyword_lowercase" indexed="true" stored="true"/>
<field name="journal_name" type="string" indexed="true" stored="true"/>
<field name="journal_key" type="string" indexed="true" stored="true"/>
<field name="journal_eissn" type="string" indexed="true" stored="true"/>
<field name="journal_id_pmc" type="string" indexed="true" stored="true"/>
<field name="journal_id_publisher" type="string" indexed="true" stored="true"/>
<field name="journal_id_nlm_ta" type="string" indexed="true" stored="true"/>
<!-- Article dates, all of type tdate (TrieDateField with precisionStep="6") -->
<field name="publication_date" type="tdate" indexed="true" stored="true"/>
<field name="received_date" type="tdate" indexed="true" stored="true"/>
<field name="accepted_date" type="tdate" indexed="true" stored="true"/>
<!-- Abstract: searchable text, an n-gram copy filled by copyField from "abstract", and a stored-only display field. -->
<field name="abstract" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="abstract_ngram" type="trigram" indexed="true" stored="false" multiValued="true"/>
<field name="abstract_primary_display" type="string" indexed="false" stored="true" multiValued="true"/>
<!-- Authors and editors: a name-search field, an exact-string facet field (filled by copyField),
     and a stored-only display field for each. -->
<field name="author" type="text_name" indexed="true" stored="true" multiValued="true"/>
<field name="author_facet" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="author_display" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="editor" type="text_name" indexed="true" stored="true" multiValued="true"/>
<field name="editor_facet" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="editor_display" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="author_without_collab_display" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="author_collab_only_display" type="string" indexed="false" stored="true" multiValued="true"/>
<!-- The two following fields are copied (via copyField) into the "affiliate" and "affiliate_facet" fields -->
<field name="author_affiliate" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="editor_affiliate" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="affiliate" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="affiliate_facet" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- Statements and notes attached to the article; none of these sets multiValued explicitly -->
<field name="author_notes" type="text" indexed="true" stored="true"/>
<field name="competing_interest" type="text" indexed="true" stored="true"/>
<field name="data_availability" type="text" indexed="true" stored="true"/>
<field name="financial_disclosure" type="text" indexed="true" stored="true"/>
<!-- Article type: case-insensitive whole-value field plus an exact-string facet copy (filled by copyField) -->
<field name="article_type" type="keyword_lowercase" indexed="true" stored="true" multiValued="false"/>
<field name="article_type_facet" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="reference" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="copyright" type="text" indexed="true" stored="true"/>
<!-- Figures and tables -->
<field name="figure_table_doi" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="figure_table_caption" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="striking_image" type="string" indexed="false" stored="true" multiValued="false"/>
<!-- Should we add ngram and rev fields for all of these types as well? -->
<!-- Document parts -->
<field name="introduction" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="results_and_discussion" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="materials_and_methods" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="supporting_information" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="conclusions" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- Full body text; single-valued, with term vectors enabled -->
<field name="body" type="text" indexed="true" stored="true" multiValued="false" termVectors="true"/>
<!-- catchall field, containing all other searchable text fields -->
<field name="everything" type="text" indexed="true" stored="true" multiValued="false" termVectors="true"/>
<!-- The three variants below are copyField destinations of "everything":
     reversed tokens, n-grams, and a lightly processed copy. -->
<field name="everything_rev" type="text_rev" indexed="true" stored="false" multiValued="false"/>
<field name="everything_ngram" type="trigram" indexed="true" stored="false" multiValued="false"/>
<field name="everything_noprocess" type="text_noprocess" indexed="true" stored="true" multiValued="false"/>
<!-- Indexes tokens both normally and in reverse for efficient leading wildcard queries. -->
<field name="body_rev" type="text_rev" indexed="true" stored="false" multiValued="false"/>
<!-- N-gram copy of "body" (copyField destination); indexed only, not stored -->
<field name="body_ngram" type="trigram" indexed="true" stored="false" multiValued="false"/>
<!-- For tracking when the document was indexed (defaults to NOW when no value is supplied) -->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- For tracking ALM data (presumably article-level metrics; confirm). Every counter is a
     single-valued int that defaults to 0. -->
<field name="counter_total_all" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="counter_total_month" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_scopusCiteCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_citeulikeCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_connoteaCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_mendeleyCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_twitterCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_facebookCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<field name="alm_pmc_usage_total_all" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<!-- NOTE(review): this is the only counter declared with stored="false"; confirm that it is
     deliberately searchable/sortable but not retrievable. -->
<field name="alm_webOfScienceCount" type="int" indexed="true" stored="false" default="0" multiValued="false"/>
<field name="alm_crossRefCount" type="int" indexed="true" stored="true" default="0" multiValued="false"/>
<!-- Fields for partial documents -->
<field name="doc_partial_parent_id" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="doc_type" type="string" indexed="true" stored="true" multiValued="false"/>
<!-- Indexed only (stored="false"): usable in queries and filters but not returned in results -->
<field name="doc_partial_type" type="string" indexed="true" stored="false" multiValued="false"/>
<field name="doc_partial_body" type="text" indexed="true" stored="true" multiValued="true" termVectors="true"/>
<!-- Fields for academic editor document -->
<field name="ae_name" type="string" indexed="true" stored="true" multiValued="false"/>
<!-- NOTE(review): despite the "_facet" suffix this field uses the tokenized text_name type,
     unlike author_facet/editor_facet which are plain strings; confirm this is intended.
     It is filled by copyField from "ae_name". -->
<field name="ae_name_facet" type="text_name" indexed="true" stored="true" multiValued="false"/>
<field name="ae_last_name" type="keyword_lowercase" indexed="true" stored="true" multiValued="false"/>
<!-- Stored-only (indexed="false") display values -->
<field name="ae_institute" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="ae_country" type="string" indexed="false" stored="true" multiValued="true"/>
<!-- "ae_subject" is edge-n-grammed for prefix-style matching; "ae_subject_facet" is its
     copyField destination, matched on the whole lowercased value. -->
<field name="ae_subject" type="text_subject_search" indexed="true" stored="true" multiValued="true"/>
<field name="ae_subject_facet" type="keyword_lowercase" indexed="true" stored="true" multiValued="true"/>
<field name="trial_registration" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- Amendment fields. "correction" and "expression_of_concern" are multi-valued;
     "retraction" is single-valued. -->
<field name="correction" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="expression_of_concern" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="retraction" type="string" indexed="true" stored="true" multiValued="false"/>
<!-- publication stage and revision date fields -->
<field name="publication_stage" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="revision_date" type="tdate" indexed="true" stored="true"/>
</fields>
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field.
In this schema the "id" field is a string field explicitly declared with required="true".
-->
<uniqueKey>id</uniqueKey>
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<!-- N-gram copies: each destination is a field of type "trigram". -->
<copyField source="title" dest="title_ngram"/>
<copyField source="abstract" dest="abstract_ngram"/>
<copyField source="body" dest="body_ngram"/>
<copyField source="everything" dest="everything_ngram"/>
<!-- Reversed-token copies: each destination is a field of type "text_rev". -->
<copyField source="body" dest="body_rev"/>
<copyField source="everything" dest="everything_rev"/>
<!-- Lightly processed copy: the destination is a field of type "text_noprocess". -->
<copyField source="everything" dest="everything_noprocess"/>
<!-- Author and editor affiliations are both merged into the shared affiliate fields. -->
<copyField source="author_affiliate" dest="affiliate"/>
<copyField source="author_affiliate" dest="affiliate_facet"/>
<copyField source="editor_affiliate" dest="affiliate"/>
<copyField source="editor_affiliate" dest="affiliate_facet"/>
<!-- Subject copies: one untokenized string copy and one split-on-slash copy. -->
<copyField source="subject" dest="subject_hierarchy"/>
<copyField source="subject" dest="subject_facet"/>
<!-- Facet copies of people, article type, and academic-editor fields. -->
<copyField source="author" dest="author_facet"/>
<copyField source="editor" dest="editor_facet"/>
<copyField source="article_type" dest="article_type_facet"/>
<copyField source="ae_subject" dest="ae_subject_facet"/>
<copyField source="ae_name" dest="ae_name_facet"/>
</schema>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment