Skip to content

Instantly share code, notes, and snippets.

@tingletech
Created October 6, 2012 19:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tingletech/3845898 to your computer and use it in GitHub Desktop.
Save tingletech/3845898 to your computer and use it in GitHub Desktop.
groovy script to load XTF search results into solr
/* oai_to_solr.groovy
load oai xml files into solr */
/*
groovy oai_to_solr.groovy > urllist.txt
*/
import groovy.io.FileType
// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
import org.apache.solr.client.solrj.impl.HttpSolrServer
import org.apache.solr.common.SolrInputDocument
// set up solr connection
String url = "http://localhost:8983/solr/collection1"
def server = new HttpSolrServer( url );
// root of the harvested OAI records; every regular file below it is loaded
def dir = new File("/var/lib/tomcat7/webapps/oai/WEB-INF/harvested_records")
dir.eachFileRecurse (FileType.FILES) { file ->
    // some OAI files are not well formed, so loading stays best effort:
    // a failing file is reported on stderr and the run moves on to the next one.
    // stdout is reserved for the record ids (the script is run with > urllist.txt)
    try {
        def oai_dc = new XmlSlurper().parse(file).declareNamespace(
            dc: "http://purl.org/dc/elements/1.1/",
            oai_dc: "http://www.openarchives.org/OAI/2.0/oai_dc/"
        )
        // the first dc:identifier that starts with "http" becomes the solr id;
        // take its text so solr receives a plain String, not an XML node object
        def id = oai_dc."dc:identifier".findAll{ it.text().startsWith("http") }[0]?.text()
        if (!id) {
            // without an http identifier there is no usable unique key for this record
            System.err.println "skipping ${file}: no dc:identifier starting with http"
            return
        }
        def doc = new SolrInputDocument()
        println id
        doc.addField("id", id)
        oai_dc."*".each{ element ->
            // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
            doc.addField(element.name()+"_txt", element.text())
            // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
            doc.addField("field_names_ss", element.name())
        }
        server.add(doc)
    } catch (Exception e) {
        // do not abort the whole load for one bad file, but do not hide the failure either
        System.err.println "failed to load ${file}: ${e}"
    }
}
// make everything added above visible to searches
server.commit()
/*
Copyright (c) 2012 Regents of the University of California and contributors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
- Neither the name of the University of California nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
/* xtf_to_solr.groovy
load xtf raw search results into solr */
// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
import org.apache.solr.client.solrj.impl.HttpSolrServer
import org.apache.solr.common.SolrInputDocument
// set up solr connection
String url = "http://localhost:8983/solr/collection1"
def server = new HttpSolrServer( url );
// XTF results: raw search output saved to a file in the working directory
def results = new XmlSlurper().parse("search-browse-all-yes-raw-1")
results.docHit.each{ hit ->
    // the first meta/identifier becomes the solr id;
    // take its text so solr receives a plain String, not an XML node object
    def id = hit.meta.identifier[0]?.text()
    if (!id) {
        // without an identifier there is no usable unique key for this hit
        System.err.println "skipping docHit without meta/identifier"
        return
    }
    def doc = new SolrInputDocument()
    doc.addField("id", id)
    // skip browse-* sort-* facet-* for this demo
    hit.meta."*".findAll{ field -> ! field.name().contains("-") }.each { field ->
        // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
        doc.addField(field.name()+"_txt", field.text())
        // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
        doc.addField("field_names_ss", field.name())
    }
    server.add(doc)
}
// make everything added above visible to searches
server.commit()
/*
Copyright (c) 2012 Regents of the University of California and contributors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
- Neither the name of the University of California nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
@@ -262,6 +262,10 @@
<!-- Create a string version of author for faceting -->
<copyField source="author" dest="author_s"/>
+
+ <!-- test out facet on every field -->
+ <copyField source="*_txt" dest="*_ss"/>
+ <copyField source="*_txt" dest="text"/>
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same

The schema.xml in the example application that ships with solr 4.0.0-BETA defines a couple of dynamic fields we can use.

<dynamicField name="*_txt" type="text_general" indexed="true"  stored="true" multiValued="true"/>

*_txt fields are repeatable and of type text_general

And for facets, we can use

<dynamicField name="*_ss" type="string"  indexed="true"  stored="true" multiValued="true"/>

the *_ss dynamic field stores repeatable strings

<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" />

Also, everything gets copied into a field named text of type text_general, for keyword search across all fields:

<!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema  -->
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

The text_general type used by the *_txt and text fields is defined as

    <!-- A general text field that has reasonable, generic
         cross-language defaults: it tokenizes with StandardTokenizer,
	 removes stop words from case-insensitive "stopwords.txt"
	 (empty by default), and down cases.  At query time only, it
	 also applies synonyms. -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

also edit solr/example/solr/collection1/conf/solrconfig.xml or otherwise set facet.* parameters; and the search results template solr/example/solr/collection1/conf/velocity/richtext-doc.vm needs to be edited to show your fields.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment