tingletech/oai_to_solr.groovy

## oai_to_solr.groovy
/*  oai_to_solr.groovy
    load oai xml files into solr */
/*
  groovy oai_to_solr.groovy > urllist.txt
*/

import groovy.io.FileType
// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
import org.apache.solr.client.solrj.impl.HttpSolrServer
import org.apache.solr.common.SolrInputDocument

// set up solr connection
String url = "http://localhost:8983/solr/collection1"
def server = new HttpSolrServer( url );

// Walk every harvested OAI record file, index it into solr, and print the
// record's URL identifier on stdout (the script is run as `... > urllist.txt`).
// Problems are reported on stderr so they never pollute the URL list.
def dir = new File("/var/lib/tomcat7/webapps/oai/WEB-INF/harvested_records")
dir.eachFileRecurse (FileType.FILES) { file ->
  // some OAI files are not well formed; a bad file is reported and skipped
  // so that one broken record does not abort the whole load
  try {
    def oai_dc = new XmlSlurper().parse(file).declareNamespace(
      dc: "http://purl.org/dc/elements/1.1/",
      oai_dc: "http://www.openarchives.org/OAI/2.0/oai_dc/"
    )
    // the first dc:identifier that starts with "http" is used as the solr id
    def id = oai_dc."dc:identifier".findAll{ it.text().startsWith("http") }[0]?.text()
    if (!id) {
      // without a usable id the record cannot be addressed in solr; skip it
      System.err.println "skipping ${file}: no http dc:identifier found"
      return // ends this closure call only; the directory walk continues
    }
    def doc = new SolrInputDocument()
    println id
    doc.addField("id", id)
    oai_dc."*".each{
      // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
      doc.addField(it.name()+"_txt", it.text())
      // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
      doc.addField("field_names_ss", it.name())
    }
    server.add(doc)
  } catch (Exception e) {
    // still best-effort, but no longer silent: say which file failed and why
    System.err.println "skipping ${file}: ${e}"
  }
}
// make everything added above visible to searches
server.commit()

/*
Copyright (c) 2012 Regents of the University of California and contributors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

- Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
- Neither the name of the University of California nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

## xtf_to_solr.groovy
/*  xtf_to_solr.groovy
    load xtf raw search results into solr */

// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
import org.apache.solr.client.solrj.impl.HttpSolrServer
import org.apache.solr.common.SolrInputDocument

// connect to the local solr core
String url = "http://localhost:8983/solr/collection1"
def server = new HttpSolrServer(url)

// parse the raw XTF search results and index one solr document per docHit
def xtfResults = new XmlSlurper().parse("search-browse-all-yes-raw-1")
xtfResults.docHit.each { hit ->
    def meta = hit.meta
    def solrDoc = new SolrInputDocument()
    solrDoc.addField("id", meta.identifier[0])
    // skip browse-* sort-* facet-* for this demo
    def plainFields = meta."*".findAll { field -> !field.name().contains("-") }
    plainFields.each { field ->
        String fieldName = field.name()
        // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
        solrDoc.addField(fieldName + "_txt", field.text())
        // record which fields this document has, see
        // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
        solrDoc.addField("field_names_ss", fieldName)
    }
    server.add(solrDoc)
}
// commit once after all documents have been added
server.commit()
/*
Copyright (c) 2012 Regents of the University of California and contributors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

- Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
- Neither the name of the University of California nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

## ⎉schema.xml.diff
@@ -262,6 +262,10 @@

    <!-- Create a string version of author for faceting -->
    <copyField source="author" dest="author_s"/>
+
+   <!-- test out facet on every field -->
+   <copyField source="*_txt" dest="*_ss"/>
+   <copyField source="*_txt" dest="text"/>

    <!-- Above, multiple source fields are copied to the [text] field.
     Another way to map multiple source fields to the same

## ⎊dynamicField.md

      
    Raw
  

              ⎊dynamicField.md
            
          
    The schema.xml in the example application that ships with solr 4.0.0-BETA defines a couple of dynamic fields we can use.
<dynamicField name="*_txt" type="text_general" indexed="true"  stored="true" multiValued="true"/>
*_txt fields are repeatable and of type text_general
And for facets, we can use
<dynamicField name="*_ss" type="string"  indexed="true"  stored="true" multiValued="true"/>
the *_ss dynamic field stores repeatable strings
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" />
Also, everything gets copied into a field named text of type text_general, for
keyword search across all fields:
<!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema  -->
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
The text_general type used by the *_txt and text fields is defined as
    <!-- A general text field that has reasonable, generic
         cross-language defaults: it tokenizes with StandardTokenizer,
	 removes stop words from case-insensitive "stopwords.txt"
	 (empty by default), and down cases.  At query time only, it
	 also applies synonyms. -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

  
## ⎊readme.md

      
    Raw
  

              ⎊readme.md
            
          
    Also edit solr/example/solr/collection1/conf/solrconfig.xml (or otherwise set the facet.* parameters). The search results template solr/example/solr/collection1/conf/velocity/richtext-doc.vm also needs to be edited to show your fields.
	/* oai_to_solr.groovy
	load oai xml files into solr */
	/*
	groovy oai_to_solr.groovy > urllist.txt
	*/

	import groovy.io.FileType
	// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
	@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
	@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
	import org.apache.solr.client.solrj.impl.HttpSolrServer
	import org.apache.solr.common.SolrInputDocument

	// set up solr connection
	String url = "http://localhost:8983/solr/collection1"
	def server = new HttpSolrServer( url );

	// Walk every harvested OAI record file, index it into solr, and print the
	// record's URL identifier on stdout (the script is run as `... > urllist.txt`).
	// Problems are reported on stderr so they never pollute the URL list.
	def dir = new File("/var/lib/tomcat7/webapps/oai/WEB-INF/harvested_records")
	dir.eachFileRecurse (FileType.FILES) { file ->
	  // some OAI files are not well formed; a bad file is reported and skipped
	  // so that one broken record does not abort the whole load
	  try {
	    def oai_dc = new XmlSlurper().parse(file).declareNamespace(
	      dc: "http://purl.org/dc/elements/1.1/",
	      oai_dc: "http://www.openarchives.org/OAI/2.0/oai_dc/"
	    )
	    // the first dc:identifier that starts with "http" is used as the solr id
	    def id = oai_dc."dc:identifier".findAll{ it.text().startsWith("http") }[0]?.text()
	    if (!id) {
	      // without a usable id the record cannot be addressed in solr; skip it
	      System.err.println "skipping ${file}: no http dc:identifier found"
	      return // ends this closure call only; the directory walk continues
	    }
	    def doc = new SolrInputDocument()
	    println id
	    doc.addField("id", id)
	    oai_dc."*".each{
	      // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
	      doc.addField(it.name()+"_txt", it.text())
	      // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
	      doc.addField("field_names_ss", it.name())
	    }
	    server.add(doc)
	  } catch (Exception e) {
	    // still best-effort, but no longer silent: say which file failed and why
	    System.err.println "skipping ${file}: ${e}"
	  }
	}
	// make everything added above visible to searches
	server.commit()

	/*
	Copyright (c) 2012 Regents of the University of California and contributors
	All rights reserved.

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are met:

	- Redistributions of source code must retain the above copyright notice,
	this list of conditions and the following disclaimer.
	- Redistributions in binary form must reproduce the above copyright notice,
	this list of conditions and the following disclaimer in the documentation
	and/or other materials provided with the distribution.
	- Neither the name of the University of California nor the names of its
	contributors may be used to endorse or promote products derived from this
	software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	POSSIBILITY OF SUCH DAMAGE.
	*/
	/* xtf_to_solr.groovy
	load xtf raw search results into solr */

	// set up the solr library with grape http://wiki.apache.org/solr/Solrj#Use_Groovy_and_Grape
	@Grab(group='org.apache.solr', module='solr-solrj', version='4.0.0-BETA')
	@Grab(group='org.slf4j', module='slf4j-jdk14', version='1.6.6')
	import org.apache.solr.client.solrj.impl.HttpSolrServer
	import org.apache.solr.common.SolrInputDocument

	// connect to the local solr core
	String url = "http://localhost:8983/solr/collection1"
	def server = new HttpSolrServer(url)

	// parse the raw XTF search results and index one solr document per docHit
	def xtfResults = new XmlSlurper().parse("search-browse-all-yes-raw-1")
	xtfResults.docHit.each { hit ->
	    def meta = hit.meta
	    def solrDoc = new SolrInputDocument()
	    solrDoc.addField("id", meta.identifier[0])
	    // skip browse-* sort-* facet-* for this demo
	    def plainFields = meta."*".findAll { field -> !field.name().contains("-") }
	    plainFields.each { field ->
	        String fieldName = field.name()
	        // multi valued tokenized field, gets copied to _ss multi value solr string and full text index
	        solrDoc.addField(fieldName + "_txt", field.text())
	        // record which fields this document has, see
	        // http://tingletech.tumblr.com/post/33016859375/one-trick-is-to-add-a-field-names-multivalued
	        solrDoc.addField("field_names_ss", fieldName)
	    }
	    server.add(solrDoc)
	}
	// commit once after all documents have been added
	server.commit()
	/*
	Copyright (c) 2012 Regents of the University of California and contributors
	All rights reserved.

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are met:

	- Redistributions of source code must retain the above copyright notice,
	this list of conditions and the following disclaimer.
	- Redistributions in binary form must reproduce the above copyright notice,
	this list of conditions and the following disclaimer in the documentation
	and/or other materials provided with the distribution.
	- Neither the name of the University of California nor the names of its
	contributors may be used to endorse or promote products derived from this
	software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	POSSIBILITY OF SUCH DAMAGE.
	*/
	@@ -262,6 +262,10 @@

	<!-- Create a string version of author for faceting -->
	<copyField source="author" dest="author_s"/>
	+
	+ <!-- test out facet on every field -->
	+ <copyField source="*_txt" dest="*_ss"/>
	+ <copyField source="*_txt" dest="text"/>

	<!-- Above, multiple source fields are copied to the [text] field.
	Another way to map multiple source fields to the same