nichtich/README.txt

## README.txt
This gist contains two files for simple indexing of PDF files.

== requirements ==
First you need to install Solr (which requires a Java JDK): download a tar or zip archive from http://www.apache.org/dyn/closer.cgi/lucene/solr/ and unpack it to a directory of your choice. Change into that directory and start Solr running in Jetty with:

$ cd example
$ java -jar start.jar

Then point your browser to http://localhost:8983/solr/

== data extraction ==
Metadata and text are extracted from PDF files with 'xpdf' (pdf2solr.sh calls its pdftotext tool).
An alternative to xpdf is PDFbox: http://pdfbox.apache.org/.
You should also have a look at http://aperture.sourceforge.net/
to extract (meta)data from PDF and other files but aperture seems to depend
on half the internet (or I just don't like overblown Java frameworks).

== usage ==
Put pdf2solr.sh and html2solr.xsl in a directory of your choice and make
pdf2solr.sh executable. Then just call pdf2solr.sh with one or more PDF files as arguments.

== limitations ==
The current version only indexes text, author and title fields.

## html2solr.xsl
<?xml version="1.0" encoding="UTF-8"?>
<!--
  html2solr.xsl

  Turns an HTML document (already serialised as XML, root element "html" in no
  namespace) into a Solr update message of the form <add><doc>...</doc></add>.

  Output fields:
    id    : value of the ID stylesheet parameter (required, non-empty)
    *     : one field per head/meta element whose name is listed in the
            mapping table below (or every named meta element when
            keepunknownfields is true)
    text  : string value of the body element
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      xmlns:m="http://example.com" exclude-result-prefixes="xsl m">

<xsl:output method="xml" indent="yes" encoding="UTF-8"/>

<!-- Lookup table: value of meta/@name (meta) to Solr field name (field). -->
<mapping xmlns="http://example.com">
  <map meta="Author" field="author"/>
  <map meta="Title" field="title"/>
</mapping>

<!-- Document id supplied by the caller; the transformation aborts if it is empty. -->
<xsl:param name="ID"/>
<!-- When true, meta elements without a mapping entry are kept under their own name. -->
<xsl:variable name="keepunknownfields" select="false()"/>
<!-- Name of the Solr field that receives the body text. -->
<xsl:variable name="textfield" select="'text'"/>
<!-- document('') is this stylesheet itself, so the mapping table is read from above. -->
<xsl:variable name="mapping" select="document('')//m:map"/>

<xsl:template match="/html">
  <!-- A document without an id must never be emitted. -->
  <xsl:if test="string($ID) = ''">
    <xsl:message terminate="yes">html2solr.xsl: parameter ID must not be empty</xsl:message>
  </xsl:if>
  <add>
    <doc>
      <field name="id"><xsl:value-of select="$ID"/></field>
      <!-- Only named meta elements can become fields; unnamed ones would yield an empty field name. -->
      <xsl:for-each select="head/meta[@name]">
        <xsl:variable name="meta" select="@name"/>
        <xsl:variable name="map2" select="$mapping[@meta=$meta]/@field"/>
        <xsl:if test="$map2 or $keepunknownfields">
          <field>
            <xsl:attribute name="name">
              <xsl:choose>
                <xsl:when test="$map2"><xsl:value-of select="$map2"/></xsl:when>
                <xsl:otherwise><xsl:value-of select="$meta"/></xsl:otherwise>
              </xsl:choose>
            </xsl:attribute>
            <xsl:value-of select="@content"/>
          </field>
        </xsl:if>
      </xsl:for-each>
      <field name="{$textfield}">
        <xsl:value-of select="body"/>
      </field>
    </doc>
  </add>
</xsl:template>

</xsl:stylesheet>

## pdf2solr.sh
#!/bin/bash

# Simple PDF indexer for Solr.
#
# Usage: pdf2solr.sh FILE.pdf [FILE.pdf ...]
#
# For each readable PDF given on the command line the script
#   1. computes a checksum of the file with shasum (used as the document id),
#   2. extracts text and metadata as HTML with pdftotext -htmlmeta,
#   3. rewrites that HTML as XML with xmllint,
#   4. transforms it into a Solr <add> message with xsltproc and $HTML2SOLR,
#   5. POSTs the message to $URL with curl.
# After the loop a <commit/> is posted to $URL.
# Arguments that are not readable files are skipped silently; a PDF for which
# one of the steps 1-4 fails is reported on stderr and skipped.

HTML2SOLR=html2solr.xsl   # resolved relative to the current directory
URL=http://localhost:8983/solr/update

# Work in a private scratch directory that is removed on exit, so no
# intermediate files are left behind and parallel runs do not share files.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2solr.XXXXXX") || {
  echo "pdf2solr: cannot create temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$WORKDIR"' EXIT
TMPFILE=$WORKDIR/doc

# Iterate over "$@" so that file names containing whitespace stay intact.
for PDF in "$@"; do
  [ -r "$PDF" ] || continue

  # Hash from stdin: the output then never contains the file name, so unusual
  # names cannot change what awk picks up as the checksum.
  SHA=$(shasum < "$PDF" | awk '{print $1}')
  if [ -z "$SHA" ]; then
    echo "pdf2solr: cannot compute checksum of $PDF, skipping" >&2
    continue
  fi

  # Remove leftovers of the previous iteration so that a failing step can
  # never cause an earlier PDF's content to be posted under this id.
  rm -f -- "$TMPFILE.htm" "$TMPFILE.xhtml" "$TMPFILE.xml"

  if ! pdftotext -htmlmeta "$PDF" "$TMPFILE.htm"; then
    echo "pdf2solr: pdftotext failed for $PDF, skipping" >&2
    continue
  fi

  # xmllint's diagnostics are discarded on purpose; only an empty result is
  # treated as a failure.
  xmllint --xmlout --dropdtd --html "$TMPFILE.htm" 2> /dev/null > "$TMPFILE.xhtml"
  if [ ! -s "$TMPFILE.xhtml" ]; then
    echo "pdf2solr: xmllint produced no output for $PDF, skipping" >&2
    continue
  fi

  # --stringparam hands the id over as a plain string, no XPath quoting needed.
  if ! xsltproc --stringparam ID "$SHA" "$HTML2SOLR" "$TMPFILE.xhtml" > "$TMPFILE.xml"; then
    echo "pdf2solr: xsltproc failed for $PDF, skipping" >&2
    continue
  fi

  f="$TMPFILE.xml"
  echo "Posting file $f to $URL as $SHA"
  curl "$URL" --data-binary "@$f" -H 'Content-type:text/xml; charset=utf-8'
  echo
done

# send the commit command to make sure all the changes are flushed and visible
curl "$URL" --data-binary '<commit/>' -H 'Content-type:text/xml; charset=utf-8'
echo
	This gist contains two files for simple indexing of PDF files.

	== requirements ==
	First you need to install Solr (which requires a Java JDK): download a tar or zip archive from http://www.apache.org/dyn/closer.cgi/lucene/solr/ and unpack it to a directory of your choice. Change into that directory and start Solr running in Jetty with:

	$ cd example
	$ java -jar start.jar

	Then point your browser to http://localhost:8983/solr/

	== data extraction ==
	Metadata and text are extracted from PDF files with 'xpdf' (pdf2solr.sh calls its pdftotext tool).
	An alternative to xpdf is PDFbox: http://pdfbox.apache.org/.
	You should also have a look at http://aperture.sourceforge.net/
	to extract (meta)data from PDF and other files but aperture seems to depend
	on half the internet (or I just don't like overblown Java frameworks).

	== usage ==
	Put pdf2solr.sh and html2solr.xsl in a directory of your choice and make
	pdf2solr.sh executable. Then just call pdf2solr.sh with one or more PDF files as arguments.

	== limitations ==
	The current version only indexes text, author and title fields.
	<?xml version="1.0" encoding="UTF-8"?>
	<!--
	  html2solr.xsl

	  Turns an HTML document (already serialised as XML, root element "html" in no
	  namespace) into a Solr update message of the form <add><doc>...</doc></add>.

	  Output fields:
	    id    : value of the ID stylesheet parameter (required, non-empty)
	    *     : one field per head/meta element whose name is listed in the
	            mapping table below (or every named meta element when
	            keepunknownfields is true)
	    text  : string value of the body element
	-->
	<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
	      xmlns:m="http://example.com" exclude-result-prefixes="xsl m">

	<xsl:output method="xml" indent="yes" encoding="UTF-8"/>

	<!-- Lookup table: value of meta/@name (meta) to Solr field name (field). -->
	<mapping xmlns="http://example.com">
	  <map meta="Author" field="author"/>
	  <map meta="Title" field="title"/>
	</mapping>

	<!-- Document id supplied by the caller; the transformation aborts if it is empty. -->
	<xsl:param name="ID"/>
	<!-- When true, meta elements without a mapping entry are kept under their own name. -->
	<xsl:variable name="keepunknownfields" select="false()"/>
	<!-- Name of the Solr field that receives the body text. -->
	<xsl:variable name="textfield" select="'text'"/>
	<!-- document('') is this stylesheet itself, so the mapping table is read from above. -->
	<xsl:variable name="mapping" select="document('')//m:map"/>

	<xsl:template match="/html">
	  <!-- A document without an id must never be emitted. -->
	  <xsl:if test="string($ID) = ''">
	    <xsl:message terminate="yes">html2solr.xsl: parameter ID must not be empty</xsl:message>
	  </xsl:if>
	  <add>
	    <doc>
	      <field name="id"><xsl:value-of select="$ID"/></field>
	      <!-- Only named meta elements can become fields; unnamed ones would yield an empty field name. -->
	      <xsl:for-each select="head/meta[@name]">
	        <xsl:variable name="meta" select="@name"/>
	        <xsl:variable name="map2" select="$mapping[@meta=$meta]/@field"/>
	        <xsl:if test="$map2 or $keepunknownfields">
	          <field>
	            <xsl:attribute name="name">
	              <xsl:choose>
	                <xsl:when test="$map2"><xsl:value-of select="$map2"/></xsl:when>
	                <xsl:otherwise><xsl:value-of select="$meta"/></xsl:otherwise>
	              </xsl:choose>
	            </xsl:attribute>
	            <xsl:value-of select="@content"/>
	          </field>
	        </xsl:if>
	      </xsl:for-each>
	      <field name="{$textfield}">
	        <xsl:value-of select="body"/>
	      </field>
	    </doc>
	  </add>
	</xsl:template>

	</xsl:stylesheet>
	#!/bin/bash

	# Simple PDF indexer for Solr.
	#
	# Usage: pdf2solr.sh FILE.pdf [FILE.pdf ...]
	#
	# For each readable PDF given on the command line the script
	#   1. computes a checksum of the file with shasum (used as the document id),
	#   2. extracts text and metadata as HTML with pdftotext -htmlmeta,
	#   3. rewrites that HTML as XML with xmllint,
	#   4. transforms it into a Solr <add> message with xsltproc and $HTML2SOLR,
	#   5. POSTs the message to $URL with curl.
	# After the loop a <commit/> is posted to $URL.
	# Arguments that are not readable files are skipped silently; a PDF for which
	# one of the steps 1-4 fails is reported on stderr and skipped.

	HTML2SOLR=html2solr.xsl   # resolved relative to the current directory
	URL=http://localhost:8983/solr/update

	# Work in a private scratch directory that is removed on exit, so no
	# intermediate files are left behind and parallel runs do not share files.
	WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2solr.XXXXXX") || {
	  echo "pdf2solr: cannot create temporary directory" >&2
	  exit 1
	}
	trap 'rm -rf -- "$WORKDIR"' EXIT
	TMPFILE=$WORKDIR/doc

	# Iterate over "$@" so that file names containing whitespace stay intact.
	for PDF in "$@"; do
	  [ -r "$PDF" ] || continue

	  # Hash from stdin: the output then never contains the file name, so unusual
	  # names cannot change what awk picks up as the checksum.
	  SHA=$(shasum < "$PDF" | awk '{print $1}')
	  if [ -z "$SHA" ]; then
	    echo "pdf2solr: cannot compute checksum of $PDF, skipping" >&2
	    continue
	  fi

	  # Remove leftovers of the previous iteration so that a failing step can
	  # never cause an earlier PDF's content to be posted under this id.
	  rm -f -- "$TMPFILE.htm" "$TMPFILE.xhtml" "$TMPFILE.xml"

	  if ! pdftotext -htmlmeta "$PDF" "$TMPFILE.htm"; then
	    echo "pdf2solr: pdftotext failed for $PDF, skipping" >&2
	    continue
	  fi

	  # xmllint's diagnostics are discarded on purpose; only an empty result is
	  # treated as a failure.
	  xmllint --xmlout --dropdtd --html "$TMPFILE.htm" 2> /dev/null > "$TMPFILE.xhtml"
	  if [ ! -s "$TMPFILE.xhtml" ]; then
	    echo "pdf2solr: xmllint produced no output for $PDF, skipping" >&2
	    continue
	  fi

	  # --stringparam hands the id over as a plain string, no XPath quoting needed.
	  if ! xsltproc --stringparam ID "$SHA" "$HTML2SOLR" "$TMPFILE.xhtml" > "$TMPFILE.xml"; then
	    echo "pdf2solr: xsltproc failed for $PDF, skipping" >&2
	    continue
	  fi

	  f="$TMPFILE.xml"
	  echo "Posting file $f to $URL as $SHA"
	  curl "$URL" --data-binary "@$f" -H 'Content-type:text/xml; charset=utf-8'
	  echo
	done

	# send the commit command to make sure all the changes are flushed and visible
	curl "$URL" --data-binary '<commit/>' -H 'Content-type:text/xml; charset=utf-8'
	echo