johnmiedema/ApacheTikaSolrIndexSearch

## ApacheTikaSolrIndexSearch
//Use Apache Tika and Solr to crawl, index and search documents
//John Miedema http://johnmiedema.com
//-----------------------------------------------------------
//referenced libraries:
//Apache Tika 1.5
//Apache Solr 4.7.2
//Apache HttpClient 4.3.3 reqd to connect to Solr server
//Noggit json parser reqd for Solr commands
//-----------------------------------------------------------
//after Solr is downloaded, start it using the following commands
//cd path\solr-4.7.2\example
//java -jar start.jar
//-----------------------------------------------------------

package whatson2;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class Main {

	private static SolrServer solr;

	public static void main(String[] args) throws IOException, SAXException, TikaException {

		try {
			solr = new HttpSolrServer("http://localhost:8983/solr/"); //create solr connection
			solr.deleteByQuery( "*:*" ); //delete everything in the index; good for testing

			//location of source documents
			//later this will be switched to a database
			String path = "C:\\content\\";
			String file_html = path + "mobydick.htm";
			String file_txt = path + "robinsoncrusoe.txt";
			String file_pdf = path + "callofthewild.pdf";

			processDocument(file_html);
			processDocument(file_txt);
			processDocument(file_pdf);

			solr.commit(); //after all docs are added, commit to the index

			//now you can search at http://localhost:8983/solr/browse
		}
	    catch  (Exception ex) {
	        System.out.println(ex.getMessage());
	    }
	}

	private static void processDocument(String pathfilename)  {

	    try {
	        InputStream input = new FileInputStream(new File(pathfilename));

	        //use Apache Tika to convert documents in different formats to plain text
	        ContentHandler textHandler = new BodyContentHandler(10*1024*1024);
	        Metadata meta = new Metadata();
	        Parser parser = new AutoDetectParser(); //handles documents in different formats:
	        ParseContext context = new ParseContext();
	        parser.parse(input, textHandler, meta, context); //convert to plain text

	        //collect metadata and content from Tika and other sources

	        //document id must be unique, use guid
		UUID guid = java.util.UUID.randomUUID();
		String docid = guid.toString();

	        //Dublin Core metadata (partial set)
	        String doctitle = meta.get(DublinCore.TITLE);
	        String doccreator = meta.get(DublinCore.CREATOR);

	        //other metadata
	        String docurl = pathfilename; //document url

	        //content
	        String doccontent = textHandler.toString();

	        //call to index
	        indexDocument(docid, doctitle, doccreator, docurl, doccontent);
	    }
	    catch  (Exception ex) {
	        System.out.println(ex.getMessage());
	    }
	}

	private static void indexDocument(String docid, String doctitle, String doccreator, String docurl, String doccontent)  {

		try {
			SolrInputDocument doc = new SolrInputDocument();

			doc.addField("id", docid);

			//map metadata fields to default schema
			//location: path\solr-4.7.2\example\solr\collection1\conf\schema.xml

			//Dublin Core
			//thought: schema could be modified to use Dublin Core
			doc.addField("title", doctitle);
			doc.addField("author", doccreator);

			//other metadata
			doc.addField("url", docurl);

			//content (and text)
			//per schema, the content field is not indexed by default, used for returning and highlighting document content
			//the schema "copyField" command automatically copies this to the "text" field which is indexed
			doc.addField("content", doccontent);

			//indexing
			//when a field is indexed, like "text", Solr will handle tokenization, stemming, removal of stopwords etc, per the schema defn

			//add to index
			solr.add(doc);
		}
		catch (Exception ex) {
			System.out.println(ex.getMessage());
		}
	}
}
	//Use Apache Tika and Solr to crawl, index and search documents
	//John Miedema http://johnmiedema.com
	//-----------------------------------------------------------
	//referenced libraries:
	//Apache Tika 1.5
	//Apache Solr 4.7.2
	//Apache HttpClient 4.3.3 reqd to connect to Solr server
	//Noggit json parser reqd for Solr commands
	//-----------------------------------------------------------
	//after Solr is downloaded, start it using the following commands
	//cd path\solr-4.7.2\example
	//java -jar start.jar
	//-----------------------------------------------------------

	package whatson2;

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.UUID;

	import org.apache.solr.client.solrj.SolrServer;
	import org.apache.solr.client.solrj.impl.HttpSolrServer;
	import org.apache.solr.common.SolrInputDocument;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.DublinCore;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.mime.MimeTypes;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.sax.BodyContentHandler;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.SAXException;

	public class Main {

	private static SolrServer solr;

	public static void main(String[] args) throws IOException, SAXException, TikaException {

	try {
	solr = new HttpSolrServer("http://localhost:8983/solr/"); //create solr connection
	solr.deleteByQuery( ":" ); //delete everything in the index; good for testing

	//location of source documents
	//later this will be switched to a database
	String path = "C:\\content\\";
	String file_html = path + "mobydick.htm";
	String file_txt = path + "robinsoncrusoe.txt";
	String file_pdf = path + "callofthewild.pdf";

	processDocument(file_html);
	processDocument(file_txt);
	processDocument(file_pdf);

	solr.commit(); //after all docs are added, commit to the index

	//now you can search at http://localhost:8983/solr/browse
	}
	catch (Exception ex) {
	System.out.println(ex.getMessage());
	}
	}

	private static void processDocument(String pathfilename) {

	try {
	InputStream input = new FileInputStream(new File(pathfilename));

	//use Apache Tika to convert documents in different formats to plain text
	ContentHandler textHandler = new BodyContentHandler(1010241024);
	Metadata meta = new Metadata();
	Parser parser = new AutoDetectParser(); //handles documents in different formats:
	ParseContext context = new ParseContext();
	parser.parse(input, textHandler, meta, context); //convert to plain text

	//collect metadata and content from Tika and other sources

	//document id must be unique, use guid
	UUID guid = java.util.UUID.randomUUID();
	String docid = guid.toString();

	//Dublin Core metadata (partial set)
	String doctitle = meta.get(DublinCore.TITLE);
	String doccreator = meta.get(DublinCore.CREATOR);

	//other metadata
	String docurl = pathfilename; //document url

	//content
	String doccontent = textHandler.toString();

	//call to index
	indexDocument(docid, doctitle, doccreator, docurl, doccontent);
	}
	catch (Exception ex) {
	System.out.println(ex.getMessage());
	}
	}

	private static void indexDocument(String docid, String doctitle, String doccreator, String docurl, String doccontent) {

	try {
	SolrInputDocument doc = new SolrInputDocument();

	doc.addField("id", docid);

	//map metadata fields to default schema
	//location: path\solr-4.7.2\example\solr\collection1\conf\schema.xml

	//Dublin Core
	//thought: schema could be modified to use Dublin Core
	doc.addField("title", doctitle);
	doc.addField("author", doccreator);

	//other metadata
	doc.addField("url", docurl);

	//content (and text)
	//per schema, the content field is not indexed by default, used for returning and highlighting document content
	//the schema "copyField" command automatically copies this to the "text" field which is indexed
	doc.addField("content", doccontent);

	//indexing
	//when a field is indexed, like "text", Solr will handle tokenization, stemming, removal of stopwords etc, per the schema defn

	//add to index
	solr.add(doc);
	}
	catch (Exception ex) {
	System.out.println(ex.getMessage());
	}
	}
	}