Skip to content

Instantly share code, notes, and snippets.

@UmarIqbal
Created January 17, 2014 12:36
Show Gist options
  • Save UmarIqbal/8472680 to your computer and use it in GitHub Desktop.
Save UmarIqbal/8472680 to your computer and use it in GitHub Desktop.
package searchIndexer;
import java.io.File;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Extracts the plain text of a PDF file and wraps it in a {@code DocStructure}
 * so it can be handed to the search indexer.
 */
public class PDFReader {

    /**
     * Extracts the text from the given PDF file.
     *
     * <p>This method is best-effort: if the file cannot be opened or its text
     * cannot be extracted, the problem is reported on the console and a
     * {@code DocStructure} with empty content is still returned.
     *
     * <p>The document id is the {@code hashCode()} of the file's name (not its
     * full path), widened to {@code long}; files that share a name therefore
     * share an id.
     *
     * @param file the PDF file to read; must not be {@code null}
     * @return a {@code DocStructure} built from the name-derived id, the file
     *         name, and the extracted text (empty string on failure)
     */
    public DocStructure index(File file) {
        String name = file.getName();
        String content = "";
        try {
            PDDocument doc = PDDocument.load(file);
            try {
                content = new PDFTextStripper().getText(doc);
            } catch (Exception e) {
                // Extraction failed; keep the empty content and carry on.
                System.out.println("Unreadable file. Can not be indexed");
            } finally {
                // Always release the document, even if extraction fails with
                // something the catch above does not handle (e.g. an Error).
                doc.close();
            }
        } catch (Exception e) {
            // Reached when the file cannot be loaded or closing it fails.
            e.printStackTrace();
        }
        return new DocStructure((long) name.hashCode(), name, content);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment