deepakprasanna/doc_pdf_crawler.rb

## doc_pdf_crawler.rb
# Load JRuby's Java integration before touching any jars or Java classes.
require "java"

# Directory holding the PDFBox / POI jars this script depends on.
# NOTE(review): RAILS_ROOT is not defined in this file; it must already be
# set by whatever loads this script -- confirm before running standalone.
JARS_PATH = File.join(RAILS_ROOT, "lib/jars")

# Require every jar so its classes become visible to JRuby.
# The "*.jar" pattern only picks up real jar files, not any name that
# merely ends in "jar".
Dir[File.join(JARS_PATH, "*.jar")].each { |jar| require jar }

# Import the Java classes used below. java_import is used instead of the
# bare `import`, which can clash with other libraries' `import` (e.g. Rake).
java_import org.apache.pdfbox.pdfparser.PDFParser
java_import org.apache.pdfbox.pdmodel.PDDocument
java_import org.apache.pdfbox.util.PDFTextStripper
java_import org.apache.poi.extractor.ExtractorFactory


# PDF parsing: load the document, extract its text, and always release the
# document afterwards.
pdf_path = "sample.pdf"
pdf_file = PDDocument.load pdf_path
begin
  # The stripper works on the loaded PDDocument, so hand it pdf_file.
  pdf_text = PDFTextStripper.new.getText(pdf_file)
ensure
  pdf_file.close # close even when text extraction raises
end
puts pdf_text # text inside the pdf file


# Doc/docx parsing: open the file as a Java input stream, let POI pick a
# suitable extractor for it, and print the extracted text.
doc_path = "sample.doc"
doc_file = java.io.FileInputStream.new(doc_path)
begin
  extractor = ExtractorFactory.createExtractor(doc_file)
  # Read the text before the stream is closed so extraction never runs
  # against a closed input.
  doc_text = extractor.text
ensure
  doc_file.close # close even when extraction raises
end
puts doc_text # text inside doc file


# Final status line printed once both documents have been processed.
puts "Whoa, im done" #super cool
# This script requires "java" and Java jars, so it must be run with JRuby, not the system ruby.
# To execute the script, run the command "jruby doc_pdf_crawler.rb" in your terminal.
	# Load JRuby's Java integration before touching any jars or Java classes.
	require "java"

	# Directory holding the PDFBox / POI jars this script depends on.
	# NOTE(review): RAILS_ROOT is not defined in this file; it must already be
	# set by whatever loads this script -- confirm before running standalone.
	JARS_PATH = File.join(RAILS_ROOT, "lib/jars")

	# Require every jar so its classes become visible to JRuby.
	# The "*.jar" pattern only picks up real jar files, not any name that
	# merely ends in "jar".
	Dir[File.join(JARS_PATH, "*.jar")].each { |jar| require jar }

	# Import the Java classes used below. java_import is used instead of the
	# bare `import`, which can clash with other libraries' `import` (e.g. Rake).
	java_import org.apache.pdfbox.pdfparser.PDFParser
	java_import org.apache.pdfbox.pdmodel.PDDocument
	java_import org.apache.pdfbox.util.PDFTextStripper
	java_import org.apache.poi.extractor.ExtractorFactory


	# PDF parsing: load the document, extract its text, and always release the
	# document afterwards.
	pdf_path = "sample.pdf"
	pdf_file = PDDocument.load pdf_path
	begin
	  # The stripper works on the loaded PDDocument, so hand it pdf_file.
	  pdf_text = PDFTextStripper.new.getText(pdf_file)
	ensure
	  pdf_file.close # close even when text extraction raises
	end
	puts pdf_text # text inside the pdf file


	# Doc/docx parsing: open the file as a Java input stream, let POI pick a
	# suitable extractor for it, and print the extracted text.
	doc_path = "sample.doc"
	doc_file = java.io.FileInputStream.new(doc_path)
	begin
	  extractor = ExtractorFactory.createExtractor(doc_file)
	  # Read the text before the stream is closed so extraction never runs
	  # against a closed input.
	  doc_text = extractor.text
	ensure
	  doc_file.close # close even when extraction raises
	end
	puts doc_text # text inside doc file


	# Final status line printed once both documents have been processed.
	puts "Whoa, im done" #super cool
	# This script requires "java" and Java jars, so it must be run with JRuby, not the system ruby.
	# To execute the script, run the command "jruby doc_pdf_crawler.rb" in your terminal.