Last active
June 25, 2017 21:12
-
-
Save buren/0ee04f634d0b1599bcc1a02966091428 to your computer and use it in GitHub Desktop.
Parse documents with the yomu gem.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parses a document with the yomu gem and collects its text and metadata
# into a single hash.
#
# Usage: ruby <this_script> path/to/document
require 'yomu'

# This could also be a tempfile or any object that responds to #read
filename_or_stream = ARGV[0]

parsed_document = Yomu.new(filename_or_stream)
metadata = parsed_document.metadata

# The available metadata keys vary per document: some files expose no
# 'title', 'meta:creation-date' or 'created' entry at all, in which case the
# corresponding values below are nil.
result = {
  title: metadata['title'],
  created_at: metadata['meta:creation-date'],
  created_at_human: metadata['created'],
  # The original referenced an undefined `filename` local here, which raised
  # NameError; the only input binding in this script is `filename_or_stream`.
  filename: filename_or_stream,
  metadata: metadata,
  text: parsed_document.text
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"Content-Type"=>"application/pdf", | |
"Creation-Date"=>"2015-08-17T22:29:23Z", | |
"X-Parsed-By"=> | |
["org.apache.tika.parser.DefaultParser", | |
"org.apache.tika.parser.pdf.PDFParser"], | |
"access_permission:assemble_document"=>"true", | |
"access_permission:can_modify"=>"true", | |
"access_permission:can_print"=>"true", | |
"access_permission:can_print_degraded"=>"true", | |
"access_permission:extract_content"=>"true", | |
"access_permission:extract_for_accessibility"=>"true", | |
"access_permission:fill_in_form"=>"true", | |
"access_permission:modify_annotations"=>"true", | |
"created"=>"Tue Aug 18 00:29:23 CEST 2015", | |
"dc:format"=>"application/pdf; version=1.4", | |
"dc:title"=>"Jacob | Full Stack Developer & Code Enthusiast", | |
"dcterms:created"=>"2015-08-17T22:29:23Z", | |
"meta:creation-date"=>"2015-08-17T22:29:23Z", | |
"pdf:PDFVersion"=>"1.4", | |
"pdf:encrypted"=>"false", | |
"producer"=>"wkhtmltopdf", | |
"title"=>"Jacob | Full Stack Developer & Code Enthusiast", | |
"xmpTPg:NPages"=>"1"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"Content-Type"=>"application/pdf", | |
"X-Parsed-By"=> | |
["org.apache.tika.parser.DefaultParser", | |
"org.apache.tika.parser.pdf.PDFParser"], | |
"access_permission:assemble_document"=>"true", | |
"access_permission:can_modify"=>"true", | |
"access_permission:can_print"=>"true", | |
"access_permission:can_print_degraded"=>"true", | |
"access_permission:extract_content"=>"true", | |
"access_permission:extract_for_accessibility"=>"true", | |
"access_permission:fill_in_form"=>"true", | |
"access_permission:modify_annotations"=>"true", | |
"dc:format"=>"application/pdf; version=1.5", | |
"pdf:PDFVersion"=>"1.5", | |
"pdf:encrypted"=>"false", | |
"producer"=>"Skia/PDF m59", | |
"xmpTPg:NPages"=>"1"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"Application-Name": "Microsoft Macintosh Word", | |
"Application-Version": "15.0000", | |
"Author": "Lukas", | |
"Character Count": "11503", | |
"Character-Count-With-Spaces": "13302", | |
"Content-Type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | |
"Creation-Date": "2017-05-29T16:28:00Z", | |
"Last-Author": "Lukas", | |
"Last-Modified": "2017-05-29T16:28:00Z", | |
"Last-Save-Date": "2017-05-29T16:28:00Z", | |
"Line-Count": "95", | |
"Page-Count": "8", | |
"Paragraph-Count": "26", | |
"Revision-Number": "2", | |
"Template": "Normal.dotm", | |
"Word-Count": "1825", | |
"X-Parsed-By": [ | |
"org.apache.tika.parser.DefaultParser", | |
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser" | |
], | |
"cp:revision": "2", | |
"creator": "Lukas", | |
"date": "2017-05-29T16:28:00Z", | |
"dc:creator": "Lukas", | |
"dc:publisher": "Business", | |
"dc:title": "Belgium", | |
"dcterms:created": "2017-05-29T16:28:00Z", | |
"dcterms:modified": "2017-05-29T16:28:00Z", | |
"extended-properties:AppVersion": "15.0000", | |
"extended-properties:Application": "Microsoft Macintosh Word", | |
"extended-properties:Company": "Business", | |
"extended-properties:Template": "Normal.dotm", | |
"meta:author": "Lukas", | |
"meta:character-count": "11503", | |
"meta:character-count-with-spaces": "13302", | |
"meta:creation-date": "2017-05-29T16:28:00Z", | |
"meta:last-author": "Lukas", | |
"meta:line-count": "95", | |
"meta:page-count": "8", | |
"meta:paragraph-count": "26", | |
"meta:save-date": "2017-05-29T16:28:00Z", | |
"meta:word-count": "1825", | |
"modified": "2017-05-29T16:28:00Z", | |
"publisher": "Business", | |
"title": "Belgium", | |
"xmpTPg:NPages": "8" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment