Skip to content

Instantly share code, notes, and snippets.

@zaid
Last active July 21, 2016 15:07
Show Gist options
  • Save zaid/84ebf2cec59941806ee2db55efa1e322 to your computer and use it in GitHub Desktop.
Save zaid/84ebf2cec59941806ee2db55efa1e322 to your computer and use it in GitHub Desktop.
Extractor for attachments from Outlook .msg files
require 'mapi/msg'
require 'fileutils'
require 'pdfkit'
require 'nokogiri'
# Process-wide PDFKit defaults applied to every PDFKit instance created below.
# NOTE(review): these presumably map to the wkhtmltopdf flags of the same
# names (ignore page/media load errors, disable JavaScript) -- confirm against
# the installed PDFKit/wkhtmltopdf versions.
PDFKit.configure do |pdfkit|
  {
    load_error_handling: 'ignore',
    load_media_error_handling: 'ignore',
    disable_javascript: true
  }.each do |option, value|
    pdfkit.default_options[option] = value
  end
end
# Extracts the attachments of every Outlook .msg file found directly inside
# +path+ and renders each message body to "<msg basename>.pdf".
#
# All output is written to an "extracted_files" directory located in the
# PARENT directory of +path+; it is created when missing. A message whose PDF
# already exists is skipped entirely, so the method can be re-run safely.
#
# @param path [String] directory containing the .msg files
# @return [Array<String>] paths of the .msg files that were found
def extract_pdf_files(path)
  extraction_directory = File.join(File.dirname(path), 'extracted_files')
  FileUtils.mkdir_p(extraction_directory)

  Dir.glob(File.join(path, '*.msg')).each do |file_path|
    pdf_filepath = File.join(extraction_directory, "#{File.basename(file_path, '.*')}.pdf")
    # File.exist? is used throughout: File.exists? was removed in Ruby 3.2.
    next if File.exist?(pdf_filepath)

    msg = Mapi::Msg.open(file_path)
    begin
      save_msg_attachments(msg, extraction_directory)
      render_msg_body_pdf(msg, file_path, pdf_filepath)
    ensure
      # Release the underlying file handle even when extraction raises.
      msg.close
    end
  end
end

# Writes every named attachment of +msg+ into +directory+.
def save_msg_attachments(msg, directory)
  msg.attachments.each do |attachment|
    next unless attachment.filename

    # The name comes from the message itself; keep only its last path
    # component so a value such as "../../x" cannot escape +directory+.
    safe_name = File.basename(attachment.filename.to_s.tr('\\', '/'))
    next if safe_name.empty? || %w[. ..].include?(safe_name)

    File.open(File.join(directory, safe_name), 'wb') do |output|
      source = attachment.data.io
      source.rewind
      output << source.read
    end
  end
end

# Renders the body of +msg+ to +pdf_filepath+: HTML parts first, then the
# plain-text parts when no PDF was produced from HTML.
def render_msg_body_pdf(msg, file_path, pdf_filepath)
  # An attachment may have just been saved under this exact name; keep it.
  return if File.exist?(pdf_filepath)

  mime = msg.to_mime
  if mime.parts.nil?
    puts "MIME conversion failed for #{File.basename(file_path)}"
    return
  end

  # Every part targets the same file, so a later part overwrites an earlier one.
  html_mime_parts(mime).each do |mime_part|
    raw_html = html_from_nokogiri(utf8_mime_body(mime_part))
    raw_html ||= msg.properties.body
    next unless raw_html

    PDFKit.new(raw_html).to_file(pdf_filepath)
  end
  return if File.exist?(pdf_filepath)

  text_mime_parts(mime).each do |mime_part|
    PDFKit.new(utf8_mime_body(mime_part)).to_file(pdf_filepath)
  end
end

# Returns the body of +mime_part+ transcoded to UTF-8, replacing characters
# that cannot be converted.
def utf8_mime_body(mime_part)
  mime_part.body.to_s.encode('UTF-8', invalid: :replace, undef: :replace)
end
# Returns the direct child parts of +mime+ whose content type is exactly
# 'text/html'. Nested parts are not inspected.
#
# @param mime [#parts] object whose +parts+ respond to +content_type+
# @return [Array] the matching parts, in their original order
def html_mime_parts(mime)
  mime.parts.select { |candidate| candidate.content_type == 'text/html' }
end
# Collects the plain-text parts of +mime+ that are candidates for rendering.
#
# The result is every direct 'text/plain' part, followed by the 'text/plain'
# children of each direct 'multipart/alternative' part whose body matches
# /Dear/. The /Dear/ filter applies to the nested parts only.
#
# @param mime [#parts] object whose +parts+ respond to +content_type+
# @return [Array] the selected parts
def text_mime_parts(mime)
  top_level_text = mime.parts.select { |part| part.content_type == 'text/plain' }
  alternatives = mime.parts.select { |part| part.content_type == 'multipart/alternative' }

  nested_text = alternatives.flat_map do |alternative|
    alternative.parts.select { |child_part| child_part.content_type == 'text/plain' }
  end

  top_level_text + nested_text.select { |part| part.body =~ /Dear/ }
end
# Parses +html+ with Nokogiri and returns it re-serialized as a UTF-8 HTML
# document.
#
# @param html [String, nil] HTML markup; the visible caller passes a string
#   already transcoded to UTF-8
# @return [String, nil] the serialized document, or nil when +html+ is nil or
#   empty so that callers can fall back to another body source
def html_from_nokogiri(html)
  return nil if html.nil? || html.empty?

  # Document.new takes (uri, external_id) and only builds an EMPTY document,
  # so the markup must go through the parser to end up in the output.
  Nokogiri::HTML::Document.parse(html, nil, 'UTF-8').to_html(encoding: 'UTF-8')
end
@zaid
Copy link
Author

zaid commented Jun 30, 2016

@alexchumak this is a working prototype for extracting attachments from those MSG files.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment