Skip to content

Instantly share code, notes, and snippets.

@rubys
Created February 1, 2015 11:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rubys/6f6bdb8fae57017e2fc8 to your computer and use it in GitHub Desktop.
Save rubys/6f6bdb8fae57017e2fc8 to your computer and use it in GitHub Desktop.
require 'mail'
require 'zlib'
require 'zip'
require 'yaml'
require 'stringio'
# List the entries of an archive by running an external listing command on a
# temporary copy of it.
#
# contents - the raw bytes of the archive
# command  - shell command that lists an archive when the path of the
#            archive is appended to it (e.g. 'unzip -lqq' or 'tar tzvf')
#
# Returns an Array of {length: Integer, name: String} hashes, one per listed
# entry.  Zero-length entries whose name ends in '/' (directories) are
# dropped, as are Mac OS X artifacts (__MACOSX/ folders and .DS_Store files).
def file_list(contents, command)
  file = Tempfile.new('parsemail')
  begin
    file.binmode # archives are binary data
    file.write(contents)
    file.close
    listing = `#{command} #{file.path}`
  ensure
    # remove the temporary copy even if writing or listing raised
    file.close!
  end

  # a listing line is matched as: size, five date/time numbers, entry name
  entries = listing.scan(/\s+(\d+)\s*(?:[-:\s]\d+){5}\s+(.*)/)
  entries.map! { |length, name| {length: length.to_i, name: name} }
  entries.select! { |entry| entry[:length] > 0 || !entry[:name].end_with?('/') }
  entries.select! { |entry| entry[:name] !~ /(^|\/)__MACOSX\// }
  # the dot is escaped so that only a file literally named .DS_Store is dropped
  entries.select! { |entry| entry[:name] !~ /(^|\/)\.DS_Store$/ }
  entries
end
# List the entries of a zip archive.
#
# contents - the raw bytes of the zip file
#
# Returns whatever file_list produces for the 'unzip -lqq' listing command.
def zip_list(contents)
  listing_command = 'unzip -lqq'
  file_list(contents, listing_command)
end
# List the entries of a gzip-compressed tar archive.
#
# contents - the raw bytes of the archive
#
# Returns whatever file_list produces for the 'tar tzvf' listing command.
def tgz_list(contents)
  listing_command = 'tar tzvf'
  file_list(contents, listing_command)
end
# Report whether every entry of a listing carries an acceptable name.
#
# paths - Array of hashes holding the entry name under :name
#
# Returns true unless some name is empty, contains a non-printable
# character, begins with a slash or backslash, or has '..' at its start or
# directly after a slash or backslash.
def safe_paths? paths
  paths.none? do |entry|
    candidate = entry[:name]
    next true if candidate.empty?            # no path found
    next true if candidate =~ /[^[:print:]]/ # non printable characters
    next true if candidate =~ /^[\\\/]/      # leading slash or backslash
    candidate =~ /(^|\\|\/)\.\./ ? true : false # .. in a path
  end
end
# Count the pages of a PDF document by asking the external pdftk tool.
#
# contents - the raw bytes of the PDF file
#
# Returns the Integer following "NumberOfPages:" in the output of
# `pdftk <file> dump_data` (stderr included), or 0 when the output holds no
# such line.
def page_count(contents)
  file = Tempfile.new('parsemail')
  begin
    file.binmode # PDFs are binary data
    file.write(contents)
    file.close
    report = `pdftk #{file.path} dump_data 2>&1`
  ensure
    # remove the temporary copy even if writing or running pdftk raised
    file.close!
  end

  # no match yields nil, and nil.to_i is 0
  report[/NumberOfPages: (\d+)/, 1].to_i
end
# Directory of the mail archive on the remote host; its last component is
# also used as the name of the local directory that is scanned afterwards.
path = '/home/apmail/private-arch/officers-secretary'
database = File.basename(path)

# Work from the directory that contains this script.
script_dir = File.dirname(File.expand_path(__FILE__))
Dir.chdir(script_dir)

# When --update is given, rsync the remote archive into the current
# directory first, excluding '*.yml' files from the transfer.
system "rsync -av --delete --exclude='*.yml' minotaur.apache.org:#{path} ." if ARGV.include?('--update')
# Build or refresh a YAML index (<digits>.yml) for every mbox file found in
# the local archive directory.  An index maps each Message-ID to a summary of
# that message (sender, date, subject and, for messages with attachments,
# the other recipients and a description of each attachment).  The mbox
# file's mtime is stored under :mtime so unchanged files are skipped.
width = 0
Dir[File.join(database, '2*')].sort.each do |name|
  next if name.end_with? '.yml'

  # Progress display: pad to the width of the previous name so that a shorter
  # name fully overwrites the one left on the terminal line.
  print "#{name.ljust(width)}\r"
  width = name.length

  yaml = File.join(database, File.basename(name)[/\d+/] + '.yml')

  # A missing or unreadable index means starting from scratch.  An empty
  # index file loads as false/nil, which is treated the same way.
  # NOTE(review): the index holds Symbol keys and time values; YAML loaders
  # that restrict permitted classes would end up in the rescue below and
  # silently discard the cache -- confirm against the Psych version in use.
  mbox = begin
    YAML.load_file(yaml) || {}
  rescue StandardError
    {}
  end

  mtime = File.mtime(name)
  next if mbox[:mtime] == mtime
  mbox[:mtime] = mtime

  # Read raw bytes: the file may be gzip data and is handled as binary below.
  mails = File.binread(name)
  if name.end_with? '.gz'
    reader = Zlib::GzipReader.new(StringIO.new(mails))
    begin
      mails = reader.read
    ensure
      reader.close
    end
  end

  # Split the mbox on its "From " separator lines; whatever precedes the
  # first separator is not a message and is discarded.
  mails.force_encoding Encoding::ASCII_8BIT
  mails = mails.split(/^From .*/)
  mails.shift

  mails.each do |text|
    # Look for the Message-ID in the raw text first so that messages already
    # in the index do not have to be parsed again.
    id = text[/^Message-ID: <(.*?)>\s*$/i, 1]
    next if id && mbox[id]

    mail = Mail.read_from_string(text)
    id ||= mail.message_id
    next if mbox[id]

    # Prefer the display name of the sender; fall back to the raw header
    # value when it cannot be parsed.  A message without a From header
    # yields nil instead of aborting the whole run.
    sender = mail[:from] && mail[:from].value
    begin
      from = Mail::Address.new(sender).display_name
    rescue StandardError
      from = sender
    end

    subject = begin
      mail.subject
    rescue StandardError
      mail.header['subject'].value.inspect
    end

    mbox[id] = {
      from: mail.from_addrs.first,
      name: from,
      date: mail.date,
      subject: subject
    }

    next if mail.attachments.empty?

    # Everybody addressed by the message, minus the secretary and the sender.
    cc = []
    cc = mail[:to].value.split(/,\s*/) if mail[:to]
    cc += mail[:cc].value.split(/,\s*/) if mail[:cc]
    cc.reject! do |email|
      begin
        address = Mail::Address.new(email).address
        # The block's value decides the rejection.  A `return` here would
        # try to return from the top level of the script instead of handing
        # a value back to reject!.
        address == 'secretary@apache.org' || mail.from_addrs.include?(address)
      rescue StandardError
        true # unparsable addresses are dropped
      end
    end

    parts = mail.attachments.map do |attach|
      data = attach.body.to_s # convert the body once, reuse it below
      result = {name: attach.filename, length: data.length,
                mime: attach.mime_type}
      if attach.filename =~ /\.zip$/
        result[:parts] = zip_list(data)
      elsif attach.filename =~ /\.t(ar\.)?gz$/
        result[:parts] = tgz_list(data)
      elsif attach.filename =~ /\.pdf$/
        result[:pages] = page_count(data)
      end
      result
    end

    mbox[id].merge!(cc: cc, parts: parts.to_a)
  end

  File.open(yaml, 'w') { |file| YAML.dump(mbox, file) }
end

# finish the progress line
puts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment