Created
August 31, 2011 00:37
-
-
Save josh-lauer/1182529 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is within a module named "Parser"
# be gentle, i'm rusty, lol
# Parses an email (or a single MIME part) given as an array of lines.
#
# The header block is everything up to the first blank line. A header line
# containing ";" is split into one segment per piece, and folded continuation
# lines (leading whitespace) are appended to the header they follow; segments
# are joined with "\n". A header name that occurs more than once has its
# values collected into an array. When Content-Type starts with "multipart",
# the body is cut at the boundary and every part is parsed recursively.
#
# The input array and its strings are not modified, and no parser state is
# kept between calls.
#
# @param source [Array<String>] lines of the message
# @return [Hash] {} when source contains no non-blank line; otherwise a hash
#   with :header (key => String or Array<String>), :original_keys
#   (key => header name as written) and :body (the parsed parts for
#   multipart content, or a one-element array holding the raw body lines)
#
# Header keys are produced by String#down_under followed by #intern.
# NOTE(review): down_under and blank? are defined outside this block;
# presumably down_under lower-cases and underscores the name (the code below
# looks up :content_type) -- confirm against their definitions.
#
# NOTE(review): a missing Content-Type header or a missing boundary parameter
# still prints a message and ends the process through Process.exit. Kept so
# existing callers see the same behavior, but a library method should
# probably raise instead.
def Parser.parse_email(source)
  # if source is empty, do nothing
  return {} if source.empty?

  # strip blank lines off the top; an input made only of blank lines is
  # treated like an empty one instead of failing on a nil first line
  lines = source.drop_while { |line| line.blank? }
  return {} if lines.empty?

  results = { :header => {}, :original_keys => {}, :body => [] }
  header = results[:header]

  # the first blank line separates header and body; when there is none the
  # whole input is header and the body is empty
  separator = lines.index { |line| line.blank? } || lines.size
  body_lines = lines[(separator + 1)..-1] || []

  # first pass: build a normalized copy of the header lines. Trailing
  # whitespace and one trailing ";" are removed, and every further ";"
  # separated piece becomes its own continuation line (leading space).
  header_lines = []
  lines[0...separator].each_with_index do |raw_line, position|
    line = raw_line.rstrip
    # the first line must not look like a continuation line
    line = line.lstrip if position.zero?
    line = line.chop if line.end_with?(";")
    leading, *parameters = line.split(";")
    # empty pieces (";;") carry no information and would otherwise read
    # as the end of the header
    header_lines << leading unless leading.nil? || leading.blank?
    parameters.each do |parameter|
      header_lines << " #{parameter}" unless parameter.blank?
    end
  end

  # second pass: turn the normalized lines into header entries
  last_key = nil
  seen_header = false
  header_lines.each do |line|
    # leading whitespace after at least one header means this line is a
    # "folded" continuation of the previous header value
    if seen_header && line =~ /^\s+/
      current = header[last_key]
      target = current.is_a?(Array) ? current.last : current
      target << "\n#{line.strip}"
      next
    end

    # header names are letters, digits and "-" (e.g. Content-MD5)
    raw_key = line[/^[A-Za-z0-9-]+/]
    key = raw_key && raw_key.down_under.intern
    value = line.sub(/^[A-Za-z0-9-]+:/, "").strip

    if header.key?(key)
      # repeated header: promote the stored string to an array of values
      header[key] = [header[key]] if header[key].is_a?(String)
      header[key].push(value)
    else
      header[key] = value
      results[:original_keys][key] = raw_key
    end
    last_key = key
    seen_header = true
  end

  unless header.key?(:content_type)
    puts "NO CONTENT TYPE!!!"
    Process.exit
  end

  # a repeated Content-Type header is stored as an array; use its first value
  content_type = header[:content_type]
  content_type = content_type.first if content_type.is_a?(Array)

  if content_type.start_with?("multipart")
    # the boundary is not necessarily the first parameter, so search for it
    boundary_line = content_type.split("\n").find do |segment|
      segment.start_with?("boundary=")
    end
    if boundary_line.nil?
      puts "Error: broken boundary line!!"
      Process.exit
    end

    if boundary_line.start_with?("boundary=\"")
      # quoted form: drop the prefix and the closing quote
      boundary = boundary_line.sub("boundary=\"", "").chop
    else
      boundary = boundary_line.sub("boundary=", "")
    end

    parts = Parser.break_by_boundary(body_lines, boundary)
    results[:body] = parts.map { |part| Parser.parse_email(part) }
  else
    results[:body] = [body_lines]
  end

  # return the results to the caller
  results
end # end parse()
# Cuts the lines of a multipart body into its parts.
#
# A delimiter line is "--" followed by the boundary; the closing delimiter has
# an additional trailing "--" and ends the scan. Trailing whitespace on a
# delimiter line is ignored, but the line must otherwise equal the delimiter,
# so a longer boundary that merely starts with this one (as happens with
# nested multiparts) is not mistaken for it.
#
# @param source [Array<String>] body lines; not modified
# @param boundary [String] boundary value without the leading "--"
# @return [Array<Array<String>>] the lines between each pair of consecutive
#   delimiters; empty when fewer than two delimiters are found
def Parser.break_by_boundary(source, boundary)
  delimiter = "--#{boundary}"
  closing_delimiter = "#{delimiter}--"

  marks = []
  source.each_with_index do |line, index|
    text = line.rstrip
    if text == closing_delimiter
      marks.push(index)
      break
    elsif text == delimiter
      marks.push(index)
    end
  end

  # each_cons yields nothing for fewer than two marks, so input without any
  # delimiter produces [] instead of raising on nil
  marks.each_cons(2).map do |opening, closing|
    source[(opening + 1)...closing]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment