Created
September 12, 2013 12:56
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A single entry parsed out of the SQL dump.
#
# Instances are plain attribute holders. The class also owns the list that
# collects every entry created by the import loop (see Entry.all).
class Entry
  attr_accessor :author_name, :author_email, :title, :body, :semester,
                :date_of_submission

  # Builds an entry from the six positional values of one dump row.
  # Values are stored exactly as given; no validation or conversion happens.
  #
  # @param author_name        name of the entry's author
  # @param author_email       e-mail address of the author
  # @param title              entry title
  # @param body               entry text
  # @param semester           expected to look like "200(8|9) (Spring|Fall)"
  # @param date_of_submission submission date as found in the dump
  def initialize(author_name, author_email, title, body, semester, date_of_submission)
    @author_name = author_name
    @author_email = author_email
    @title = title
    @body = body
    @semester = semester # "200(8|9) (Spring|Fall)"
    # Previously accepted but discarded; keep it so the value is not lost.
    @date_of_submission = date_of_submission
  end

  # Returns the class-level array of collected entries, creating it on first
  # use. Callers append to it directly (Entry.all << entry).
  #
  # @return [Array]
  def self.all
    @all ||= []
  end
end
# Two-character escape sequences (backslash + one character) found in the
# dump file, mapped to the literal character each one stands for.
ESCAPE_SEQUENCES = {
  '\\\\' => '\\', # Backslash
  '\\"'  => '"',  # Double quote
  "\\'"  => "'",  # Single quote
  '\\a'  => "\a", # Bell/alert
  '\\b'  => "\b", # Backspace
  '\\r'  => "\r", # Carriage return
  '\\n'  => "\n", # New line
  '\\s'  => "\s", # Space ("\s" is a plain space in a double-quoted string)
  '\\t'  => "\t"  # Tab
}.freeze

# Matches exactly the sequences listed in ESCAPE_SEQUENCES.
ESCAPE_SEQUENCE_PATTERN = /\\[\\"'abrnst]/

# Replaces the escape sequences written by the dump with the characters they
# represent, since the dump file stores them in escaped form.
#
# The string is decoded in a single left-to-right pass. Decoding one sequence
# at a time with successive gsub calls would re-scan already decoded text, so
# an escaped backslash followed by "n" would wrongly turn into a newline.
# Backslashes followed by any other character are left as they are.
#
# @param s [String] text containing backslash escape sequences
# @return [String] a new string with the sequences decoded; s is not modified
def unescape_escapes(s)
  s.gsub(ESCAPE_SEQUENCE_PATTERN, ESCAPE_SEQUENCES)
end
# Fallback table for String#encode: characters with no ISO-8859-1 code point
# are replaced by a hand-picked single byte. The first six values are the
# Windows-1252 bytes for those characters.
# NOTE(review): U+009D is itself representable in ISO-8859-1, so the last
# entry looks unreachable (and 0xFD is an odd choice) -- confirm before
# relying on it.
MOJIBAKE_FALLBACK = {
  "€" => "\x80".force_encoding('ISO-8859-1'),
  "™" => "\x99".force_encoding('ISO-8859-1'),
  "˜" => "\x98".force_encoding('ISO-8859-1'),
  "”" => "\x94".force_encoding('ISO-8859-1'),
  "“" => "\x93".force_encoding('ISO-8859-1'),
  "œ" => "\x9c".force_encoding('ISO-8859-1'),
  "\u009D" => "\xfd".force_encoding('ISO-8859-1'),
}.freeze

# One "(...NULL)" tuple inside an INSERT statement. Every entry ends with
# NULL and is wrapped in parentheses, so a non-greedy match is enough.
ENTRY_TUPLE = %r{\(.*?NULL\)}

# One single-quoted SQL value followed by a comma. Backslash-escaped
# characters (including \') are skipped over, so an escaped quote followed by
# a comma inside a value does not end the value early.
QUOTED_FIELD = /'((?:[^'\\]|\\.)*)',/m

# Undoes the Latin1->UTF8 double encoding of one entry by re-encoding the
# text to ISO-8859-1 bytes and reinterpreting those bytes as UTF-8.
#
# Some entries don't seem to have been UTF-8 originally and cannot be
# converted; those are returned unchanged (there were only a handful, so they
# were cleaned up by hand).
def repair_mojibake(text)
  text.encode('ISO-8859-1', fallback: MOJIBAKE_FALLBACK).force_encoding('UTF-8')
rescue Encoding::UndefinedConversionError
  text
end

# Splits one entry tuple into its quoted values and unescapes each value.
#
# Splitting happens on the still-escaped text: unescaping first would turn
# \' into a bare quote and let the field pattern cut a value in the middle.
def parse_entry_fields(entry_as_string)
  entry_as_string.scan(QUOTED_FIELD).flatten.map { |field| unescape_escapes(field) }
end

# Read the dump line by line and turn every tuple of every INSERT into the
# `Entries` table into an Entry, collected in Entry.all. Aborts (after
# printing the offending tuple and what was parsed from it) as soon as a
# tuple does not yield exactly six values.
File.open('problemchilddb.sql') do |f|
  f.each_line do |line|
    next unless line.include?('INSERT INTO `Entries`')

    line.scan(ENTRY_TUPLE) do |raw_entry|
      entry_as_string = repair_mojibake(raw_entry)
      entry_params = parse_entry_fields(entry_as_string)

      if entry_params.length == 6
        Entry.all << Entry.new(*entry_params)
      else
        p entry_as_string
        p entry_params
        fail "you're parsing shit badly (you didn't end up with 6 arguments, see above)"
      end
    end # line.scan
  end # f.each_line
end # File.open
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment