Skip to content

Instantly share code, notes, and snippets.

@gr8bit
Last active March 10, 2023 06:34
When exporting CSV with the Microsoft SQL Administrator tool, it generates somewhat invalid CSV which cannot be parsed through regular libraries. This ruby-tool reads these files and converts them into valid CSV, keeping encoding, seperators, and linebreaks from the original file. Run "ruby fix_sql_administrator_csv.rb" for info.
require 'csv'
class MSSQLAdministratorCSVParser
QUOTE = '"'
ESCAPE = '"'
SEPARATORS = %W(; , \t) # when equally probable, separatur occurring first is being picked
UTF8_BOM = "\xef\xbb\xbf".force_encoding('ASCII-8BIT')
def initialize(file_name, options={})
error("file '#{file_name}' does not exist") unless File.exists? file_name
@file_name = file_name
detected_options = detect_options(file_name)
@options = detected_options.merge(options)
@file = read_file(@file_name, detected_options[:is_utf8])
end
# as correctly quoted, readable csv
def as_csv(options={})
options = @options.merge(options)
CSV.generate(
col_sep: options[:separator_char],
row_sep: options[:line_end],
quote_char: options[:quote_char],
force_quotes: true
) do |csv|
parse.each { |line|
csv << line
}
end
end
# for validation purposes
def as_administrator_csv
lines = parse.map{|line|
line.map{|col|
(col.include?(@options[:separator_char]) ? "#{@options[:quote_char]}#{col}#{@options[:quote_char]}" : col)
}.join(@options[:separator_char])
}.join(@options[:line_end])
if @options[:is_utf8]
UTF8_BOM + lines.force_encoding('ASCII-8BIT') + @options[:line_end]
else
lines.force_encoding('ASCII-8BIT') + @options[:line_end]
end
end
private
def detect_options(file_name)
content = File.read(file_name, mode: 'rb')
utf8 = content.start_with? UTF8_BOM
content = read_file(file_name, utf8)
content.match /\A([^\n]*\r?\n?)/
line = $1
error('no information found in first line') unless line && line.match(/\S/)
line.match /(\r?\n?)\z/
line_end = $1
error('cannot determine line ending sequence') unless line_end
error('unsupported line ending "\r"') if line_end == "\r"
separator = SEPARATORS.reverse.max {|sep|
line.count sep
}
error("no field separator (tried #{%W(; , \t).map(&:inspect).join ', '}) found in header") if line.count(separator) == 0
header_count = CSV.parse_line(line,
row_sep: line_end,
col_sep: separator,
quote_char: QUOTE
).length
{
is_utf8: utf8,
line_end: line_end,
separator_char: separator,
quote_char: QUOTE,
escape_char: ESCAPE,
header_count: header_count
}
end
def error(message)
STDERR.puts "Error: #{message}"
exit false
end
def read_file(file_name, is_utf8)
if is_utf8
# Read as UTF8 with BOM
File.read(file_name, mode: 'r:BOM|UTF-8')
else
# Read as ISO-8859-1
File.read(file_name, mode: 'r:ISO-8859-1')
end
end
def parse
content = @file
column_count = @options[:header_count]
lines = []
columns = []
value = ''
skip = 0
in_field = quoted = escaping_or_end = false
content.chars.each.with_index {|char, i|
if skip > 0
skip -= 1
next
end
if in_field
if quoted
# in quoted field
if escaping_or_end
# found another quote char (may be end of the quote or an escape char)
case
when char == @options[:quote_char]
# a quote followed by a quote (in quoting mode) is an escaped quote
escaping_or_end = false
value << char
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# after a closing quote followed by a linebreak the whole line is definitely over
in_field = quoted = escaping_or_end = false
columns << value
value = ''
lines << columns
columns = []
skip = @options[:line_end].length - 1
when char == @options[:separator_char]
# a separator after a closing quote ends the field
in_field = quoted = escaping_or_end = false
columns << value
value = ''
else # ...;"bla "blubb"" ";...
raise "invalid quote escape sequence '#{@options[:quote_char]}#{char.inspect.gsub(/^"(.+)"$/, '\1')}' (parsing column #{columns.length} of line #{lines.length+1})"
end
else
case char
when @options[:quote_char]
# quote in quoting mode starts escaping (...;" bla ""blubb"" ";..) or ends the quote (...;"bla ";...)
escaping_or_end = true
else
value << char
end
end
else
# in unquoted field
case
when char == @options[:separator_char]
# a separator in non-quoting mode ends the currently read in field
in_field = false
columns << value
value = ''
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# in an unquoted field, a linebreak may mean the end of the current line or a linebreak in the current field
# if it's the last column, check if the next line contains a separator (this would mean it is a new line as
# we have enough columns already)
if columns.length >= column_count - 1 && (content[(i+@options[:line_end].length)..-1] =~ /\A.*#{@options[:separator_char]}.*$/ || i+@options[:line_end].length >= content.length)
# end the line
in_field = false
columns << value
value = ''
lines << columns
columns = []
else
# it's not the last column or the next line should add to the last column, treat as linebreak
value << @options[:line_end]
end
skip = @options[:line_end].length - 1
else
value << char
end
end
else
# not in a field
case
when char == @options[:quote_char]
# new quoted field starts
in_field = quoted = true
when char == @options[:separator_char]
# field ends and starts another
columns << value
value = ''
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# a linebreak may indicate the end of the current line or a new field starting with a linebreak
if columns.length == column_count
# enough columns/fields parsed, treat as line end
lines << columns
columns = []
else
# not enough columns/fields in current line, treat as new field
in_field = true
value << @options[:line_end]
end
skip = @options[:line_end].length - 1
else
in_field = true
value << char
end
end
}
if in_field && quoted && !escaping_or_end
raise 'missing closing quote on last line'
end
lines
end
end
file_name = ARGV[0]
unless file_name
puts
puts 'Usage: fix_sql_administrator_csv.rb <file name>'
puts
puts 'Generates "<file name>_fixed.csv" which contains the fixed CSV with the'
puts 'following settings:'
puts '- double quotes as field quote character (...;"content";...)'
puts '- double quotes as quoted field escape character (...;" text ""quoted"" ";...)'
puts '- the same field separation character (comma or semicolon) as in the input file'
puts '- the same line breaks as in the input file (\n or \r\n)'
puts '- the same character encoding (UTF-8 with BOM or ISO-8859-1) as in the input file'
puts '- all fields are quoted'
puts
puts 'If validation fails, a file named "<file name>_validation.csv" will be left'
puts 'which should contain a byte-exact recreation of the input file (for debugging).'
exit
end
puts 'Reading input file...'
parser = MSSQLAdministratorCSVParser.new file_name
puts 'Writing fixed CSV file...'
File.binwrite("#{file_name}_fixed.csv", parser.as_csv)
puts 'Writing validation file...'
validation_file = "#{file_name}_validation.csv"
File.binwrite(validation_file, parser.as_administrator_csv)
puts 'Comparing original and validation file...'
raise 'Generated file does not match input file (keeping validation file for reference)!' unless File.binread(file_name) == File.binread(validation_file)
puts 'Removing validation file...'
File.delete(validation_file)
puts 'All done.'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment