Skip to content

Instantly share code, notes, and snippets.

@gr8bit
Last active March 10, 2023 06:34
Show Gist options
  • Save gr8bit/62202ea89a7e3aff67df2ff080ee8e88 to your computer and use it in GitHub Desktop.
Save gr8bit/62202ea89a7e3aff67df2ff080ee8e88 to your computer and use it in GitHub Desktop.
When exporting CSV with the Microsoft SQL Administrator tool, it generates somewhat invalid CSV which cannot be parsed through regular libraries. This ruby-tool reads these files and converts them into valid CSV, keeping encoding, seperators, and linebreaks from the original file. Run "ruby fix_sql_administrator_csv.rb" for info.
require 'csv'
class MSSQLAdministratorCSVParser
QUOTE = '"'
ESCAPE = '"'
SEPARATORS = %W(; , \t) # when equally probable, separatur occurring first is being picked
UTF8_BOM = "\xef\xbb\xbf".force_encoding('ASCII-8BIT')
def initialize(file_name, options={})
error("file '#{file_name}' does not exist") unless File.exists? file_name
@file_name = file_name
detected_options = detect_options(file_name)
@options = detected_options.merge(options)
@file = read_file(@file_name, detected_options[:is_utf8])
end
# as correctly quoted, readable csv
def as_csv(options={})
options = @options.merge(options)
CSV.generate(
col_sep: options[:separator_char],
row_sep: options[:line_end],
quote_char: options[:quote_char],
force_quotes: true
) do |csv|
parse.each { |line|
csv << line
}
end
end
# for validation purposes
def as_administrator_csv
lines = parse.map{|line|
line.map{|col|
(col.include?(@options[:separator_char]) ? "#{@options[:quote_char]}#{col}#{@options[:quote_char]}" : col)
}.join(@options[:separator_char])
}.join(@options[:line_end])
if @options[:is_utf8]
UTF8_BOM + lines.force_encoding('ASCII-8BIT') + @options[:line_end]
else
lines.force_encoding('ASCII-8BIT') + @options[:line_end]
end
end
private
def detect_options(file_name)
content = File.read(file_name, mode: 'rb')
utf8 = content.start_with? UTF8_BOM
content = read_file(file_name, utf8)
content.match /\A([^\n]*\r?\n?)/
line = $1
error('no information found in first line') unless line && line.match(/\S/)
line.match /(\r?\n?)\z/
line_end = $1
error('cannot determine line ending sequence') unless line_end
error('unsupported line ending "\r"') if line_end == "\r"
separator = SEPARATORS.reverse.max {|sep|
line.count sep
}
error("no field separator (tried #{%W(; , \t).map(&:inspect).join ', '}) found in header") if line.count(separator) == 0
header_count = CSV.parse_line(line,
row_sep: line_end,
col_sep: separator,
quote_char: QUOTE
).length
{
is_utf8: utf8,
line_end: line_end,
separator_char: separator,
quote_char: QUOTE,
escape_char: ESCAPE,
header_count: header_count
}
end
def error(message)
STDERR.puts "Error: #{message}"
exit false
end
def read_file(file_name, is_utf8)
if is_utf8
# Read as UTF8 with BOM
File.read(file_name, mode: 'r:BOM|UTF-8')
else
# Read as ISO-8859-1
File.read(file_name, mode: 'r:ISO-8859-1')
end
end
def parse
content = @file
column_count = @options[:header_count]
lines = []
columns = []
value = ''
skip = 0
in_field = quoted = escaping_or_end = false
content.chars.each.with_index {|char, i|
if skip > 0
skip -= 1
next
end
if in_field
if quoted
# in quoted field
if escaping_or_end
# found another quote char (may be end of the quote or an escape char)
case
when char == @options[:quote_char]
# a quote followed by a quote (in quoting mode) is an escaped quote
escaping_or_end = false
value << char
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# after a closing quote followed by a linebreak the whole line is definitely over
in_field = quoted = escaping_or_end = false
columns << value
value = ''
lines << columns
columns = []
skip = @options[:line_end].length - 1
when char == @options[:separator_char]
# a separator after a closing quote ends the field
in_field = quoted = escaping_or_end = false
columns << value
value = ''
else # ...;"bla "blubb"" ";...
raise "invalid quote escape sequence '#{@options[:quote_char]}#{char.inspect.gsub(/^"(.+)"$/, '\1')}' (parsing column #{columns.length} of line #{lines.length+1})"
end
else
case char
when @options[:quote_char]
# quote in quoting mode starts escaping (...;" bla ""blubb"" ";..) or ends the quote (...;"bla ";...)
escaping_or_end = true
else
value << char
end
end
else
# in unquoted field
case
when char == @options[:separator_char]
# a separator in non-quoting mode ends the currently read in field
in_field = false
columns << value
value = ''
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# in an unquoted field, a linebreak may mean the end of the current line or a linebreak in the current field
# if it's the last column, check if the next line contains a separator (this would mean it is a new line as
# we have enough columns already)
if columns.length >= column_count - 1 && (content[(i+@options[:line_end].length)..-1] =~ /\A.*#{@options[:separator_char]}.*$/ || i+@options[:line_end].length >= content.length)
# end the line
in_field = false
columns << value
value = ''
lines << columns
columns = []
else
# it's not the last column or the next line should add to the last column, treat as linebreak
value << @options[:line_end]
end
skip = @options[:line_end].length - 1
else
value << char
end
end
else
# not in a field
case
when char == @options[:quote_char]
# new quoted field starts
in_field = quoted = true
when char == @options[:separator_char]
# field ends and starts another
columns << value
value = ''
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
# a linebreak may indicate the end of the current line or a new field starting with a linebreak
if columns.length == column_count
# enough columns/fields parsed, treat as line end
lines << columns
columns = []
else
# not enough columns/fields in current line, treat as new field
in_field = true
value << @options[:line_end]
end
skip = @options[:line_end].length - 1
else
in_field = true
value << char
end
end
}
if in_field && quoted && !escaping_or_end
raise 'missing closing quote on last line'
end
lines
end
end
file_name = ARGV[0]
unless file_name
puts
puts 'Usage: fix_sql_administrator_csv.rb <file name>'
puts
puts 'Generates "<file name>_fixed.csv" which contains the fixed CSV with the'
puts 'following settings:'
puts '- double quotes as field quote character (...;"content";...)'
puts '- double quotes as quoted field escape character (...;" text ""quoted"" ";...)'
puts '- the same field separation character (comma or semicolon) as in the input file'
puts '- the same line breaks as in the input file (\n or \r\n)'
puts '- the same character encoding (UTF-8 with BOM or ISO-8859-1) as in the input file'
puts '- all fields are quoted'
puts
puts 'If validation fails, a file named "<file name>_validation.csv" will be left'
puts 'which should contain a byte-exact recreation of the input file (for debugging).'
exit
end
puts 'Reading input file...'
parser = MSSQLAdministratorCSVParser.new file_name
puts 'Writing fixed CSV file...'
File.binwrite("#{file_name}_fixed.csv", parser.as_csv)
puts 'Writing validation file...'
validation_file = "#{file_name}_validation.csv"
File.binwrite(validation_file, parser.as_administrator_csv)
puts 'Comparing original and validation file...'
raise 'Generated file does not match input file (keeping validation file for reference)!' unless File.binread(file_name) == File.binread(validation_file)
puts 'Removing validation file...'
File.delete(validation_file)
puts 'All done.'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment