Last active
March 10, 2023 06:34
-
-
Save gr8bit/62202ea89a7e3aff67df2ff080ee8e88 to your computer and use it in GitHub Desktop.
When exporting CSV with the Microsoft SQL Administrator tool, it generates somewhat invalid CSV which cannot be parsed through regular libraries. This ruby-tool reads these files and converts them into valid CSV, keeping encoding, seperators, and linebreaks from the original file. Run "ruby fix_sql_administrator_csv.rb" for info.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv' | |
class MSSQLAdministratorCSVParser | |
QUOTE = '"' | |
ESCAPE = '"' | |
SEPARATORS = %W(; , \t) # when equally probable, separatur occurring first is being picked | |
UTF8_BOM = "\xef\xbb\xbf".force_encoding('ASCII-8BIT') | |
def initialize(file_name, options={}) | |
error("file '#{file_name}' does not exist") unless File.exists? file_name | |
@file_name = file_name | |
detected_options = detect_options(file_name) | |
@options = detected_options.merge(options) | |
@file = read_file(@file_name, detected_options[:is_utf8]) | |
end | |
# as correctly quoted, readable csv | |
def as_csv(options={}) | |
options = @options.merge(options) | |
CSV.generate( | |
col_sep: options[:separator_char], | |
row_sep: options[:line_end], | |
quote_char: options[:quote_char], | |
force_quotes: true | |
) do |csv| | |
parse.each { |line| | |
csv << line | |
} | |
end | |
end | |
# for validation purposes | |
def as_administrator_csv | |
lines = parse.map{|line| | |
line.map{|col| | |
(col.include?(@options[:separator_char]) ? "#{@options[:quote_char]}#{col}#{@options[:quote_char]}" : col) | |
}.join(@options[:separator_char]) | |
}.join(@options[:line_end]) | |
if @options[:is_utf8] | |
UTF8_BOM + lines.force_encoding('ASCII-8BIT') + @options[:line_end] | |
else | |
lines.force_encoding('ASCII-8BIT') + @options[:line_end] | |
end | |
end | |
private | |
def detect_options(file_name) | |
content = File.read(file_name, mode: 'rb') | |
utf8 = content.start_with? UTF8_BOM | |
content = read_file(file_name, utf8) | |
content.match /\A([^\n]*\r?\n?)/ | |
line = $1 | |
error('no information found in first line') unless line && line.match(/\S/) | |
line.match /(\r?\n?)\z/ | |
line_end = $1 | |
error('cannot determine line ending sequence') unless line_end | |
error('unsupported line ending "\r"') if line_end == "\r" | |
separator = SEPARATORS.reverse.max {|sep| | |
line.count sep | |
} | |
error("no field separator (tried #{%W(; , \t).map(&:inspect).join ', '}) found in header") if line.count(separator) == 0 | |
header_count = CSV.parse_line(line, | |
row_sep: line_end, | |
col_sep: separator, | |
quote_char: QUOTE | |
).length | |
{ | |
is_utf8: utf8, | |
line_end: line_end, | |
separator_char: separator, | |
quote_char: QUOTE, | |
escape_char: ESCAPE, | |
header_count: header_count | |
} | |
end | |
def error(message) | |
STDERR.puts "Error: #{message}" | |
exit false | |
end | |
def read_file(file_name, is_utf8) | |
if is_utf8 | |
# Read as UTF8 with BOM | |
File.read(file_name, mode: 'r:BOM|UTF-8') | |
else | |
# Read as ISO-8859-1 | |
File.read(file_name, mode: 'r:ISO-8859-1') | |
end | |
end | |
def parse | |
content = @file | |
column_count = @options[:header_count] | |
lines = [] | |
columns = [] | |
value = '' | |
skip = 0 | |
in_field = quoted = escaping_or_end = false | |
content.chars.each.with_index {|char, i| | |
if skip > 0 | |
skip -= 1 | |
next | |
end | |
if in_field | |
if quoted | |
# in quoted field | |
if escaping_or_end | |
# found another quote char (may be end of the quote or an escape char) | |
case | |
when char == @options[:quote_char] | |
# a quote followed by a quote (in quoting mode) is an escaped quote | |
escaping_or_end = false | |
value << char | |
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end] | |
# after a closing quote followed by a linebreak the whole line is definitely over | |
in_field = quoted = escaping_or_end = false | |
columns << value | |
value = '' | |
lines << columns | |
columns = [] | |
skip = @options[:line_end].length - 1 | |
when char == @options[:separator_char] | |
# a separator after a closing quote ends the field | |
in_field = quoted = escaping_or_end = false | |
columns << value | |
value = '' | |
else # ...;"bla "blubb"" ";... | |
raise "invalid quote escape sequence '#{@options[:quote_char]}#{char.inspect.gsub(/^"(.+)"$/, '\1')}' (parsing column #{columns.length} of line #{lines.length+1})" | |
end | |
else | |
case char | |
when @options[:quote_char] | |
# quote in quoting mode starts escaping (...;" bla ""blubb"" ";..) or ends the quote (...;"bla ";...) | |
escaping_or_end = true | |
else | |
value << char | |
end | |
end | |
else | |
# in unquoted field | |
case | |
when char == @options[:separator_char] | |
# a separator in non-quoting mode ends the currently read in field | |
in_field = false | |
columns << value | |
value = '' | |
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end] | |
# in an unquoted field, a linebreak may mean the end of the current line or a linebreak in the current field | |
# if it's the last column, check if the next line contains a separator (this would mean it is a new line as | |
# we have enough columns already) | |
if columns.length >= column_count - 1 && (content[(i+@options[:line_end].length)..-1] =~ /\A.*#{@options[:separator_char]}.*$/ || i+@options[:line_end].length >= content.length) | |
# end the line | |
in_field = false | |
columns << value | |
value = '' | |
lines << columns | |
columns = [] | |
else | |
# it's not the last column or the next line should add to the last column, treat as linebreak | |
value << @options[:line_end] | |
end | |
skip = @options[:line_end].length - 1 | |
else | |
value << char | |
end | |
end | |
else | |
# not in a field | |
case | |
when char == @options[:quote_char] | |
# new quoted field starts | |
in_field = quoted = true | |
when char == @options[:separator_char] | |
# field ends and starts another | |
columns << value | |
value = '' | |
when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end] | |
# a linebreak may indicate the end of the current line or a new field starting with a linebreak | |
if columns.length == column_count | |
# enough columns/fields parsed, treat as line end | |
lines << columns | |
columns = [] | |
else | |
# not enough columns/fields in current line, treat as new field | |
in_field = true | |
value << @options[:line_end] | |
end | |
skip = @options[:line_end].length - 1 | |
else | |
in_field = true | |
value << char | |
end | |
end | |
} | |
if in_field && quoted && !escaping_or_end | |
raise 'missing closing quote on last line' | |
end | |
lines | |
end | |
end | |
file_name = ARGV[0] | |
unless file_name | |
puts | |
puts 'Usage: fix_sql_administrator_csv.rb <file name>' | |
puts | |
puts 'Generates "<file name>_fixed.csv" which contains the fixed CSV with the' | |
puts 'following settings:' | |
puts '- double quotes as field quote character (...;"content";...)' | |
puts '- double quotes as quoted field escape character (...;" text ""quoted"" ";...)' | |
puts '- the same field separation character (comma or semicolon) as in the input file' | |
puts '- the same line breaks as in the input file (\n or \r\n)' | |
puts '- the same character encoding (UTF-8 with BOM or ISO-8859-1) as in the input file' | |
puts '- all fields are quoted' | |
puts | |
puts 'If validation fails, a file named "<file name>_validation.csv" will be left' | |
puts 'which should contain a byte-exact recreation of the input file (for debugging).' | |
exit | |
end | |
puts 'Reading input file...' | |
parser = MSSQLAdministratorCSVParser.new file_name | |
puts 'Writing fixed CSV file...' | |
File.binwrite("#{file_name}_fixed.csv", parser.as_csv) | |
puts 'Writing validation file...' | |
validation_file = "#{file_name}_validation.csv" | |
File.binwrite(validation_file, parser.as_administrator_csv) | |
puts 'Comparing original and validation file...' | |
raise 'Generated file does not match input file (keeping validation file for reference)!' unless File.binread(file_name) == File.binread(validation_file) | |
puts 'Removing validation file...' | |
File.delete(validation_file) | |
puts 'All done.' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment