binarybana/io.jl

## io.jl
require("profile")
using Profile
@profile begin
##############################################################################
#
# Low-level text parsing
#
##############################################################################

# Split one line of *-separated text into its fields.
#
# A very simple two-state scanner: outside quotes, `separator` ends the
# current field; inside a region bounded by `quotation_character`,
# occurrences of `separator` are kept as ordinary field content. The
# quotation characters themselves never reach the output.
#
# Arguments:
#   line                -- the text to split, without its line terminator
#   separator           -- the single character that delimits fields
#   quotation_character -- the single character that opens and closes a
#                          quoted region
#
# Returns a vector of UTF8String with one entry per field. A line holding
# k unquoted separators yields k + 1 fields, so an empty line yields [""].
#
# For now, this is restricted to incoming strings that use single Char
# encodings: every character is stored into a Uint8 buffer.
function split_separated_line{T <: String}(line::T,
                                           separator::Char,
                                           quotation_character::Char)
  nchars = strlen(line)
  inside_quotes = false
  # Every character may be a separator, so up to nchars + 1 fields can occur
  items = Array(UTF8String, nchars + 1)
  # One scratch buffer serves every field: the range index used below copies
  # the bytes out before the next field overwrites them
  current_item = Array(Uint8, nchars)
  total_items = 0
  i = 0 # Number of bytes stored so far for the current field
  for chr in line
    if chr == quotation_character
      # Quotes only toggle the state
      inside_quotes = !inside_quotes
    elseif !inside_quotes && chr == separator
      total_items += 1
      items[total_items] = bytestring(current_item[1:i])
      i = 0
    else
      i += 1
      current_item[i] = chr
    end
  end
  # Whatever follows the last separator is the final field
  total_items += 1
  items[total_items] = bytestring(current_item[1:i])
  return items[1:total_items]
end

##############################################################################
#
# Inferential steps
#
##############################################################################

# Map the ending of `filename` onto the field separator of that format:
# names ending in "csv" use ',', "tsv" use '\t' and "wsv" use ' '.
# Any other ending raises an error that names the file.
function determine_separator{T <: String}(filename::T)
  if ismatch(r"csv$", filename)
    return ','
  end
  if ismatch(r"tsv$", filename)
    return '\t'
  end
  if ismatch(r"wsv$", filename)
    return ' '
  end
  error("Unable to determine separator used in $filename")
end

# Count the data rows stored in `filename`: the number of lines in the
# file, minus one when `header` says the first line holds column names.
function determine_nrows{T <: String}(filename::T, header::Bool)
  header_lines = header ? 1 : 0
  return countlines(filename) - header_lines
end

# Count the columns of `filename` by splitting its first line on
# `separator`, honoring regions quoted with `quotation_character`.
function determine_ncols{T <: String}(filename::T,
                                      separator::Char,
                                      quotation_character::Char)
  stream = open(filename, "r")
  first_line = chomp(readline(stream))
  close(stream)
  first_fields = split_separated_line(first_line, separator, quotation_character)
  return length(first_fields)
end

# Work out the column names for the table behind `io`.
#
# The first line of the stream is read and split into fields. When `header`
# is true those fields are the names; otherwise the names come from
# generate_column_names, asked for one name per field. The stream is
# rewound to its start before the first line is read and again before
# returning. An empty first line raises an error.
function determine_column_names(io::IOStream,
                                separator::Char,
                                quotation_character::Char,
                                header::Bool)
  seek(io, 0)
  first_line = chomp(readline(io))

  if length(first_line) == 0
    error("Failed to determine column names from an empty data source")
  end

  fields = split_separated_line(first_line, separator, quotation_character)

  # Leave the stream at its start for whoever reads the data next
  seek(io, 0)

  if header
    return fields
  end
  return generate_column_names(length(fields))
end

# Read data line-by-line: up to `nrows` lines are taken from `io` and each
# is split into `ncols` fields.
# Line-by-line reading may be IO-bound
#
# Reading stops early at the first empty line.
# NOTE(review): presumably this is also how the end of the stream is
# detected -- confirm what readline returns at EOF.
#
# Returns a matrix of UTF8String with one row per line actually read, or a
# 0x0 matrix when no line was read. Raises an error naming the offending
# row when a line does not split into exactly `ncols` fields.
function read_separated_text(io::IOStream,
                             nrows::Int,
                             ncols::Int,
                             separator::Char,
                             quotation_character::Char)
  text_data = Array(UTF8String, nrows, ncols)

  i = 0
  while i < nrows
    line = chomp(readline(io))
    if length(line) == 0
      break
    end
    i += 1
    fields = split_separated_line(line, separator, quotation_character)
    # Report ragged rows explicitly instead of failing inside the assignment
    if length(fields) != ncols
      error("Expected $ncols fields in data row $i but found $(length(fields))")
    end
    for j in 1:ncols
      text_data[i, j] = fields[j]
    end
  end

  if i == 0
    return Array(UTF8String, 0, 0)
  elseif i == nrows
    # Every requested row was filled, so no trimming copy is needed
    return text_data
  else
    return text_data[1:i, :]
  end
end

# Infer a type for every column of `text_data`.
#
# Each column starts out as Int64 and is passed through tightest_type once
# for every entry that is not listed in `missingness_indicators`. Scanning
# of a column stops as soon as its type has become a String type. Returns
# a vector holding one type per column.
function infer_column_types{S <: String, T <: String}(text_data::Matrix{S},
                                         missingness_indicators::Vector{T})
  nrows, ncols = size(text_data)

  # May want to shift to using numeric codes for types
  column_types = Array(Any, ncols)

  for j in 1:ncols
    # Default to Int64 until the data forces a demotion
    inferred = Int64
    for i in 1:nrows
      if inferred <: String
        break
      end
      entry = text_data[i, j]
      if !contains(missingness_indicators, entry)
        inferred = tightest_type(entry, inferred)
      end
    end
    column_types[j] = inferred
  end

  return column_types
end

# Guess the metadata of the delimited file `filename`.
# TODO: Split this into determine_column_names and infer_column_types
#
# Column types are inferred from the data rows. The short-circuit option
# allows one to just guess metadata for massive files: with it set,
# inference currently maxes out after 1,000 lines.
#
# Returns the tuple (column_names, column_types, nrows). The row count
# comes from determine_nrows and is not limited by `short_circuit`.
function determine_metadata{T <: String}(filename::String,
                                         separator::Char,
                                         quotation_character::Char,
                                         missingness_indicators::Vector{T},
                                         header::Bool,
                                         short_circuit::Bool)
  nrows = determine_nrows(filename, header)
  maxlines = short_circuit ? min(nrows, 1_000) : nrows

  io = open(filename, "r")
  column_names = determine_column_names(io, separator, quotation_character, header)
  if header # Skip the header for type inference
    readline(io)
  end
  sample = read_separated_text(io, maxlines, length(column_names), separator, quotation_character)
  close(io)

  # Return the inferred column names and types
  return (column_names, infer_column_types(sample, missingness_indicators), nrows)
end

# Convenience form of determine_metadata that infers the remaining
# settings: the separator comes from the filename's ending, the quotation
# character is '"', and "" and "NA" are treated as missing values.
# Returns whatever the six-argument method returns.
function determine_metadata{T <: String}(filename::T,
                                         header::Bool,
                                         short_circuit::Bool)
  separator = determine_separator(filename)
  quotation_character = '"'
  # Same defaults that read_table(filename) uses
  missingness_indicators = ["", "NA"]
  return determine_metadata(filename, separator, quotation_character, missingness_indicators, header, short_circuit)
end

# Turn a matrix of text into a DataFrame with typed columns.
#
# Arguments:
#   text_data              -- one row per record, one column per field
#   missingness_indicators -- strings that mark an entry as missing
#   column_types           -- target type of every column
#   column_names           -- name of every column
#
# Entries equal to a missingness indicator are overwritten inside
# `text_data` with the string form of baseval(column type) and flagged as
# missing. Int64, Float64, UTF8String and ASCIIString columns are
# supported; any other type raises an error, as does a mismatch between
# the number of columns and the number of types or names. A `text_data`
# without rows yields a DataFrame with zero rows.
function convert_to_dataframe{R <: String,
                              S <: String,
                              T <: String}(text_data::Matrix{R},
                                           missingness_indicators::Vector{S},
                                           column_types::Vector,
                                           column_names::Vector{T})
  nrows, ncols = size(text_data)

  # Nothing to convert when the text data is empty
  if nrows == 0
    return DataFrame(column_types, column_names, 0)
  end

  # The user-specified types and names must both cover every column
  if !(ncols == length(column_types) && ncols == length(column_names))
    error("Column types and names do not match the input data's size")
  end

  # One DataVec per column, collected in an Array of Any's
  columns = Array(Any, ncols)

  for j in 1:ncols
    column_type = column_types[j]

    # Flag missing entries and give them a placeholder that parses as the
    # column's type
    is_missing = BitVector(nrows)
    for i in 1:nrows
      is_missing[i] = contains(missingness_indicators, text_data[i, j])
      if is_missing[i]
        text_data[i, j] = string(baseval(column_type))
      end
    end

    column_text = text_data[1:nrows, j]
    if column_type == Int64
      values = int(column_text)
    elseif column_type == Float64
      values = float(column_text)
    elseif column_type == UTF8String
      values = convert(Array{UTF8String, 1}, column_text)
    elseif column_type == ASCIIString
      values = convert(Array{ASCIIString, 1}, column_text)
    else
      error("Column cannot be converted to type: $(column_types[j])")
    end
    columns[j] = DataVec(values, is_missing)
  end

  return DataFrame(columns, column_names)
end

##############################################################################
#
# Text input
#
##############################################################################

# Read at most `minibatch_size` lines from an IOStream and return them as
# a DataFrame of at most that many rows. The number of columns is taken
# from the length of `column_types`.
function read_minibatch{R <: String,
                        S <: String,
                        T}(io::IOStream,
                           separator::Char,
                           quotation_character::Char,
                           missingness_indicators::Vector{R},
                           column_names::Vector{S},
                           column_types::Vector{T},
                           minibatch_size::Int64)
  # The data stays an array of strings until type conversion
  batch_text = read_separated_text(io, minibatch_size, length(column_types), separator, quotation_character)
  return convert_to_dataframe(batch_text, missingness_indicators, column_types, column_names)
end

# Read an entire data set into a DataFrame from an IOStream
# TODO: Make only one IO pass through the data
#
# The stream is rewound first and, when `header` is true, its first line
# is discarded. Up to `nrows` lines are then read, column types are
# inferred from them, and the text is converted into typed columns. When
# no data line is found, the result is a zero-row DataFrame whose columns
# are all typed Any.
function read_table{R <: String,
                    S <: String}(io::IOStream,
                                 separator::Char,
                                 quotation_character::Char,
                                 missingness_indicators::Vector{R},
                                 header::Bool,
                                 column_names::Vector{S},
                                 nrows::Int64)
  # Start from the beginning of the stream, past the header if there is one
  seek(io, 0)
  if header
    readline(io)
  end

  ncols = length(column_names)
  text_data = read_separated_text(io, nrows, ncols, separator, quotation_character)

  if size(text_data, 1) != 0
    column_types = infer_column_types(text_data, missingness_indicators)
    return convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)
  end

  # The data set is empty except possibly for a header line
  column_types = {Any for i in 1:ncols}
  return DataFrame(column_types, column_names, 0)
end

# Read the delimited file `filename` into a DataFrame, inferring every
# setting: the separator comes from the filename's ending, fields may be
# quoted with '"', "" and "NA" mark missing values, and the first line is
# taken to hold the column names.
function read_table{T <: String}(filename::T)
  separator = determine_separator(filename)
  quotation_character = '"'
  header = true
  nrows = determine_nrows(filename, header)

  io = open(filename, "r")
  column_names = determine_column_names(io, separator, quotation_character, header)
  df = read_table(io, separator, quotation_character, ["", "NA"], header, column_names, nrows)
  close(io)
  return df
end

end #profile

# Profiling driver: load one CSV file through read_table, then ask the
# profiler for its report.
# NOTE(review): the path below is specific to one machine.
# NOTE(review): presumably `@profile report` prints the timings gathered
# for the code wrapped in `@profile begin ... end` -- confirm against the
# profile package.
read_table("/home/bana/Downloads/movies.csv")
@profile report

##############################################################################
#
# Text output
#
##############################################################################

# Quotation rules for writing a single value:
#   * string fields are always quoted
#   * real-valued fields are never quoted
#   * non-string, non-real-valued fields are converted to a string and
#     quoted
function in_quotes{T <: String}(val::T, quotation_character::Char)
  return strcat(quotation_character, val, quotation_character)
end

function in_quotes{T <: Real}(val::T, quotation_character::Char)
  return string(val)
end

function in_quotes{T <: Any}(val::T, quotation_character::Char)
  text = string(val)
  return strcat(quotation_character, text, quotation_character)
end

# Write `df` to `io` as delimited text: one line holding the column names,
# then one line per row. Fields are joined by `separator` and quoted as
# in_quotes decides.
# TODO: write_table should do more to react to the type of each column
# Need to increase precision of string representation of Float64's
function print_table(df::DataFrame,
                     io::IOStream,
                     separator::Char,
                     quotation_character::Char)
  n, p = nrow(df), ncol(df)
  column_names = colnames(df)

  # Every field is followed by the separator, except the last one of a
  # line, which is followed by a newline
  for j in 1:p
    print(io, in_quotes(column_names[j], quotation_character))
    print(io, j < p ? separator : '\n')
  end

  for i in 1:n
    for j in 1:p
      print(io, in_quotes(df[i, j], quotation_character))
      print(io, j < p ? separator : '\n')
    end
  end
end

# Print `df` to OUTPUT_STREAM using the given separator and quotation
# character.
function print_table(df::DataFrame, separator::Char, quotation_character::Char)
  destination = OUTPUT_STREAM
  return print_table(df, destination, separator, quotation_character)
end

# Print `df` to OUTPUT_STREAM as comma-separated text quoted with '"'.
function print_table(df::DataFrame)
  return print_table(df, OUTPUT_STREAM, ',', '"')
end

# Write `df` as delimited text to the file `filename`, which is opened in
# "w" mode and closed again once the table has been printed.
function write_table{T <: String}(df::DataFrame,
                                  filename::T,
                                  separator::Char,
                                  quotation_character::Char)
  destination = open(filename, "w")
  print_table(df, destination, separator, quotation_character)
  close(destination)
end

# Write `df` to `filename`, inferring the configuration settings: the
# separator comes from the filename's ending and fields are quoted
# with '"'.
function write_table{T <: String}(df::DataFrame, filename::T)
  write_table(df, filename, determine_separator(filename), '"')
end

##############################################################################
#
# Binary serialization
#
##############################################################################

# Wrappers for serialization

# Serialize `d` into the file `filename`, which is opened in "w" mode.
function save(filename, d)
    stream = open(filename, "w")
    serialize(stream, d)
    close(stream)
end

# Read an object back from the file `filename`.
# The value produced by deserialize is called with no arguments, and the
# result of that call is what gets returned.
# NOTE(review): presumably deserialize hands back a thunk for objects
# written by save -- confirm.
function load_df(filename)
    stream = open(filename)
    thunk = deserialize(stream)
    dd = thunk()
    close(stream)
    return dd
end

## profile_results
julia> require("DataFrames")
   count  time(%)  time(s)
   58789     0.00  0.000186     #  /home/bana/.julia/DataFrames/src/io.jl, line 20
   58789     0.02  0.028055     #  line 21
   58789     0.01  0.006174     #  line 22
   58789     0.00  0.000105     #  line 23
   58789     0.00  0.000124     #  line 24
 5941920     0.03  0.038879     #  line 26
  176389     0.00  0.002492     #  line 29
  176389     0.00  0.000460     #  line 30
 1267077     0.01  0.007016     #  line 32
  176389     0.00  0.003109     #  line 36
  176389     0.00  0.000457     #  line 37
 1410936     0.01  0.016423     #  line 40
 1410936     1.60  1.976091     #  line 41
 1410936     0.00  0.002732     #  line 42
 1410936     0.11  0.134190     #  line 43
 2911129     0.02  0.019880     #  line 45
   58789     0.00  0.000789     #  line 50
   58789     0.01  0.012929     #  line 51
   58789     0.12  0.146372     #  line 52
       1     0.03  0.033243     #  /home/bana/.julia/DataFrames/src/io.jl, line 74
       1     0.00  0.000005     #  /home/bana/.julia/DataFrames/src/io.jl, line 95
       1     0.01  0.007993     #  line 96
       1     0.12  0.150127     #  line 102
       1     0.00  0.000003     #  line 105
       1     0.01  0.009379     #  /home/bana/.julia/DataFrames/src/io.jl, line 120
       1     0.00  0.000000     #  line 122
   58788     0.56  0.694576     #  line 124
   58788     0.36  0.442168     #  line 128
   58788    17.30  21.355132    #  line 129
       1     0.00  0.000000     #  /home/bana/.julia/DataFrames/src/io.jl, line 141
       1     0.00  0.000002     #  line 145
      25     0.00  0.000001     #  line 147
 1298553     5.01  6.185185     #  line 156
       1     0.00  0.000000     #  line 161
       1     0.00  0.000000     #  /home/bana/.julia/DataFrames/src/io.jl, line 210
       1     0.00  0.000004     #  line 223
      25     0.00  0.000306     #  line 228
  107437     0.44  0.539362     #  line 231
  107437     0.01  0.009928     #  line 232
 1362263     1.52  1.877132     #  line 234
      12     0.76  0.934277     #  line 238
      11     0.67  0.832819     #  line 240
       2     0.00  0.000717     #  line 242
      25     0.02  0.023742     #  line 248
       1     0.10  0.123444     #  line 252
       1     0.00  0.000000     #  line 253
       1     0.00  0.000005     #  /home/bana/.julia/DataFrames/src/io.jl, line 294
       1     0.00  0.000078     #  line 298
       1     0.00  0.000000     #  line 302
       1    18.33  22.622687    #  line 305 *read_separated_text(io, nrows, ncols, separator, quotation_character)*
       1    10.65  13.139062    #  line 314 *infer_column_types(text_data, missingness_indicators)*
       1     6.35  7.838155     #  line 317 *convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)*
       1     0.00  0.000015     #  /home/bana/.julia/DataFrames/src/io.jl, line 322
       1     0.00  0.000000     #  line 323
       1     0.00  0.000002     #  line 324
       1     0.00  0.000000     #  line 325
       1     0.03  0.033252     #  line 326
       1     0.00  0.000026     #  line 327
       1     0.15  0.190201     #  line 328
       1    35.63  43.976593    #  line 329
       1     0.00  0.000017     #  line 336
       1     0.00  0.000000     #  line 337
	require("profile")
	using Profile
	@profile begin
	##############################################################################
	#
	# Low-level text parsing
	#
	##############################################################################

	# Split one line of *-separated text into its fields.
	#
	# A very simple two-state scanner: outside quotes, `separator` ends the
	# current field; inside a region bounded by `quotation_character`,
	# occurrences of `separator` are kept as ordinary field content. The
	# quotation characters themselves never reach the output.
	#
	# Returns a vector of UTF8String with one entry per field. A line holding
	# k unquoted separators yields k + 1 fields, so an empty line yields [""].
	#
	# For now, this is restricted to incoming strings that use single Char
	# encodings: every character is stored into a Uint8 buffer.
	function split_separated_line{T <: String}(line::T,
	                                           separator::Char,
	                                           quotation_character::Char)
	  nchars = strlen(line)
	  inside_quotes = false
	  # Every character may be a separator, so up to nchars + 1 fields can occur
	  items = Array(UTF8String, nchars + 1)
	  # One scratch buffer serves every field: the range index used below copies
	  # the bytes out before the next field overwrites them
	  current_item = Array(Uint8, nchars)
	  total_items = 0
	  i = 0 # Number of bytes stored so far for the current field
	  for chr in line
	    if chr == quotation_character
	      # Quotes only toggle the state
	      inside_quotes = !inside_quotes
	    elseif !inside_quotes && chr == separator
	      total_items += 1
	      items[total_items] = bytestring(current_item[1:i])
	      i = 0
	    else
	      i += 1
	      current_item[i] = chr
	    end
	  end
	  # Whatever follows the last separator is the final field
	  total_items += 1
	  items[total_items] = bytestring(current_item[1:i])
	  return items[1:total_items]
	end

	##############################################################################
	#
	# Inferential steps
	#
	##############################################################################

	function determine_separator{T <: String}(filename::T)
	if ismatch(r"csv$", filename)
	return ','
	elseif ismatch(r"tsv$", filename)
	return '\t'
	elseif ismatch(r"wsv$", filename)
	return ' '
	else
	error("Unable to determine separator used in $filename")
	end
	end

	function determine_nrows{T <: String}(filename::T, header::Bool)
	total_lines = countlines(filename)
	if header
	return total_lines - 1
	else
	return total_lines
	end
	end

	function determine_ncols{T <: String}(filename::T,
	separator::Char,
	quotation_character::Char)
	io = open(filename, "r")
	line = chomp(readline(io))
	close(io)
	return length(split_separated_line(line, separator, quotation_character))
	end

	function determine_column_names(io::IOStream,
	separator::Char,
	quotation_character::Char,
	header::Bool)
	seek(io, 0)
	line = chomp(readline(io))

	if length(line) == 0
	error("Failed to determine column names from an empty data source")
	end

	fields = split_separated_line(line, separator, quotation_character)

	if header
	seek(io, 0)
	return fields
	else
	seek(io, 0)
	column_names = generate_column_names(length(fields))
	end
	end

	# Read data line-by-line: up to `nrows` lines are taken from `io` and each
	# is split into `ncols` fields.
	# Line-by-line reading may be IO-bound
	#
	# Reading stops early at the first empty line.
	# NOTE(review): presumably this is also how the end of the stream is
	# detected -- confirm what readline returns at EOF.
	#
	# Returns a matrix of UTF8String with one row per line actually read, or a
	# 0x0 matrix when no line was read. Raises an error naming the offending
	# row when a line does not split into exactly `ncols` fields.
	function read_separated_text(io::IOStream,
	                             nrows::Int,
	                             ncols::Int,
	                             separator::Char,
	                             quotation_character::Char)
	  text_data = Array(UTF8String, nrows, ncols)

	  i = 0
	  while i < nrows
	    line = chomp(readline(io))
	    if length(line) == 0
	      break
	    end
	    i += 1
	    fields = split_separated_line(line, separator, quotation_character)
	    # Report ragged rows explicitly instead of failing inside the assignment
	    if length(fields) != ncols
	      error("Expected $ncols fields in data row $i but found $(length(fields))")
	    end
	    for j in 1:ncols
	      text_data[i, j] = fields[j]
	    end
	  end

	  if i == 0
	    return Array(UTF8String, 0, 0)
	  elseif i == nrows
	    # Every requested row was filled, so no trimming copy is needed
	    return text_data
	  else
	    return text_data[1:i, :]
	  end
	end

	function infer_column_types{S <: String, T <: String}(text_data::Matrix{S},
	missingness_indicators::Vector{T})
	nrows, ncols = size(text_data)

	# Default to Int64 for all column types until we have to demote them
	# May want to shift to using numeric codes for types
	column_types = Array(Any, ncols)
	for i in 1:ncols
	column_types[i] = Int64
	end

	for j in 1:ncols
	for i in 1:nrows
	if column_types[j] <: String
	break
	end
	if !contains(missingness_indicators, text_data[i, j])
	column_types[j] = tightest_type(text_data[i, j], column_types[j])
	end
	end
	end

	return column_types
	end

	# TODO: Split this into determine_column_names and infer_column_types
	# Short-circuit option allows one to just guess metadata for massive files
	# Currently maxes out after 1,000 lines
	function determine_metadata{T <: String}(filename::String,
	separator::Char,
	quotation_character::Char,
	missingness_indicators::Vector{T},
	header::Bool,
	short_circuit::Bool)

	nrows = determine_nrows(filename, header)
	maxlines = nrows
	if short_circuit
	maxlines = min(nrows, 1_000)
	end

	io = open(filename, "r")
	column_names = determine_column_names(io, separator, quotation_character, header)
	ncols = length(column_names)
	if header # Skip the header for type inference
	readline(io)
	end
	text_data = read_separated_text(io, maxlines, ncols, separator, quotation_character)
	close(io)

	column_types = infer_column_types(text_data, missingness_indicators)

	# Return the inferred column names and types
	return (column_names, column_types, nrows)
	end

	# Convenience form of determine_metadata that infers the remaining
	# settings: the separator comes from the filename's ending, the quotation
	# character is '"', and "" and "NA" are treated as missing values.
	# Returns whatever the six-argument method returns.
	function determine_metadata{T <: String}(filename::T,
	                                         header::Bool,
	                                         short_circuit::Bool)
	  separator = determine_separator(filename)
	  quotation_character = '"'
	  # Same defaults that read_table(filename) uses
	  missingness_indicators = ["", "NA"]
	  return determine_metadata(filename, separator, quotation_character, missingness_indicators, header, short_circuit)
	end

	# Turn a matrix of text into a DataFrame with typed columns.
	#
	# Arguments:
	#   text_data              -- one row per record, one column per field
	#   missingness_indicators -- strings that mark an entry as missing
	#   column_types           -- target type of every column
	#   column_names           -- name of every column
	#
	# Entries equal to a missingness indicator are overwritten inside
	# `text_data` with the string form of baseval(column type) and flagged as
	# missing. Int64, Float64, UTF8String and ASCIIString columns are
	# supported; any other type raises an error, as does a mismatch between
	# the number of columns and the number of types or names. A `text_data`
	# without rows yields a DataFrame with zero rows.
	function convert_to_dataframe{R <: String,
	                              S <: String,
	                              T <: String}(text_data::Matrix{R},
	                                           missingness_indicators::Vector{S},
	                                           column_types::Vector,
	                                           column_names::Vector{T})
	  # Keep a record of number of rows and columns
	  nrows, ncols = size(text_data)

	  # Short-circuit if the text data is empty
	  if nrows == 0
	    return DataFrame(column_types, column_names, 0)
	  end

	  # Make sure that the user has specified coherent types and names
	  if ncols != length(column_types) || ncols != length(column_names)
	    error("Column types and names do not match the input data's size")
	  end

	  # Store the columns as a set of DataVec's inside an Array of Any's
	  columns = Array(Any, ncols)

	  # Convert each column of text into a DataVec of the appropriate type
	  for j in 1:ncols
	    # Flag missing entries and give them a placeholder that parses as the
	    # column's type
	    is_missing = BitVector(nrows)
	    for i in 1:nrows
	      if contains(missingness_indicators, text_data[i, j])
	        text_data[i, j] = string(baseval(column_types[j]))
	        is_missing[i] = true
	      else
	        is_missing[i] = false
	      end
	    end
	    if column_types[j] == Int64
	      values = int(text_data[1:nrows, j])
	    elseif column_types[j] == Float64
	      values = float(text_data[1:nrows, j])
	    elseif column_types[j] == UTF8String
	      values = convert(Array{UTF8String, 1}, text_data[1:nrows, j])
	    elseif column_types[j] == ASCIIString
	      values = convert(Array{ASCIIString, 1}, text_data[1:nrows, j])
	    else
	      error("Column cannot be converted to type: $(column_types[j])")
	    end
	    columns[j] = DataVec(values, is_missing)
	  end

	  # Prepare the DataFrame we'll return
	  df = DataFrame(columns, column_names)
	  return df
	end

	##############################################################################
	#
	# Text input
	#
	##############################################################################

	# Read at most N lines from an IOStream
	# Then return a minibatch of at most N rows as a DataFrame
	function read_minibatch{R <: String,
	S <: String,
	T}(io::IOStream,
	separator::Char,
	quotation_character::Char,
	missingness_indicators::Vector{R},
	column_names::Vector{S},
	column_types::Vector{T},
	minibatch_size::Int64)
	# Keep a record of number of columns
	ncols = length(column_types)

	# Represent data as an array of strings before type conversion
	text_data = read_separated_text(io, minibatch_size, ncols, separator, quotation_character)

	# Convert text data to a DataFrame
	return convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)
	end

	# Read an entire data set into a DataFrame from an IOStream
	# TODO: Make only one IO pass through the data
	function read_table{R <: String,
	S <: String}(io::IOStream,
	separator::Char,
	quotation_character::Char,
	missingness_indicators::Vector{R},
	header::Bool,
	column_names::Vector{S},
	nrows::Int64)
	# Return to start of stream
	seek(io, 0)

	# Read first line to remove header in advance
	if header
	readline(io)
	end

	# Keep a record of number of columns
	ncols = length(column_names)

	# Represent data as an array of strings before type conversion
	text_data = read_separated_text(io, nrows, ncols, separator, quotation_character)

	# Short-circuit if data set is empty except for a header line
	if size(text_data, 1) == 0
	column_types = {Any for i in 1:ncols}
	return DataFrame(column_types, column_names, 0)
	end

	# Infer column types
	column_types = infer_column_types(text_data, missingness_indicators)

	# Convert text data to a DataFrame
	return convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)
	end

	function read_table{T <: String}(filename::T)
	# Do inference for missing configuration settings
	separator = determine_separator(filename)
	quotation_character = '"'
	missingness_indicators = ["", "NA"]
	header = true
	nrows = determine_nrows(filename, header)
	io = open(filename, "r")
	column_names = determine_column_names(io, separator, quotation_character, header)
	df = read_table(io,
	separator,
	quotation_character,
	missingness_indicators,
	header,
	column_names,
	nrows)
	close(io)
	return df
	end

	end #profile

	read_table("/home/bana/Downloads/movies.csv")
	@profile report

	##############################################################################
	#
	# Text output
	#
	##############################################################################

	# Quotation rules
	# Quote all string fields
	# Don't quote real-valued fields
	# Quote non-string, non-real-valued fields
	function in_quotes{T <: String}(val::T, quotation_character::Char)
	strcat(quotation_character, val, quotation_character)
	end
	function in_quotes{T <: Real}(val::T, quotation_character::Char)
	string(val)
	end
	function in_quotes{T <: Any}(val::T, quotation_character::Char)
	strcat(quotation_character, string(val), quotation_character)
	end

	# TODO: write_table should do more to react to the type of each column
	# Need to increase precision of string representation of Float64's
	function print_table(df::DataFrame,
	io::IOStream,
	separator::Char,
	quotation_character::Char)
	n, p = nrow(df), ncol(df)
	column_names = colnames(df)
	for j in 1:p
	if j < p
	print(io, in_quotes(column_names[j], quotation_character))
	print(io, separator)
	else
	println(io, in_quotes(column_names[j], quotation_character))
	end
	end
	for i in 1:n
	for j in 1:p
	if j < p
	print(io, in_quotes(df[i, j], quotation_character))
	print(io, separator)
	else
	println(io, in_quotes(df[i, j], quotation_character))
	end
	end
	end
	end

	function print_table(df::DataFrame, separator::Char, quotation_character::Char)
	print_table(df, OUTPUT_STREAM, separator, quotation_character)
	end

	print_table(df::DataFrame) = print_table(df, OUTPUT_STREAM, ',', '"')

	function write_table{T <: String}(df::DataFrame,
	filename::T,
	separator::Char,
	quotation_character::Char)
	io = open(filename, "w")
	print_table(df, io, separator, quotation_character)
	close(io)
	end

	# Infer configuration settings from filename
	function write_table{T <: String}(df::DataFrame, filename::T)
	separator = determine_separator(filename)
	quotation_character = '"'
	write_table(df, filename, separator, quotation_character)
	end

	##############################################################################
	#
	# Binary serialization
	#
	##############################################################################

	# Wrappers for serialization
	function save(filename, d)
	f = open(filename, "w")
	serialize(f, d)
	close(f)
	end

	function load_df(filename)
	f = open(filename)
	dd = deserialize(f)()
	close(f)
	return dd
	end
	julia> require("DataFrames")
	count time(%) time(s)
	58789 0.00 0.000186 # /home/bana/.julia/DataFrames/src/io.jl, line 20
	58789 0.02 0.028055 # line 21
	58789 0.01 0.006174 # line 22
	58789 0.00 0.000105 # line 23
	58789 0.00 0.000124 # line 24
	5941920 0.03 0.038879 # line 26
	176389 0.00 0.002492 # line 29
	176389 0.00 0.000460 # line 30
	1267077 0.01 0.007016 # line 32
	176389 0.00 0.003109 # line 36
	176389 0.00 0.000457 # line 37
	1410936 0.01 0.016423 # line 40
	1410936 1.60 1.976091 # line 41
	1410936 0.00 0.002732 # line 42
	1410936 0.11 0.134190 # line 43
	2911129 0.02 0.019880 # line 45
	58789 0.00 0.000789 # line 50
	58789 0.01 0.012929 # line 51
	58789 0.12 0.146372 # line 52
	1 0.03 0.033243 # /home/bana/.julia/DataFrames/src/io.jl, line 74
	1 0.00 0.000005 # /home/bana/.julia/DataFrames/src/io.jl, line 95
	1 0.01 0.007993 # line 96
	1 0.12 0.150127 # line 102
	1 0.00 0.000003 # line 105
	1 0.01 0.009379 # /home/bana/.julia/DataFrames/src/io.jl, line 120
	1 0.00 0.000000 # line 122
	58788 0.56 0.694576 # line 124
	58788 0.36 0.442168 # line 128
	58788 17.30 21.355132 # line 129
	1 0.00 0.000000 # /home/bana/.julia/DataFrames/src/io.jl, line 141
	1 0.00 0.000002 # line 145
	25 0.00 0.000001 # line 147
	1298553 5.01 6.185185 # line 156
	1 0.00 0.000000 # line 161
	1 0.00 0.000000 # /home/bana/.julia/DataFrames/src/io.jl, line 210
	1 0.00 0.000004 # line 223
	25 0.00 0.000306 # line 228
	107437 0.44 0.539362 # line 231
	107437 0.01 0.009928 # line 232
	1362263 1.52 1.877132 # line 234
	12 0.76 0.934277 # line 238
	11 0.67 0.832819 # line 240
	2 0.00 0.000717 # line 242
	25 0.02 0.023742 # line 248
	1 0.10 0.123444 # line 252
	1 0.00 0.000000 # line 253
	1 0.00 0.000005 # /home/bana/.julia/DataFrames/src/io.jl, line 294
	1 0.00 0.000078 # line 298
	1 0.00 0.000000 # line 302
	1 18.33 22.622687 # line 305 read_separated_text(io, nrows, ncols, separator, quotation_character)
	1 10.65 13.139062 # line 314 infer_column_types(text_data, missingness_indicators)
	1 6.35 7.838155 # line 317 convert_to_dataframe(text_data, missingness_indicators, column_types, column_names)
	1 0.00 0.000015 # /home/bana/.julia/DataFrames/src/io.jl, line 322
	1 0.00 0.000000 # line 323
	1 0.00 0.000002 # line 324
	1 0.00 0.000000 # line 325
	1 0.03 0.033252 # line 326
	1 0.00 0.000026 # line 327
	1 0.15 0.190201 # line 328
	1 35.63 43.976593 # line 329
	1 0.00 0.000017 # line 336
	1 0.00 0.000000 # line 337