BurntSushi/qcsv.py

## qcsv.py
"""
A small API to read and analyze CSV files by inferring types for
each column of data.

Currently, only int, float and string types are supported.
from collections import namedtuple
"""
from __future__ import absolute_import, division, print_function
from collections import namedtuple
import csv

import numpy as np

__pdoc__ = {}

Table = namedtuple('Table', ['types', 'names', 'rows'])
__pdoc__['Table.types'] = '''
Contains inferred type information for each column in the table
as a dictionary mapping type name to a Python type constructor.
When a type cannot be inferred, it will have type `None`.
'''
__pdoc__['Table.names'] = '''
A list of column names from the header row of the source data.
'''
__pdoc__['Table.rows'] = '''
A list of rows, where each row is a list of data. Each datum
is guaranteed to have type `float`, `int`, `str` or will be the
`None` value.
'''

Column = namedtuple('Column', ['type', 'name', 'cells'])
__pdoc__['Column.type'] = '''
The type of this column as a Python type constructor, or `None`
if the type could not be inferred.
'''
__pdoc__['Column.name'] = '''
The name of this column.
'''
__pdoc__['Column.cells'] = '''
A list of list of all data in this column. Each datum is guaranteed to
have type `float`, `int`, `str` or will be the `None` value.
'''

try:
    text_type = basestring
except NameError:
    text_type = str


def read(fname, delimiter=',', skip_header=False):
    """
    `read` loads cell data, column headers and type information
    for each column given a file path to a CSV formatted file. A
    `qcsv.Table` namedtuple is returned with fields `qcsv.Table.types`,
    `qcsv.Table.names` and `qcsv.Table.rows`.

    All cells have left and right whitespace trimmed.

    All rows **must** be the same length.

    `delimiter` is the string the separates each field in a row.

    If `skip_header` is set, then no column headers are read, and
    column names are set to their corresponding indices (as strings).
    """
    names, rows = _data(fname, delimiter, skip_header)
    types = _column_types(names, rows)

    return cast(Table(types=types, names=names, rows=rows))


def _data(fname, delimiter=',', skip_header=False):
    """
    `_data` loads cell data and column headers, and returns the names
    and rows.

    All cells have left and right whitespace trimmed.

    All rows MUST be the same length.

    `delimiter` and `skip_header` are described in `qcsv.read`.
    """
    names = []
    rows = []
    reader = csv.reader(open(fname), delimiter=delimiter)
    if not skip_header:
        names = list(map(str.strip, next(reader)))

    for i, row in enumerate(reader):
        # If we haven't discovered names from column headers, then name the
        # columns "0", "1", ..., "n-1" where "n" is the number of columns in
        # the first row.
        if len(names) == 0:
            names = list(map(str, range(0, len(row))))
        assert len(row) == len(names), \
            'The length of row %d is %d, but others rows have length %d' \
            % (i, len(row), len(names))

        rows.append(list(map(str.strip, row)))

    return names, rows


def _column_types(names, rows):
    """
    `_column_types` infers type information from the columns in
    `rows`. Types are stored as either a Python type constructor (str,
    int or float) or as a `None` value. A dictionary of column names to
    types is returned.

    A column has type `None` if and only if all cells in the column are
    empty.  (Cells are empty if the length of its value is zero after
    left and right whitespace has been trimmed.)

    A column has type `float` if and only if all cells in the column
    are empty, integers or floats AND at least one value is a float.

    A column has type `int` if and only if all cells in the column are
    empty or integers AND at least one value is an int.

    A column has type `str` in any other case.
    """
    types = dict([(name, None) for name in names])

    for c in range(len(names)):
        # prev_typ is what we believe the type of this column to be up
        # until this point.
        prev_typ = None

        # next_typ is the type of the current cell only. It is compared
        # with prev_typ to determine the type of the overall column as
        # per the conditions specified in this function's documentation.
        next_typ = None

        for row in rows:
            col = row[c]

            # A missing value always has type None.
            if len(col) == 0:
                next_typ = None
            # No need to inspect the type if we've already committed to str.
            # (We bail out because it's expensive to inspect types like this.)
            elif prev_typ is not str:
                # The trick here is to attempt type casting from a stirng to
                # an int or a string to a float, and if Python doesn't like it,
                # we try something else.
                try:
                    # We try int first, since any integer can be successfully
                    # converted to a float, but not all floats can converted
                    # to integers.
                    int(col)
                    next_typ = int
                except ValueError:
                    try:
                        # If we can't convert to float, then we must scale back
                        # to a string.
                        float(col)
                        next_typ = float
                    except ValueError:
                        next_typ = str

            # If a column contains a string, the column type is always a
            # string.
            if prev_typ is str or next_typ is str:
                prev_typ = str
            # A column with floats and ints has type float.
            elif next_typ is float and prev_typ is int:
                prev_typ = float
            # A column with missing values and X has type X.
            elif prev_typ is None and next_typ is not None:
                prev_typ = next_typ

        types[names[c]] = prev_typ
    return types


def map_names(table, f):
    """
    `map_names` executes `f` on every column header in `table`, with
    three arguments, in order: column type, column index, column
    name. The result of the function is placed in the corresponding
    header location.

    A new `qcsv.Table` is returned with the new column names.
    """
    new_names = []
    for i, name in enumerate(table.names):
        new_names.append(f(table.types[name], i, name))
    return table._replace(names=new_names)


def map_data(table, f):
    """
    `map_data` executes `f` on every cell in `table` with five
    arguments, in order: column type, column name, row index, column
    index, contents. The result of the function is placed in the
    corresponding cell location.

    A new `qcsv.Table` is returned with the converted values.
    """
    new_rows = [None] * len(table.rows)
    for r, row in enumerate(table.rows):
        new_row = [None] * len(row)
        for c, col in enumerate(row):
            name = table.names[c]
            typ = table.types[name]
            new_row[c] = f(typ, name, r, c, col)
        new_rows[r] = new_row
    return table._replace(rows=new_rows)


def cast(table):
    """
    `cast` type casts all of the values in `table` to their
    corresponding types in `qcsv.Table.types`.

    The only special case here is missing values or NULL columns. If a
    value is missing or a column has type NULL (i.e., all values are
    missing), then the value is replaced with `None`.

    N.B. cast is idempotent. i.e., `cast(x) = cast(cast(x))`.
    """
    def f(typ, name, r, c, cell):
        if (isinstance(cell, text_type) and len(cell) == 0) \
                or typ is None or cell is None:
            return None
        return typ(cell)
    return map_data(table, f)


def convert_missing_cells(table, dstr="", dint=0, dfloat=0.0):
    """
    `convert_missing_cells` changes the values of all NULL cells to the
    values specified by `dstr`, `dint` and `dfloat`. For example, all
    NULL cells in columns with type `str` will be replaced with the
    value given to `dstr`.
    """
    def f(typ, name, r, c, cell):
        if cell is None and typ is not None:
            if typ == str:
                return dstr
            elif typ == int:
                return dint
            elif typ == float:
                return dfloat
            else:
                assert False, "Unknown type: %s" % typ
        return cell
    return map_data(table, f)


def convert_columns(table, **kwargs):
    """
    `convert_columns` executes converter functions on specific columns,
    where the parameter names for `kwargs` are the column names, and
    the parameter values are functions of one parameter that return a
    single value.

    For example

        convert_columns(names, rows, colname=lambda s: s.lower())

    would convert all values in the column with name `colname` to
    lowercase.
    """
    def f(typ, name, r, c, cell):
        if name in kwargs:
            return kwargs[name](cell)
        return cell
    return map_data(table, f)


def convert_types(table, fstr=None, fint=None, ffloat=None):
    """
    `convert_types` works just like `qcsv.convert_columns`, but on
    types instead of specific columns.
    """
    def f(typ, name, r, c, cell):
        if typ == str and fstr is not None:
            return fstr(cell)
        elif typ == int and fint is not None:
            return fint(cell)
        elif typ == float and ffloat is not None:
            return ffloat(cell)
        return cell
    return map_data(table, f)


def column(table, colname):
    """
    `column` returns a named tuple `qcsv.Column` of the column in
    `table` with name `colname`.
    """
    colcells = []
    colname = colname.lower()
    colindex = -1
    for i, name in enumerate(table.names):
        if name.lower() == colname.lower():
            colindex = i
            break
    assert colindex > -1, 'Column name %s does not exist' % colname

    for row in table.rows:
        for i, col in enumerate(row):
            if i == colindex:
                colcells.append(col)

    return Column(type=table.types[table.names[colindex]],
                  name=table.names[colindex],
                  cells=np.array(colcells))


def columns(table):
    """
    `columns` returns a list of all columns in the data set, where each
    column has type `qcsv.Column`.
    """
    colcells = []
    for _ in table.names:
        colcells.append([])
    for row in table.rows:
        for i, col in enumerate(row):
            colcells[i].append(col)

    cols = []
    for i, name in enumerate(table.names):
        col = Column(type=table.types[name],
                     name=name,
                     cells=np.array(colcells[i]))
        cols.append(col)
    return cols


def frequencies(column):
    """
    `frequencies` returns a dictionary where the keys are unique values
    in the column, and the values correspond to the frequency of each
    value in the column.
    """
    ukeys = np.unique(column.cells)
    bins = np.searchsorted(ukeys, column.cells)
    return dict(zip(ukeys, np.bincount(bins)))


def type_str(typ):
    """
    `type_str` returns a string representation of a column type.
    """
    if typ is None:
        return "None"
    elif typ is float:
        return "float"
    elif typ is int:
        return "int"
    elif typ is str:
        return "str"
    return "Unknown"


def cell_str(cell_contents):
    """
    `cell_str` is a convenience function for converting cell contents
    to a string when there are still NULL values.

    N.B. If you choose to work with data while keeping NULL values, you
    will likely need to write more functions similar to this one.
    """
    if cell_contents is None:
        return "NULL"
    return str(cell_contents)


def print_data_table(table):
    """
    `print_data_table` is a convenience function for pretty-printing
    the data in tabular format, including header names and type
    annotations.
    """
    padding = 2
    headers = ['%s (%s)' % (n, type_str(table.types[n])) for n in table.names]
    maxlens = list(map(len, headers))
    for row in table.rows:
        for i, col in enumerate(row):
            maxlens[i] = max(maxlens[i], len(cell_str(col)))

    def padded_cell(i, s):
        spaces = maxlens[i] - len(cell_str(s)) + padding
        return '%s%s' % (cell_str(s), ' ' * spaces)

    line = ""
    for i, name in enumerate(headers):
        line += padded_cell(i, name)
    print(line)
    print('-' * (sum(map(len, headers)) + len(headers) * padding))
    for row in table.rows:
        line = ""
        for i, col in enumerate(row):
            line += padded_cell(i, cell_str(col))
        print(line)


if __name__ == '__main__':
    # File name.
    f = "sample.csv"

    table = read(f)

    # Print the table of raw data.
    print("# Raw data.")
    print_data_table(table)
    print('\n')

    # Print the table after converting missing values from NULL to concrete
    # values. The benefit here is that NULL values are inherently incomputable.
    # Whenever they get thrown into a computation on the data, they will always
    # provoke a runtime error. This is a Good Thing, because missing values
    # SHOULD be given explicit treatment. Inserting values into missing cells
    # is making an *assumption* about the data, and should never be implied.
    #
    # Thus, `convert_missing_cells` is an EXPLICIT way of throwing away NULL
    # cells. If you view the output from the previous table, and the output of
    # the next table, you'll notice that NULL values have been replaced.
    # (String columns get empty strings, integer columns get 0 and float
    # columns get 0.0.)
    #
    # If you want to change what the missing values are replaced with, you can
    # use the function's optional parameters:
    #
    #   rows = convert_missing_cells(types, names, rows,
    #                                dstr="-9.99", dfloat=-9.99, dint=-9)
    table = convert_missing_cells(table)
    print("# Convert missing cells to arbitrary values")
    print_data_table(table)
    print('\n')

    # Now that all of the NULL cells have been removed, we are free to run data
    # sanitization functions on the columns of data without worrying about
    # seeing those nasty NULL values. For instance, we might want to make all
    # strings in the 'string1' column be lowercase. We need only to pass a
    # function as an argument, where the function we pass takes a single
    # argument (the cell contents) and returns the new cell contents. In this
    # case, we tell every cell in the `string1` column to be converted using
    # the `str.lower` function.
    table = convert_columns(table, string1=str.lower)
    print("# Sanitize just one column of data")
    print_data_table(table)
    print('\n')

    # The aforementioned function has limited use, since you typically
    # want to be more dynamic than having to give names of columns. Thus, the
    # `convert_types` function allows you to convert cells based on their
    # *type*. That is, instead of making only a selection of columns lowercase,
    # we can specify that *all* string columns should be lowercase.
    table = convert_types(table, fstr=str.lower)
    print("# Sanitize all cells that have type string")
    print_data_table(table)
    print('\n')

    # Finally, you can traverse your data set by columns like so:
    for col in columns(table):
        print('(%s, %s) [%s]'
              % (col.name, col.type, ', '.join(map(cell_str, col.cells))))
    print('\n')

    # Or pick out one column in particular:
    print(column(table, "mixed"))
	"""
	A small API to read and analyze CSV files by inferring types for
	each column of data.

	Currently, only int, float and string types are supported.
	from collections import namedtuple
	"""
	from __future__ import absolute_import, division, print_function
	from collections import namedtuple
	import csv

	import numpy as np

	__pdoc__ = {}

	Table = namedtuple('Table', ['types', 'names', 'rows'])
	__pdoc__['Table.types'] = '''
	Contains inferred type information for each column in the table
	as a dictionary mapping type name to a Python type constructor.
	When a type cannot be inferred, it will have type `None`.
	'''
	__pdoc__['Table.names'] = '''
	A list of column names from the header row of the source data.
	'''
	__pdoc__['Table.rows'] = '''
	A list of rows, where each row is a list of data. Each datum
	is guaranteed to have type `float`, `int`, `str` or will be the
	`None` value.
	'''

	Column = namedtuple('Column', ['type', 'name', 'cells'])
	__pdoc__['Column.type'] = '''
	The type of this column as a Python type constructor, or `None`
	if the type could not be inferred.
	'''
	__pdoc__['Column.name'] = '''
	The name of this column.
	'''
	__pdoc__['Column.cells'] = '''
	A list of list of all data in this column. Each datum is guaranteed to
	have type `float`, `int`, `str` or will be the `None` value.
	'''

	try:
	text_type = basestring
	except NameError:
	text_type = str


	def read(fname, delimiter=',', skip_header=False):
	"""
	`read` loads cell data, column headers and type information
	for each column given a file path to a CSV formatted file. A
	`qcsv.Table` namedtuple is returned with fields `qcsv.Table.types`,
	`qcsv.Table.names` and `qcsv.Table.rows`.

	All cells have left and right whitespace trimmed.

	All rows must be the same length.

	`delimiter` is the string the separates each field in a row.

	If `skip_header` is set, then no column headers are read, and
	column names are set to their corresponding indices (as strings).
	"""
	names, rows = _data(fname, delimiter, skip_header)
	types = _column_types(names, rows)

	return cast(Table(types=types, names=names, rows=rows))


	def _data(fname, delimiter=',', skip_header=False):
	"""
	`_data` loads cell data and column headers, and returns the names
	and rows.

	All cells have left and right whitespace trimmed.

	All rows MUST be the same length.

	`delimiter` and `skip_header` are described in `qcsv.read`.
	"""
	names = []
	rows = []
	reader = csv.reader(open(fname), delimiter=delimiter)
	if not skip_header:
	names = list(map(str.strip, next(reader)))

	for i, row in enumerate(reader):
	# If we haven't discovered names from column headers, then name the
	# columns "0", "1", ..., "n-1" where "n" is the number of columns in
	# the first row.
	if len(names) == 0:
	names = list(map(str, range(0, len(row))))
	assert len(row) == len(names), \
	'The length of row %d is %d, but others rows have length %d' \
	% (i, len(row), len(names))

	rows.append(list(map(str.strip, row)))

	return names, rows


	def _column_types(names, rows):
	"""
	`_column_types` infers type information from the columns in
	`rows`. Types are stored as either a Python type constructor (str,
	int or float) or as a `None` value. A dictionary of column names to
	types is returned.

	A column has type `None` if and only if all cells in the column are
	empty. (Cells are empty if the length of its value is zero after
	left and right whitespace has been trimmed.)

	A column has type `float` if and only if all cells in the column
	are empty, integers or floats AND at least one value is a float.

	A column has type `int` if and only if all cells in the column are
	empty or integers AND at least one value is an int.

	A column has type `str` in any other case.
	"""
	types = dict([(name, None) for name in names])

	for c in range(len(names)):
	# prev_typ is what we believe the type of this column to be up
	# until this point.
	prev_typ = None

	# next_typ is the type of the current cell only. It is compared
	# with prev_typ to determine the type of the overall column as
	# per the conditions specified in this function's documentation.
	next_typ = None

	for row in rows:
	col = row[c]

	# A missing value always has type None.
	if len(col) == 0:
	next_typ = None
	# No need to inspect the type if we've already committed to str.
	# (We bail out because it's expensive to inspect types like this.)
	elif prev_typ is not str:
	# The trick here is to attempt type casting from a stirng to
	# an int or a string to a float, and if Python doesn't like it,
	# we try something else.
	try:
	# We try int first, since any integer can be successfully
	# converted to a float, but not all floats can converted
	# to integers.
	int(col)
	next_typ = int
	except ValueError:
	try:
	# If we can't convert to float, then we must scale back
	# to a string.
	float(col)
	next_typ = float
	except ValueError:
	next_typ = str

	# If a column contains a string, the column type is always a
	# string.
	if prev_typ is str or next_typ is str:
	prev_typ = str
	# A column with floats and ints has type float.
	elif next_typ is float and prev_typ is int:
	prev_typ = float
	# A column with missing values and X has type X.
	elif prev_typ is None and next_typ is not None:
	prev_typ = next_typ

	types[names[c]] = prev_typ
	return types


	def map_names(table, f):
	"""
	`map_names` executes `f` on every column header in `table`, with
	three arguments, in order: column type, column index, column
	name. The result of the function is placed in the corresponding
	header location.

	A new `qcsv.Table` is returned with the new column names.
	"""
	new_names = []
	for i, name in enumerate(table.names):
	new_names.append(f(table.types[name], i, name))
	return table._replace(names=new_names)


	def map_data(table, f):
	"""
	`map_data` executes `f` on every cell in `table` with five
	arguments, in order: column type, column name, row index, column
	index, contents. The result of the function is placed in the
	corresponding cell location.

	A new `qcsv.Table` is returned with the converted values.
	"""
	new_rows = [None] * len(table.rows)
	for r, row in enumerate(table.rows):
	new_row = [None] * len(row)
	for c, col in enumerate(row):
	name = table.names[c]
	typ = table.types[name]
	new_row[c] = f(typ, name, r, c, col)
	new_rows[r] = new_row
	return table._replace(rows=new_rows)


	def cast(table):
	"""
	`cast` type casts all of the values in `table` to their
	corresponding types in `qcsv.Table.types`.

	The only special case here is missing values or NULL columns. If a
	value is missing or a column has type NULL (i.e., all values are
	missing), then the value is replaced with `None`.

	N.B. cast is idempotent. i.e., `cast(x) = cast(cast(x))`.
	"""
	def f(typ, name, r, c, cell):
	if (isinstance(cell, text_type) and len(cell) == 0) \
	or typ is None or cell is None:
	return None
	return typ(cell)
	return map_data(table, f)


	def convert_missing_cells(table, dstr="", dint=0, dfloat=0.0):
	"""
	`convert_missing_cells` changes the values of all NULL cells to the
	values specified by `dstr`, `dint` and `dfloat`. For example, all
	NULL cells in columns with type `str` will be replaced with the
	value given to `dstr`.
	"""
	def f(typ, name, r, c, cell):
	if cell is None and typ is not None:
	if typ == str:
	return dstr
	elif typ == int:
	return dint
	elif typ == float:
	return dfloat
	else:
	assert False, "Unknown type: %s" % typ
	return cell
	return map_data(table, f)


	def convert_columns(table, **kwargs):
	"""
	`convert_columns` executes converter functions on specific columns,
	where the parameter names for `kwargs` are the column names, and
	the parameter values are functions of one parameter that return a
	single value.

	For example

	convert_columns(names, rows, colname=lambda s: s.lower())

	would convert all values in the column with name `colname` to
	lowercase.
	"""
	def f(typ, name, r, c, cell):
	if name in kwargs:
	return kwargs[name](cell)
	return cell
	return map_data(table, f)


	def convert_types(table, fstr=None, fint=None, ffloat=None):
	"""
	`convert_types` works just like `qcsv.convert_columns`, but on
	types instead of specific columns.
	"""
	def f(typ, name, r, c, cell):
	if typ == str and fstr is not None:
	return fstr(cell)
	elif typ == int and fint is not None:
	return fint(cell)
	elif typ == float and ffloat is not None:
	return ffloat(cell)
	return cell
	return map_data(table, f)


	def column(table, colname):
	"""
	`column` returns a named tuple `qcsv.Column` of the column in
	`table` with name `colname`.
	"""
	colcells = []
	colname = colname.lower()
	colindex = -1
	for i, name in enumerate(table.names):
	if name.lower() == colname.lower():
	colindex = i
	break
	assert colindex > -1, 'Column name %s does not exist' % colname

	for row in table.rows:
	for i, col in enumerate(row):
	if i == colindex:
	colcells.append(col)

	return Column(type=table.types[table.names[colindex]],
	name=table.names[colindex],
	cells=np.array(colcells))


	def columns(table):
	"""
	`columns` returns a list of all columns in the data set, where each
	column has type `qcsv.Column`.
	"""
	colcells = []
	for _ in table.names:
	colcells.append([])
	for row in table.rows:
	for i, col in enumerate(row):
	colcells[i].append(col)

	cols = []
	for i, name in enumerate(table.names):
	col = Column(type=table.types[name],
	name=name,
	cells=np.array(colcells[i]))
	cols.append(col)
	return cols


	def frequencies(column):
	"""
	`frequencies` returns a dictionary where the keys are unique values
	in the column, and the values correspond to the frequency of each
	value in the column.
	"""
	ukeys = np.unique(column.cells)
	bins = np.searchsorted(ukeys, column.cells)
	return dict(zip(ukeys, np.bincount(bins)))


	def type_str(typ):
	"""
	`type_str` returns a string representation of a column type.
	"""
	if typ is None:
	return "None"
	elif typ is float:
	return "float"
	elif typ is int:
	return "int"
	elif typ is str:
	return "str"
	return "Unknown"


	def cell_str(cell_contents):
	"""
	`cell_str` is a convenience function for converting cell contents
	to a string when there are still NULL values.

	N.B. If you choose to work with data while keeping NULL values, you
	will likely need to write more functions similar to this one.
	"""
	if cell_contents is None:
	return "NULL"
	return str(cell_contents)


	def print_data_table(table):
	"""
	`print_data_table` is a convenience function for pretty-printing
	the data in tabular format, including header names and type
	annotations.
	"""
	padding = 2
	headers = ['%s (%s)' % (n, type_str(table.types[n])) for n in table.names]
	maxlens = list(map(len, headers))
	for row in table.rows:
	for i, col in enumerate(row):
	maxlens[i] = max(maxlens[i], len(cell_str(col)))

	def padded_cell(i, s):
	spaces = maxlens[i] - len(cell_str(s)) + padding
	return '%s%s' % (cell_str(s), ' ' * spaces)

	line = ""
	for i, name in enumerate(headers):
	line += padded_cell(i, name)
	print(line)
	print('-' * (sum(map(len, headers)) + len(headers) * padding))
	for row in table.rows:
	line = ""
	for i, col in enumerate(row):
	line += padded_cell(i, cell_str(col))
	print(line)


	if __name__ == '__main__':
	# File name.
	f = "sample.csv"

	table = read(f)

	# Print the table of raw data.
	print("# Raw data.")
	print_data_table(table)
	print('\n')

	# Print the table after converting missing values from NULL to concrete
	# values. The benefit here is that NULL values are inherently incomputable.
	# Whenever they get thrown into a computation on the data, they will always
	# provoke a runtime error. This is a Good Thing, because missing values
	# SHOULD be given explicit treatment. Inserting values into missing cells
	# is making an assumption about the data, and should never be implied.
	#
	# Thus, `convert_missing_cells` is an EXPLICIT way of throwing away NULL
	# cells. If you view the output from the previous table, and the output of
	# the next table, you'll notice that NULL values have been replaced.
	# (String columns get empty strings, integer columns get 0 and float
	# columns get 0.0.)
	#
	# If you want to change what the missing values are replaced with, you can
	# use the function's optional parameters:
	#
	# rows = convert_missing_cells(types, names, rows,
	# dstr="-9.99", dfloat=-9.99, dint=-9)
	table = convert_missing_cells(table)
	print("# Convert missing cells to arbitrary values")
	print_data_table(table)
	print('\n')

	# Now that all of the NULL cells have been removed, we are free to run data
	# sanitization functions on the columns of data without worrying about
	# seeing those nasty NULL values. For instance, we might want to make all
	# strings in the 'string1' column be lowercase. We need only to pass a
	# function as an argument, where the function we pass takes a single
	# argument (the cell contents) and returns the new cell contents. In this
	# case, we tell every cell in the `string1` column to be converted using
	# the `str.lower` function.
	table = convert_columns(table, string1=str.lower)
	print("# Sanitize just one column of data")
	print_data_table(table)
	print('\n')

	# The aforementioned function has limited use, since you typically
	# want to be more dynamic than having to give names of columns. Thus, the
	# `convert_types` function allows you to convert cells based on their
	# type. That is, instead of making only a selection of columns lowercase,
	# we can specify that all string columns should be lowercase.
	table = convert_types(table, fstr=str.lower)
	print("# Sanitize all cells that have type string")
	print_data_table(table)
	print('\n')

	# Finally, you can traverse your data set by columns like so:
	for col in columns(table):
	print('(%s, %s) [%s]'
	% (col.name, col.type, ', '.join(map(cell_str, col.cells))))
	print('\n')

	# Or pick out one column in particular:
	print(column(table, "mixed"))