Dan-Patterson/excel_np.py

## excel_np.py
# see https://community.esri.com/blogs/dan_patterson/2018/11/18/excel-arrays-tables-in-arcgis-pro for
# more information and descriptions

import numpy as np
def excel_np(path, sheet_num=0):
    """Read excel files to numpy structured/record arrays.  Your spreadsheet
    must adhere to simple rules::
      - first row must contain the field names for the output array
      - no blank rows or columns, basically, no fluff or formatting
      - if you have nodata values, put them in, since blank cells will be
        'corrected' as best as possible.
      - text and numbers in a column, results in a text column

    Parameters:
    -----------
    path : text
        Full path to the xls, xlsx file
    sheet_num : integer
        Sheets are numbered from 0.

    Returns:
    --------
    A numpy structured array is returned.  Excel only uses float or string
    data, so attempts are made to coerse integer columns by comparing the
    float vs int versions of the arrays.  A tad of a kludge, but it works.

    The first row's data type is compared to its matching column data type.
    If they match, then it is used as the dtype.  If there is a mismatch an
    attempt is made to recover numeric data by assigning blanks etc in numeric
    columns a value of np.nan.

    String/text columns are check for empty cells, '', "" and that ever so
    ugly invisible space.

    Notes:
    ------
    >>> aString = open('c:/temp/test.xlsx','rb').read()
    >>> book_ = open_workbook(file_contents=aString)
    >>> dir(book_):
        get_sheet, nsheets, sheet_by_index, sheet_by_name etc....

    Now you can read a sheet

    >>> sheet = book_.sheet_by_index(0)  # first sheet
    >>> sheet.col_types(0)

    References:
    ----------
    `<https://media.readthedocs.org/pdf/xlrd/latest/xlrd.pdf>`_.
    """
    def _values(sheet, rows, cols):
        """return cell types for the above.  Skip the first row
        Not use.... just kept for future reference
        """
        ar = []
        for i in range(1, rows):
            c = []
            for j in range(cols):
                c.append(sheet.cell_values(i, j))  # sheet.cell_types also
            ar.append(c)
        return ar

    def isfloat(a):
        """float check"""
        try:
            i = float(a)
            return i
        except ValueError:
            return np.nan

    def punc_space(name):
        """delete punctuation and spaces and replace with '_'"""
        punc = list('!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~ ')
        return "".join([[i, '_'][i in punc] for i in name])

    import xlrd
    w = xlrd.open_workbook(path)        # xlrd.book.Book class
    sheet = w.sheet_by_index(sheet_num) # sheet by number
    # sheet = w.sheet_by_name('test')   # case sensitive, not implemented
    names = sheet.row_values(0)         # clean these up later
    cols = sheet.ncols
    rows = sheet.nrows
    col_data = [sheet.col_values(i, 1, rows) for i in range(cols)]
    row_guess = sheet.row_values(1)
    row_dts = [np.asarray(i).dtype.kind for i in row_guess]
    col_dts = [np.asarray(col_data[i]).dtype.kind
               for i in range(cols)]
    clean = []
    for i in range(len(row_dts)):
        c = col_data[i]
        if row_dts[i] == col_dts[i]:    # same dtype... send to array
            ar = np.asarray(c)
        if row_dts[i] == 'f':           # float? if so, substitute np.nan
            ar = np.array([isfloat(i) for i in c])
            is_nan = np.isnan(ar)       # find the nan values, then check
            not_nan = ar[~is_nan]       # are the floats == ints?
            if np.all(np.equal(not_nan, not_nan.astype('int'))):  # integer?
                ar[is_nan] = -999
                ar = ar.astype('int')
        elif row_dts[i] in ('U', 'S'):  # unicode/string... send to array
            ar = np.char.strip(ar)
            ar = np.where(np.char.str_len(ar) == 0, 'None', ar)
        else:
            ar = np.asarray(c)
        clean.append(ar)
    # ---- assemble the columns for the array ----
    dt_str = [i.dtype.str for i in clean]
    names = [i.strip() for i in names]      # clean up leading/trailing spaces
    names = [punc_space(i) for i in names]  # replace punctuation and spaces
    dts_name = list(zip(names, dt_str))
    arr = np.empty((rows-1,), dtype= dts_name)
    cnt = 0
    for i in names:
        arr[i] = clean[cnt]
        cnt +=1
    return arr
#
# ----------------------------------------------------------------------
# __main__ .... code section
if __name__ == "__main__":
    """Optionally...
    : provide a spreadsheet for testing
    """
    #path = "c:/test/text.xlsx"
    #arr = excel_np(path, 0)
	# see https://community.esri.com/blogs/dan_patterson/2018/11/18/excel-arrays-tables-in-arcgis-pro for
	# more information and descriptions

	import numpy as np
	def excel_np(path, sheet_num=0):
	"""Read excel files to numpy structured/record arrays. Your spreadsheet
	must adhere to simple rules::
	- first row must contain the field names for the output array
	- no blank rows or columns, basically, no fluff or formatting
	- if you have nodata values, put them in, since blank cells will be
	'corrected' as best as possible.
	- text and numbers in a column, results in a text column

	Parameters:
	-----------
	path : text
	Full path to the xls, xlsx file
	sheet_num : integer
	Sheets are numbered from 0.

	Returns:
	--------
	A numpy structured array is returned. Excel only uses float or string
	data, so attempts are made to coerse integer columns by comparing the
	float vs int versions of the arrays. A tad of a kludge, but it works.

	The first row's data type is compared to its matching column data type.
	If they match, then it is used as the dtype. If there is a mismatch an
	attempt is made to recover numeric data by assigning blanks etc in numeric
	columns a value of np.nan.

	String/text columns are check for empty cells, '', "" and that ever so
	ugly invisible space.

	Notes:
	------
	>>> aString = open('c:/temp/test.xlsx','rb').read()
	>>> book_ = open_workbook(file_contents=aString)
	>>> dir(book_):
	get_sheet, nsheets, sheet_by_index, sheet_by_name etc....

	Now you can read a sheet

	>>> sheet = book_.sheet_by_index(0) # first sheet
	>>> sheet.col_types(0)

	References:
	----------
	`<https://media.readthedocs.org/pdf/xlrd/latest/xlrd.pdf>`_.
	"""
	def _values(sheet, rows, cols):
	"""return cell types for the above. Skip the first row
	Not use.... just kept for future reference
	"""
	ar = []
	for i in range(1, rows):
	c = []
	for j in range(cols):
	c.append(sheet.cell_values(i, j)) # sheet.cell_types also
	ar.append(c)
	return ar

	def isfloat(a):
	"""float check"""
	try:
	i = float(a)
	return i
	except ValueError:
	return np.nan

	def punc_space(name):
	"""delete punctuation and spaces and replace with '_'"""
	punc = list('!"#$%&\'()*+,-./:;<=>?@[\\]^`{\|}~ ')
	return "".join([[i, '_'][i in punc] for i in name])

	import xlrd
	w = xlrd.open_workbook(path) # xlrd.book.Book class
	sheet = w.sheet_by_index(sheet_num) # sheet by number
	# sheet = w.sheet_by_name('test') # case sensitive, not implemented
	names = sheet.row_values(0) # clean these up later
	cols = sheet.ncols
	rows = sheet.nrows
	col_data = [sheet.col_values(i, 1, rows) for i in range(cols)]
	row_guess = sheet.row_values(1)
	row_dts = [np.asarray(i).dtype.kind for i in row_guess]
	col_dts = [np.asarray(col_data[i]).dtype.kind
	for i in range(cols)]
	clean = []
	for i in range(len(row_dts)):
	c = col_data[i]
	if row_dts[i] == col_dts[i]: # same dtype... send to array
	ar = np.asarray(c)
	if row_dts[i] == 'f': # float? if so, substitute np.nan
	ar = np.array([isfloat(i) for i in c])
	is_nan = np.isnan(ar) # find the nan values, then check
	not_nan = ar[~is_nan] # are the floats == ints?
	if np.all(np.equal(not_nan, not_nan.astype('int'))): # integer?
	ar[is_nan] = -999
	ar = ar.astype('int')
	elif row_dts[i] in ('U', 'S'): # unicode/string... send to array
	ar = np.char.strip(ar)
	ar = np.where(np.char.str_len(ar) == 0, 'None', ar)
	else:
	ar = np.asarray(c)
	clean.append(ar)
	# ---- assemble the columns for the array ----
	dt_str = [i.dtype.str for i in clean]
	names = [i.strip() for i in names] # clean up leading/trailing spaces
	names = [punc_space(i) for i in names] # replace punctuation and spaces
	dts_name = list(zip(names, dt_str))
	arr = np.empty((rows-1,), dtype= dts_name)
	cnt = 0
	for i in names:
	arr[i] = clean[cnt]
	cnt +=1
	return arr
	#
	# ----------------------------------------------------------------------
	# __main__ .... code section
	if __name__ == "__main__":
	"""Optionally...
	: provide a spreadsheet for testing
	"""
	#path = "c:/test/text.xlsx"
	#arr = excel_np(path, 0)