fabacab/xl2sqlite.py

## xl2sqlite.py
#!/usr/bin/env python
#
# Convert a set of similarly-structured .xlsx files into a SQLite DB.
#
# For example, say you have hundreds of Excel files in a directory
# called "big-analysis-project" and that each of these Excel files
# has a worksheet containing the same set of columns. Rather than
# having hundreds of separate Excel files, it would be handy to have
# all their data inside one relational database management system.
#
# Pass this script the path to the directory containing said files,
# and this script will create a SQLite database file in the current
# directory.
#
# From there, most operations are a breeze. For instance, looking at
# only the most recent timestamped entry for each user in the dataset:
#
#     SELECT MAX(Last_Updated) AS Latest_Updated, Column_1, Column_2 FROM records GROUP BY Column_2
#

import sys, glob, openpyxl, sqlite3

def main():
    # TODO: Replace this with getopt.
    if 2 == len(sys.argv) and '--help' == sys.argv[1]:
        print('USAGE:')
        print(sys.argv[0], '[directory]')

    # Choose a directory.
    try:
        where = sys.argv[1]
    except:
        where = '.'

    # Create a SQLite database for the results.
    db_filename = 'excel-data-database.db' # TODO: Set this from user input.
    conn = sqlite3.connect(db_filename)
    db_fields = []

    i = 0
    for filepath in glob.glob(where + '/*.xlsx'):
        print('Loading workbook from ' + filepath, file=sys.stderr)
        try:
            wb = openpyxl.load_workbook(filepath)
        except openpyxl.formula.tokenizer.TokenizerError as e:
            print(e, file=sys.stderr)
            continue
        ws = wb.active
        if 0 == i: # if the first file, create the SQL table
            headers = []
            for cell in ws.rows[0]:
                t = {
                    ord(' '): '_',
                    ord('"'): '',
                    ord('?'): '',
                    ord('/'): '',
                    ord('('): '',
                    ord(')'): ''
                }
                headers.append(cell.value.translate(t))

            db_fields = headers
            try:
                conn.execute('CREATE TABLE records (' + ','.join(headers) +  ')')
            except sqlite3.Error as e:
                print(e, file=sys.stderr)

        for row in ws.get_squared_range(ws.min_column, ws.min_row + 1, ws.max_column, ws.max_row):
            placeholders = []
            vals = []
            for cell in row:
                placeholders.append('?')
                vals.append(cell.value)
            sql = 'INSERT INTO records (' + ','.join(db_fields) + ') VALUES (' + ','.join(placeholders[:len(db_fields)]) + ')'
            try:
                conn.execute(sql, tuple(vals[:len(db_fields)])) # make sure only db_fields number of columns
            except sqlite3.InterfaceError as e:
                print(e, file=sys.stderr)
        conn.commit()
        i = i + 1

if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	#
	# Convert a set of similarly-structured .xlsx files into a SQLite DB.
	#
	# For example, say you have hundreds of Excel files in a directory
	# called "big-analysis-project" and that each of these Excel files
	# has a worksheet containing the same set of columns. Rather than
	# having hundreds of separate Excel files, it would be handy to have
	# all their data inside one relational database management system.
	#
	# Pass this script the path to the directory containing said files,
	# and this script will create a SQLite database file in the current
	# directory.
	#
	# From there, most operations are a breeze. For instance, looking at
	# only the most recent timestamped entry for each user in the dataset:
	#
	# SELECT MAX(Last_Updated) AS Latest_Updated, Column_1, Column_2 FROM records GROUP BY Column_2
	#

	import sys, glob, openpyxl, sqlite3

	def main():
	# TODO: Replace this with getopt.
	if 2 == len(sys.argv) and '--help' == sys.argv[1]:
	print('USAGE:')
	print(sys.argv[0], '[directory]')

	# Choose a directory.
	try:
	where = sys.argv[1]
	except:
	where = '.'

	# Create a SQLite database for the results.
	db_filename = 'excel-data-database.db' # TODO: Set this from user input.
	conn = sqlite3.connect(db_filename)
	db_fields = []

	i = 0
	for filepath in glob.glob(where + '/*.xlsx'):
	print('Loading workbook from ' + filepath, file=sys.stderr)
	try:
	wb = openpyxl.load_workbook(filepath)
	except openpyxl.formula.tokenizer.TokenizerError as e:
	print(e, file=sys.stderr)
	continue
	ws = wb.active
	if 0 == i: # if the first file, create the SQL table
	headers = []
	for cell in ws.rows[0]:
	t = {
	ord(' '): '_',
	ord('"'): '',
	ord('?'): '',
	ord('/'): '',
	ord('('): '',
	ord(')'): ''
	}
	headers.append(cell.value.translate(t))

	db_fields = headers
	try:
	conn.execute('CREATE TABLE records (' + ','.join(headers) + ')')
	except sqlite3.Error as e:
	print(e, file=sys.stderr)

	for row in ws.get_squared_range(ws.min_column, ws.min_row + 1, ws.max_column, ws.max_row):
	placeholders = []
	vals = []
	for cell in row:
	placeholders.append('?')
	vals.append(cell.value)
	sql = 'INSERT INTO records (' + ','.join(db_fields) + ') VALUES (' + ','.join(placeholders[:len(db_fields)]) + ')'
	try:
	conn.execute(sql, tuple(vals[:len(db_fields)])) # make sure only db_fields number of columns
	except sqlite3.InterfaceError as e:
	print(e, file=sys.stderr)
	conn.commit()
	i = i + 1

	if __name__ == '__main__':
	main()