Excel spreadsheets can be consolidated by loading them into MongoDB: every row from all the worksheets/workbooks in a folder becomes a document, the column names are derived with a mapreduce job, and the output can then be exposed as a Hive table.
#!/usr/bin/python
"""
From time to time it's necessary to consolidate data from multiple spreadsheets. If the spreadsheets lack validation
rules, users may enter values that break business rules, add columns, change the column order, etc. It can become a
messy problem.
One possible way to handle this is to load all the workbooks/worksheets into a NoSQL database (e.g. MongoDB) and explore
the dataset in its entirety in order to identify and address anomalies.
In the example below, every row is loaded into its own Mongo document.
"""
import glob
import os
from openpyxl import load_workbook
from pymongo import MongoClient
# Connect to Mongo and create a 'records' collection.
client = MongoClient()
db = client.excelGobbler
collection = db.records
# Identify all the xlsx spreadsheets in a folder.
files = glob.glob(r'/Users/awoolford/Documents/fileGobbler/*.xlsx')
# Each cell contains the attributes below, listed in the order in which openpyxl returns them.
RAW_ATTRIBUTES = ['row', 'column', 'coordinate', 'internal_value', 'data_type', 'style_id', 'number_format']
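# For illustration, a loaded row document might look like this (values invented):
#   {'mfr': {'row': 2, 'column': 1, 'coordinate': 'A2', 'internal_value': u'Acme',
#            'data_type': 's', 'style_id': 0, 'number_format': 'General'},
#    'sheetname': 'Sheet1', 'filename': 'vendors.xlsx'}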
# Loop through all the workbooks and worksheets, loading every row into its own Mongo document.
for file in files:
    wb = load_workbook(filename=file, use_iterators=True)
    for ws in wb.worksheets:
        for rownum, row in enumerate(ws.iter_rows()):
            if rownum == 0:
                # The first row contains the column headers.
                cols = [col.internal_value.encode('utf-8') for col in row]
                print cols
            else:
                # Build one document per row, keyed by header name.
                record = {}
                for colnum, col in enumerate(cols):
                    record[col] = dict(zip(RAW_ATTRIBUTES, row[colnum]))
                record['sheetname'] = ws.title
                record['filename'] = os.path.basename(file)
                collection.insert(record)
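# Quick sanity check: count the documents loaded per source file (a sketch
# assuming pymongo 2.x, whose aggregate() returns a dict with a 'result' list).
pipeline = [{'$group': {'_id': '$filename', 'rows': {'$sum': 1}}}]
for doc in collection.aggregate(pipeline)['result']:
    print doc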
"""
The following Mongo mapreduce job identifies all the column names:
> mr = db.runCommand({
... "mapreduce" : "records",
... "map" : function() {
... for (var key in this) { emit(key, null); }
... },
... "reduce" : function(key, stuff) { return null; },
... "out": "records" + "_keys"
... })
{
"result" : "records_keys",
"timeMillis" : 102,
"counts" : {
"input" : 762,
"emit" : 3048,
"reduce" : 42,
"output" : 7
},
"ok" : 1
}
> db.records_keys.find()
{ "_id" : "_c1", "value" : null }
{ "_id" : "_id", "value" : null }
{ "_id" : "filename", "value" : null }
{ "_id" : "ingestdate", "value" : null }
{ "_id" : "mfr", "value" : null }
{ "_id" : "records", "value" : null }
{ "_id" : "sheetname", "value" : null }
The next step would probably be to load this into Hive and query it with HQL, e.g. using the mongo-hadoop connector: https://github.com/mongodb/mongo-hadoop/tree/master/hive
"""