Created
February 23, 2018 13:31
-
-
Save jobliz/0a8d8883392dab81c88d8c8a36b2d333 to your computer and use it in GitHub Desktop.
UCI retail dataset CSV to sqlalchemy. Old ideas, rough code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://archive.ics.uci.edu/ml/datasets/online+retail | |
import csv | |
import sys | |
import time | |
import datetime | |
from sqlalchemy import * | |
from sqlalchemy import create_engine | |
from sqlalchemy import sql | |
import numpy as np | |
import matplotlib.pyplot as plt | |
metadata = MetaData() | |
# entity tables | |
invoice = Table('invoice', metadata, | |
Column('number', String(16), primary_key=True), | |
Column('cancelled', Boolean), | |
Column('customer_id', String(16), ForeignKey('customer.id')), | |
Column('date', Date) | |
) | |
product = Table('product', metadata, | |
Column('stock_code', String(16), primary_key=True), | |
Column('description', String(255)), | |
Column('unit_price', Float) | |
) | |
invoice_product = Table('invoice_product', metadata, | |
Column('id', Integer, primary_key=True), | |
Column('product_stock_code', String(16), ForeignKey('product.stock_code')), | |
Column('invoice_number', String(16), ForeignKey('invoice.number')), | |
Column('quantity', Integer) | |
) | |
country = Table('country', metadata, | |
Column('id', Integer, primary_key=True), | |
Column('name', String(32)) | |
) | |
customer = Table('customer', metadata, | |
Column('id', String(16), primary_key=True), | |
Column('country_id', Integer, ForeignKey('country.id')) | |
) | |
# metric tables | |
rfm_analysis = Table('rfm_analysis', metadata, | |
Column('id', Integer, primary_key=True), | |
Column('start_date', Date), | |
Column('end_date', Date) | |
) | |
rfm_analysis_customer = Table('rfm_analysis_customer', metadata, | |
Column('id', Integer, primary_key=True), | |
Column('customer_id', String(16), ForeignKey('customer.id')), | |
Column('days_since_last_purchase', Integer), | |
Column('net_spending', Float), | |
Column('number_of_orders', Integer), | |
Column('average_spending_per_order', Float) | |
) | |
def create_db(db, name='retail.csv'): | |
""" | |
Creates a SQLite database file. | |
""" | |
# dicts, allow for faster lookups than lists | |
invoice_lookup = {} | |
product_lookup = {} | |
customer_lookup = {} | |
country_lookup = {} | |
# mass insert lists, 'payload' | |
invoice_payload = [] | |
product_payload = [] | |
customer_payload = [] | |
country_payload = [] | |
invoice_product_payload = [] | |
# counters, for new integer incremental ids | |
country_counter = 0 | |
invoice_product_counter = 0 | |
# counter of rows without customer id | |
no_customer_id_counter = 0 | |
iteration_counter = 0 | |
with open('retail.csv', 'rb') as f: | |
reader = csv.reader(f) | |
next(reader) # skip first line | |
for row in reader: | |
invoice_number = row[0] | |
product_stock_code = row[1] | |
product_description = row[2].decode('utf-8') | |
order_quantity = row[3] | |
invoice_date = datetime.datetime.strptime(row[4], '%m/%d/%Y %H:%M') | |
product_unit_price = row[5] | |
customer_id = row[6] | |
country_name = row[7] | |
# paper says "filter transaction with no postcode" | |
# perhaps this is an equivalent? | |
if customer_id == '': | |
no_customer_id_counter += 1 | |
continue | |
else: | |
iteration_counter += 1 | |
# new country. id is created here | |
if country_name not in country_lookup: | |
country_counter += 1 | |
country_payload.append({'id': country_counter, 'name': country_name}) | |
country_lookup[country_name] = country_counter | |
# new customer. id comes from data | |
if customer_id not in customer_lookup: | |
customer_payload.append({ | |
'id': customer_id, | |
'country_id': country_lookup[country_name] | |
}) | |
customer_lookup[customer_id] = customer_id | |
# new product | |
# we're assuming product prices DO NOT change across time and invoices! | |
if product_stock_code not in product_lookup: | |
product_payload.append({ | |
'stock_code': product_stock_code, | |
'description': product_description, | |
'unit_price': product_unit_price | |
}) | |
product_lookup[product_stock_code] = product_stock_code | |
# new invoice | |
if invoice_number not in invoice_lookup: | |
if invoice_number[0] == 'c': | |
cancelled = True | |
else: | |
cancelled = False | |
invoice_payload.append({ | |
'number': invoice_number, | |
'cancelled': cancelled, | |
'customer_id': customer_id, | |
'date': invoice_date | |
}) | |
invoice_lookup[invoice_number] = invoice_number | |
# fill invoice/product association table | |
invoice_product_counter += 1 | |
invoice_product_payload.append({ | |
'id': invoice_product_counter, | |
'product_stock_code': product_stock_code, | |
'invoice_number': invoice_number, | |
'quantity': order_quantity | |
}) | |
print(''.join(['Skipped ', str(no_customer_id_counter), ' iterations without customer ID'])) | |
print(''.join([str(iteration_counter), ' transaction iterations done'])) | |
print('Created entities:') | |
print(''.join(['Countries:\t', str(len(country_payload))])) | |
print(''.join(['Customers:\t', str(len(customer_payload))])) | |
print(''.join(['Products:\t', str(len(product_payload))])) | |
print(''.join(['Invoices:\t', str(len(invoice_payload))])) | |
# print("Invoice/Product entries: ", len(invoice_product_payload)) | |
print("\nSaving into database...") | |
start = time.time() | |
db.execute(country.insert(), country_payload) | |
db.execute(customer.insert(), customer_payload) | |
db.execute(product.insert(), product_payload) | |
db.execute(invoice.insert(), invoice_payload) | |
db.execute(invoice_product.insert(), invoice_product_payload) | |
duration = time.time() - start | |
print(''.join(['Saving to DB took ', str(duration), ' seconds.'])) | |
if __name__ == '__main__': | |
action = sys.argv[1] | |
engine = create_engine('sqlite:///retail.db') | |
conn = engine.connect() | |
if action == 'create_database_mysql': | |
engine = create_engine('mysql://root:clave@localhost/online_retail_dataset') | |
metadata.create_all(engine) | |
conn = engine.connect() | |
create_db(conn) | |
if action == 'create_database': | |
metadata.create_all(engine) | |
create_db(conn) | |
if action == 'first_summary': | |
rs = conn.execute("""SELECT AVG(invoices) FROM | |
(SELECT COUNT(invoice.number) AS invoices | |
FROM customer | |
JOIN invoice ON customer.id = invoice.customer_id | |
GROUP BY customer.id)""") | |
print("On average, each customer as these transactions:") | |
print(str(rs.fetchone()[0])) # fetchall is the other method! | |
# average number of distinct products contained in each transaction | |
# is 18.3, or total transactions / invoices (= 406 830 / 22190) | |
if action == 'summary_median': | |
# not sure if this is the right table to be analyzing | |
# http://stackoverflow.com/questions/5669473/sqlite-select-date-for-a-specific-month | |
# http://tiebing.blogspot.com/2011/07/sqlite-3-string-to-integer-conversion.html | |
rs = conn.execute(""" | |
SELECT | |
MIN(CAST(strftime('%m', invoice.date) as integer)) | |
FROM customer | |
JOIN invoice ON customer.id = invoice.customer_id | |
WHERE customer.country_id = 1 | |
GROUP BY customer.id | |
""") | |
max_dates = [] | |
for row in rs: | |
max_dates.append(row[0]) | |
print(np.median(max_dates)) | |
print(max(max_dates)) | |
if action == 'idea': | |
rs = conn.execute(""" | |
SELECT | |
customer.id, | |
COUNT(DISTINCT(invoice.number)) AS number_of_orders, | |
julianday('2011-12-31') - julianday(MAX(invoice.date)) AS days_since_last_purchase, | |
SUM(product.unit_price * invoice_product.quantity) AS total_spent | |
FROM | |
customer | |
JOIN invoice ON invoice.customer_id = customer.id | |
JOIN invoice_product ON invoice_product.invoice_number = invoice.number | |
JOIN product ON product.stock_code = invoice_product.product_stock_code | |
WHERE customer.country_id = 1 | |
GROUP BY customer.id | |
HAVING number_of_orders < 170 | |
""") | |
# las condiciones having vienen del articulo, para obtener un dataset similar | |
count = 0 | |
number_of_orders = [] | |
days_since_last_order = [] | |
total_order_value = [] | |
for row in rs: | |
count += 1 | |
number_of_orders.append(row[1]) | |
days_since_last_order.append(row[2]) | |
total_order_value.append(row[3]) | |
#print(''.join(['DSLO min: ', str(min(days_since_last_order))])) | |
#print(''.join(['DSLO med: ', str(np.median(days_since_last_order))])) | |
#print(''.join(['DSLO max: ', str(max(days_since_last_order))])) | |
print count | |
print(''.join(['Frequency min: ', str(min(number_of_orders))])) | |
print(''.join(['Frequency med: ', str(np.average(number_of_orders))])) | |
print(''.join(['Frequency max: ', str(max(number_of_orders))])) | |
print(''.join(['Monetary min: ', str(min(total_order_value))])) | |
print(''.join(['Monetary med: ', str(np.median(total_order_value))])) | |
print(''.join(['Monetary max: ', str(max(total_order_value))])) | |
if action == 'summary_frequency': | |
# not validated... damn | |
rs = conn.execute(""" | |
SELECT | |
customer.id, COUNT(invoice.number) AS counter | |
FROM customer | |
JOIN invoice ON invoice.customer_id = customer.id | |
WHERE customer.country_id = 1 | |
AND customer.id <> '' | |
GROUP BY customer.id | |
""") | |
frequencies = [] | |
for row in rs: | |
frequencies.append(row[1]) | |
print(len(frequencies)) | |
#print(min(frequencies)) | |
#print(np.median(frequencies)) | |
#print(max(frequencies)) | |
#plt.hist(frequencies, bins=(1, 7.7, 14.4, 21.2, 27.9, 34.6, 1000)) | |
#plt.show() | |
if action == 'show_all_countries': | |
rs = conn.execute('SELECT * FROM country') | |
for row in rs: | |
print(row) | |
if action == 'show_all_countries': | |
rs = conn.execute('SELECT * FROM country') | |
for row in rs: | |
print(row) | |
if action == 'show_top_10_customers': | |
rs = conn.execute(""" | |
SELECT | |
customer.id, | |
SUM(product.unit_price * invoice_product.quantity) | |
FROM customer | |
JOIN invoice ON customer.id = invoice.customer_id | |
JOIN invoice_product ON invoice.number = invoice_product.invoice_number | |
JOIN product ON invoice_product.product_stock_code = product.stock_code | |
GROUP BY customer.id | |
LIMIT 10 | |
""") | |
for row in rs: | |
print(row) | |
if action == 'show_total_purchased_by_customer': | |
cid = sys.argv[2] | |
sql = """SELECT DISTINCT(product.stock_code) | |
FROM customer | |
JOIN invoice ON customer.id = invoice.customer_id | |
JOIN invoice_product ON invoice.number = invoice_product.invoice_number | |
JOIN product ON invoice_product.product_stock_code = product.stock_code | |
WHERE customer.id = | |
""" | |
sql = ''.join([sql, ' "', cid, '"']) | |
rs = conn.execute(sql) | |
for row in rs: | |
print(rs) | |
if action == 'select_all_countries': | |
s = sql.select([country]) | |
result = conn.execute(s) | |
for row in result: | |
print(row) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment