Skip to content

Instantly share code, notes, and snippets.

@jobliz
Created February 23, 2018 13:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jobliz/0a8d8883392dab81c88d8c8a36b2d333 to your computer and use it in GitHub Desktop.
Save jobliz/0a8d8883392dab81c88d8c8a36b2d333 to your computer and use it in GitHub Desktop.
UCI retail dataset CSV to sqlalchemy. Old ideas, rough code.
# http://archive.ics.uci.edu/ml/datasets/online+retail
import csv
import sys
import time
import datetime
from sqlalchemy import *
from sqlalchemy import create_engine
from sqlalchemy import sql
import numpy as np
import matplotlib.pyplot as plt
metadata = MetaData()
# entity tables
invoice = Table('invoice', metadata,
Column('number', String(16), primary_key=True),
Column('cancelled', Boolean),
Column('customer_id', String(16), ForeignKey('customer.id')),
Column('date', Date)
)
product = Table('product', metadata,
Column('stock_code', String(16), primary_key=True),
Column('description', String(255)),
Column('unit_price', Float)
)
invoice_product = Table('invoice_product', metadata,
Column('id', Integer, primary_key=True),
Column('product_stock_code', String(16), ForeignKey('product.stock_code')),
Column('invoice_number', String(16), ForeignKey('invoice.number')),
Column('quantity', Integer)
)
country = Table('country', metadata,
Column('id', Integer, primary_key=True),
Column('name', String(32))
)
customer = Table('customer', metadata,
Column('id', String(16), primary_key=True),
Column('country_id', Integer, ForeignKey('country.id'))
)
# metric tables
rfm_analysis = Table('rfm_analysis', metadata,
Column('id', Integer, primary_key=True),
Column('start_date', Date),
Column('end_date', Date)
)
rfm_analysis_customer = Table('rfm_analysis_customer', metadata,
Column('id', Integer, primary_key=True),
Column('customer_id', String(16), ForeignKey('customer.id')),
Column('days_since_last_purchase', Integer),
Column('net_spending', Float),
Column('number_of_orders', Integer),
Column('average_spending_per_order', Float)
)
def create_db(db, name='retail.csv'):
"""
Creates a SQLite database file.
"""
# dicts, allow for faster lookups than lists
invoice_lookup = {}
product_lookup = {}
customer_lookup = {}
country_lookup = {}
# mass insert lists, 'payload'
invoice_payload = []
product_payload = []
customer_payload = []
country_payload = []
invoice_product_payload = []
# counters, for new integer incremental ids
country_counter = 0
invoice_product_counter = 0
# counter of rows without customer id
no_customer_id_counter = 0
iteration_counter = 0
with open('retail.csv', 'rb') as f:
reader = csv.reader(f)
next(reader) # skip first line
for row in reader:
invoice_number = row[0]
product_stock_code = row[1]
product_description = row[2].decode('utf-8')
order_quantity = row[3]
invoice_date = datetime.datetime.strptime(row[4], '%m/%d/%Y %H:%M')
product_unit_price = row[5]
customer_id = row[6]
country_name = row[7]
# paper says "filter transaction with no postcode"
# perhaps this is an equivalent?
if customer_id == '':
no_customer_id_counter += 1
continue
else:
iteration_counter += 1
# new country. id is created here
if country_name not in country_lookup:
country_counter += 1
country_payload.append({'id': country_counter, 'name': country_name})
country_lookup[country_name] = country_counter
# new customer. id comes from data
if customer_id not in customer_lookup:
customer_payload.append({
'id': customer_id,
'country_id': country_lookup[country_name]
})
customer_lookup[customer_id] = customer_id
# new product
# we're assuming product prices DO NOT change across time and invoices!
if product_stock_code not in product_lookup:
product_payload.append({
'stock_code': product_stock_code,
'description': product_description,
'unit_price': product_unit_price
})
product_lookup[product_stock_code] = product_stock_code
# new invoice
if invoice_number not in invoice_lookup:
if invoice_number[0] == 'c':
cancelled = True
else:
cancelled = False
invoice_payload.append({
'number': invoice_number,
'cancelled': cancelled,
'customer_id': customer_id,
'date': invoice_date
})
invoice_lookup[invoice_number] = invoice_number
# fill invoice/product association table
invoice_product_counter += 1
invoice_product_payload.append({
'id': invoice_product_counter,
'product_stock_code': product_stock_code,
'invoice_number': invoice_number,
'quantity': order_quantity
})
print(''.join(['Skipped ', str(no_customer_id_counter), ' iterations without customer ID']))
print(''.join([str(iteration_counter), ' transaction iterations done']))
print('Created entities:')
print(''.join(['Countries:\t', str(len(country_payload))]))
print(''.join(['Customers:\t', str(len(customer_payload))]))
print(''.join(['Products:\t', str(len(product_payload))]))
print(''.join(['Invoices:\t', str(len(invoice_payload))]))
# print("Invoice/Product entries: ", len(invoice_product_payload))
print("\nSaving into database...")
start = time.time()
db.execute(country.insert(), country_payload)
db.execute(customer.insert(), customer_payload)
db.execute(product.insert(), product_payload)
db.execute(invoice.insert(), invoice_payload)
db.execute(invoice_product.insert(), invoice_product_payload)
duration = time.time() - start
print(''.join(['Saving to DB took ', str(duration), ' seconds.']))
if __name__ == '__main__':
action = sys.argv[1]
engine = create_engine('sqlite:///retail.db')
conn = engine.connect()
if action == 'create_database_mysql':
engine = create_engine('mysql://root:clave@localhost/online_retail_dataset')
metadata.create_all(engine)
conn = engine.connect()
create_db(conn)
if action == 'create_database':
metadata.create_all(engine)
create_db(conn)
if action == 'first_summary':
rs = conn.execute("""SELECT AVG(invoices) FROM
(SELECT COUNT(invoice.number) AS invoices
FROM customer
JOIN invoice ON customer.id = invoice.customer_id
GROUP BY customer.id)""")
print("On average, each customer as these transactions:")
print(str(rs.fetchone()[0])) # fetchall is the other method!
# average number of distinct products contained in each transaction
# is 18.3, or total transactions / invoices (= 406 830 / 22190)
if action == 'summary_median':
# not sure if this is the right table to be analyzing
# http://stackoverflow.com/questions/5669473/sqlite-select-date-for-a-specific-month
# http://tiebing.blogspot.com/2011/07/sqlite-3-string-to-integer-conversion.html
rs = conn.execute("""
SELECT
MIN(CAST(strftime('%m', invoice.date) as integer))
FROM customer
JOIN invoice ON customer.id = invoice.customer_id
WHERE customer.country_id = 1
GROUP BY customer.id
""")
max_dates = []
for row in rs:
max_dates.append(row[0])
print(np.median(max_dates))
print(max(max_dates))
if action == 'idea':
rs = conn.execute("""
SELECT
customer.id,
COUNT(DISTINCT(invoice.number)) AS number_of_orders,
julianday('2011-12-31') - julianday(MAX(invoice.date)) AS days_since_last_purchase,
SUM(product.unit_price * invoice_product.quantity) AS total_spent
FROM
customer
JOIN invoice ON invoice.customer_id = customer.id
JOIN invoice_product ON invoice_product.invoice_number = invoice.number
JOIN product ON product.stock_code = invoice_product.product_stock_code
WHERE customer.country_id = 1
GROUP BY customer.id
HAVING number_of_orders < 170
""")
# las condiciones having vienen del articulo, para obtener un dataset similar
count = 0
number_of_orders = []
days_since_last_order = []
total_order_value = []
for row in rs:
count += 1
number_of_orders.append(row[1])
days_since_last_order.append(row[2])
total_order_value.append(row[3])
#print(''.join(['DSLO min: ', str(min(days_since_last_order))]))
#print(''.join(['DSLO med: ', str(np.median(days_since_last_order))]))
#print(''.join(['DSLO max: ', str(max(days_since_last_order))]))
print count
print(''.join(['Frequency min: ', str(min(number_of_orders))]))
print(''.join(['Frequency med: ', str(np.average(number_of_orders))]))
print(''.join(['Frequency max: ', str(max(number_of_orders))]))
print(''.join(['Monetary min: ', str(min(total_order_value))]))
print(''.join(['Monetary med: ', str(np.median(total_order_value))]))
print(''.join(['Monetary max: ', str(max(total_order_value))]))
if action == 'summary_frequency':
# not validated... damn
rs = conn.execute("""
SELECT
customer.id, COUNT(invoice.number) AS counter
FROM customer
JOIN invoice ON invoice.customer_id = customer.id
WHERE customer.country_id = 1
AND customer.id <> ''
GROUP BY customer.id
""")
frequencies = []
for row in rs:
frequencies.append(row[1])
print(len(frequencies))
#print(min(frequencies))
#print(np.median(frequencies))
#print(max(frequencies))
#plt.hist(frequencies, bins=(1, 7.7, 14.4, 21.2, 27.9, 34.6, 1000))
#plt.show()
if action == 'show_all_countries':
rs = conn.execute('SELECT * FROM country')
for row in rs:
print(row)
if action == 'show_all_countries':
rs = conn.execute('SELECT * FROM country')
for row in rs:
print(row)
if action == 'show_top_10_customers':
rs = conn.execute("""
SELECT
customer.id,
SUM(product.unit_price * invoice_product.quantity)
FROM customer
JOIN invoice ON customer.id = invoice.customer_id
JOIN invoice_product ON invoice.number = invoice_product.invoice_number
JOIN product ON invoice_product.product_stock_code = product.stock_code
GROUP BY customer.id
LIMIT 10
""")
for row in rs:
print(row)
if action == 'show_total_purchased_by_customer':
cid = sys.argv[2]
sql = """SELECT DISTINCT(product.stock_code)
FROM customer
JOIN invoice ON customer.id = invoice.customer_id
JOIN invoice_product ON invoice.number = invoice_product.invoice_number
JOIN product ON invoice_product.product_stock_code = product.stock_code
WHERE customer.id =
"""
sql = ''.join([sql, ' "', cid, '"'])
rs = conn.execute(sql)
for row in rs:
print(rs)
if action == 'select_all_countries':
s = sql.select([country])
result = conn.execute(s)
for row in result:
print(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment