Skip to content

Instantly share code, notes, and snippets.

@jpotts18
Last active August 26, 2016 23:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpotts18/fe7087f04b74ec2b4bbac0857c548813 to your computer and use it in GitHub Desktop.
Save jpotts18/fe7087f04b74ec2b4bbac0857c548813 to your computer and use it in GitHub Desktop.
Data Vault / Kimball Modeling Exercise with Sakila Database
USE sakila;
-- Film
/*
This step represents moving the data from production tables into a new database.
This stage allows data manipulation without locking tables or impacting the production system.
*/
DROP TABLE IF EXISTS src_film;
CREATE TABLE src_film LIKE film;
INSERT src_film SELECT * FROM film;
/*
The creation of hub tables insulates your data warehouse from external systems.
This step might sound like overkill but this table allows you to own your keys and provides a
mapping table for quick lookups
*/
DROP TABLE IF EXISTS hub_film;
CREATE TABLE hub_film (
`key` INT NOT NULL auto_increment primary key,
`id` INT NOT NULL,
`created_at` datetime,
`updated_at` datetime,
`source` varchar(100)
);
INSERT INTO hub_film (id, created_at, updated_at, source)
SELECT
film_id,
NOW(),
NOW(),
'sakila'
FROM src_film
ON DUPLICATE KEY UPDATE updated_at=NOW();
-- Customer
DROP TABLE IF EXISTS src_customer;
CREATE TABLE src_customer LIKE customer; -- using LIKE copies the data types and indexes over
INSERT src_customer SELECT * FROM customer;
DROP TABLE IF EXISTS hub_customer;
CREATE TABLE hub_customer (
`key` INT NOT NULL auto_increment primary key,
`id` INT NOT NULL,
`email` varchar(100),
`created_at` datetime,
`updated_at` datetime,
`source` varchar(100)
);
INSERT INTO hub_customer (id, email, created_at, updated_at, source)
SELECT
customer_id,
email,
NOW(),
NOW(),
'sakila'
FROM src_customer
ON DUPLICATE KEY UPDATE updated_at=NOW();
/*
Dimension tables contain details about each instance of an object.
For example, the items dimension table would contain a record for each item sold in the store.
It might include information such as the cost of the item, the supplier, color, sizes, and similar data.
http://databases.about.com/od/datamining/a/Facts-Vs-Dimensions.htm
*/
DROP TABLE IF EXISTS dim_customer;
CREATE TABLE dim_customer
SELECT
h.`key`,
c.*,
CONCAT(c.first_name,' ',c.last_name) as full_name
FROM src_customer c
JOIN hub_customer h ON c.customer_id = h.id;
-- Store
DROP TABLE IF EXISTS src_store;
CREATE TABLE src_store LIKE store;
INSERT src_store SELECT * FROM store;
DROP TABLE IF EXISTS hub_store;
CREATE TABLE hub_store LIKE hub_film; -- Easier to write
INSERT INTO hub_store (id, created_at, updated_at, source)
SELECT
s.store_id,
NOW(),
NOW(),
'sakila'
FROM src_store s
ON DUPLICATE KEY UPDATE updated_at=NOW();
DROP TABLE IF EXISTS dim_store;
CREATE TABLE dim_store
SELECT
h.`key`,
s.*
FROM src_store s
JOIN hub_store h on s.store_id = h.id
DROP TABLE IF EXISTS fact_rental;
CREATE TABLE fact_rental (
rental_id integer not null,
inventory_id integer not null,
customer_id integer not null,
film_id integer not null,
staff_id integer not null,
payment_id integer not null,
rental_date datetime,
payment_date datetime,
rental_return_date datetime,
rental_duration integer,
payment_amount integer,
rental_rate integer
);
INSERT INTO fact_rental (
rental_id,
inventory_id,
customer_id,
film_id,
staff_id,
payment_id,
rental_date,
payment_date,
rental_return_date,
rental_duration,
payment_amount,
rental_rate
)
SELECT
-- Dimension
r.rental_id,
r.inventory_id,
r.customer_id,
f.film_id,
r.staff_id,
p.payment_id,
-- State transitions
r.rental_date,
p.payment_date,
r.return_date as rental_return_date,
--
f.rental_duration,
p.amount as payment_amount,
f.rental_rate
FROM rental r
JOIN inventory i ON i.inventory_id = r.inventory_id
JOIN film f on i.film_id = f.film_id
JOIN payment p on p.rental_id = r.rental_id;
-- Check integrity
SELECT
(SELECT COUNT(*) FROM payment) payment_row_count,
(SELECT COUNT(*) FROM fact_rental) fact_rental_row_count,
(SELECT COUNT(*) FROM rental) rental_row_count;
-- Looks like we have 5 payments without rentals
SELECT * FROM payment WHERE rental_id IS NULL;
-- Aggregates
SELECT
COUNT(*) num_rental,
MAX(payment_amount) max_sale,
MIN(payment_amount) min_sale,
SUM(payment_amount) sales,
AVG(payment_amount) average_sale,
STD(payment_amount) std_sale
FROM fact_rental;
SELECT
f.customer_id,
COUNT(*) rentals,
SUM(payment_amount) sales,
c.*
FROM fact_rental f
JOIN customer c ON c.customer_id = f.customer_id
GROUP BY 1
ORDER BY rentals DESC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment