Skip to content

Instantly share code, notes, and snippets.

@rpietro
Created December 18, 2012 07:10
Show Gist options
  • Save rpietro/4325744 to your computer and use it in GitHub Desktop.
Save rpietro/4325744 to your computer and use it in GitHub Desktop.
sqldf tutorial from http://goo.gl/dSuj8 - i just moved the data into the script (using dput) to make it self-contained
#Load sqldf package, which will load all others necessary
#By default, SQLite runs in background to do processing, could use others DB engines if you wanted
library("sqldf")
#Import employees data
employees <- structure(list(id = 1:20, lastname = structure(c(5L, 14L, 13L, 15L, 6L, 16L, 9L, 1L, 3L, 12L, 10L, 8L, 12L, 3L, 11L, 13L, 10L, 7L, 2L, 4L), .Label = c("a", "b", "c", "f", "g", "h", "i", "j", "n", "o", "p", "r", "s", "t", "w", "z"), class = "factor"), firstname = structure(c(12L, 6L, 5L, 12L, 11L, 15L, 9L, 18L, 17L, 7L, 8L, 10L, 4L, 14L, 19L, 16L, 1L, 13L, 2L, 3L), .Label = c("chris", "dima", "drew", "eric", "hila", "jason", "jeremy", "joe", "jon", "jowanza", "lashanda", "matt", "michael", "michelle", "randy", "rudi", "solon", "stewart", "tim"), class = "factor"), gender = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("f", "m"), class = "factor")), .Names = c("id", "lastname", "firstname", "gender"), class = "data.frame", row.names = c(NA, -20L))
#Import orders data - Matt G, Jason T, Matt W, and La Shanda H go to sushi lunch
#taking a potential client with them
orders <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 5L, 5L, 5L, NA, NA), item = structure(c(8L, 9L, 12L, 7L, 10L, 13L, 5L, 11L, 8L, 15L, 13L, 16L, 1L, 3L, 14L, 6L, 15L, 2L, 4L), .Label = c("california roll", "chicken teriaki hibachi", "cucumber roll", "diet coke", "edamame", "firecracker roll", "keystone roll", "playboy roll", "rockstar roll", "salmon sashimi", "salmon skin roll", "spider roll", "tuna sashimi", "unagi roll", "unobtanium roll", "yellowtail hand roll"), class = "factor"), quantity_ordered = c(1L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), item_cost = c(12, 10, 8, 25, 3, 2.5, 6, 8, 12, 35, 2.5, 7, 4, 3.5, 6.5, 9, 35, 7.95, 1.95)), .Names = c("id", "item", "quantity_ordered", "item_cost"), class = "data.frame", row.names = c(NA, -19L))
##### Single Table operations ####
#1 - Select all employees that are male
male_employees <- sqldf("SELECT * FROM employees WHERE gender = 'm'")
#2 - Get a count by first name
name_counts <- sqldf("SELECT firstname, COUNT (firstname) as occurances FROM employees GROUP BY firstname")
name_counts
#3 - Get a count by first name, excluding non-employees
name_counts_emponly <- sqldf("SELECT firstname, COUNT(firstname) as occurances
FROM employees
WHERE firstname != 'rudi'
GROUP BY firstname")
name_counts_emponly
#4 - Use a case statement to define a new data column of california employees,
# using "lower" to evaluate all names as lowercase to ensure case insensitivity
employees_cali <- sqldf("SELECT *,
CASE
WHEN lower(firstname) = 'stewart' THEN 1
WHEN lower(firstname) = 'hila' THEN 1
WHEN lower(firstname) = 'jon' THEN 1
WHEN lower(firstname) = 'solon' THEN 1
ELSE 0
END as cali_emp
FROM employees
")
employees_cali
#5 - Sort employees_cali by cali_emp descending, first name ascending (ascending is default)
employees_cali_sorted <- sqldf("SELECT *,
CASE
WHEN lower(firstname) = 'stewart' THEN 1
WHEN lower(firstname) = 'hila' THEN 1
WHEN lower(firstname) = 'jon' THEN 1
WHEN lower(firstname) = 'solon' THEN 1
ELSE 0
END as cali_emp
FROM employees
ORDER BY cali_emp DESC, firstname
")
##### Multi-Table operations ####
#1. Left join employees and orders table (keep all records from employees table,
# matching records from orders)
left_join <- sqldf("SELECT *
FROM employees a
LEFT JOIN orders b ON a.id=b.id
WHERE a.firstname != 'rudi'
")
#2. "Right join" isn't supported in sqldf package, but switching order of tables and left join
# is functionally equivalent
right_join_equiv <- sqldf("SELECT *
FROM orders b
LEFT JOIN employees a ON a.id=b.id
")
#3. Inner join...select only records that match both tables
inner_join <- sqldf("SELECT *
FROM employees a, orders b
WHERE a.id=b.id
")
#4. Matt G. sees bill, wonders how bill can be so low!
# Join orders to employees, find who is ordering items less than 10 dollars, sorted by lowest cost
inexpensive_items <- sqldf("SELECT *
FROM orders a
LEFT JOIN employees b ON a.id= b.id
WHERE item_cost < 10
ORDER BY item_cost
")
#4a. Realizing some things are priced by piece, figure out who spent less than $20 on any one
# type of food
inexpensive_line_items <- sqldf("SELECT *,
(item_cost * quantity_ordered) as item_level_cost
FROM orders a
LEFT JOIN employees b ON a.id= b.id
WHERE item_level_cost < 20
ORDER BY item_level_cost
")
#5. Realizing that even item level cost is wrong question, Matt G. wants to know whose total lunch < $30
# Need to use GROUP BY to get totals by person, then use HAVING instead of WHERE because
# of the use of the GROUP BY summary function (WHERE is a record level operator)
lunch_under_30 <- sqldf("SELECT lastname, firstname,
SUM(item_cost * quantity_ordered) as lunch_cost
FROM orders a
LEFT JOIN employees b ON a.id= b.id
GROUP BY a.id
HAVING lunch_cost < 30
")
#6. Matt G. wants to keep track of food consumption per person, particularly who the "lightweights"
# are in the group. Who's eating less than average on a cost basis?
# This requires a subquery to first determine the average cost of this meal, passing that value
# to the HAVING clause
#Subquery: returns a single value for the average lunch cost for employees (those with valid ID num)
#"SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
# FROM orders WHERE id != 'NA'")
lower_than_average_cost <- sqldf("SELECT lastname, firstname,
SUM(item_cost * quantity_ordered) as lunch_cost
FROM orders a
LEFT JOIN employees b ON a.id= b.id
WHERE a.id != 'NA'
GROUP BY a.id
HAVING lunch_cost < (
SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
FROM orders
WHERE id != 'NA'
)
")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment