Skip to content

Instantly share code, notes, and snippets.

@zhangce
Created March 11, 2022 06:59
Show Gist options
  • Save zhangce/8b71ad118e80d48da4ba8b53007cef57 to your computer and use it in GitHub Desktop.
Save zhangce/8b71ad118e80d48da4ba8b53007cef57 to your computer and use it in GitHub Desktop.
------------------------------------
-- Script for Interactive Session --
------------------------------------
-- Have messages displayed in English
SET lc_messages TO 'en_US.UTF-8';
-- CREATE Employees -----------------------------------------------------------
CREATE TABLE employees (
emp_no INT NOT NULL,
birth_date DATE NULL,
first_name VARCHAR(14) NOT NULL,
last_name VARCHAR(16) NOT NULL,
gender CHAR(1) NOT NULL DEFAULT '?',
hire_date DATE NOT NULL,
PRIMARY KEY (emp_no)
);
SELECT * FROM employees;
INSERT INTO employees
VALUES (10001, '1980-10-02', 'Ce', 'Zhang', 'M', '2016-01-01');
INSERT INTO employees
(emp_no, birth_date, first_name, last_name, gender, hire_date)
VALUES (10002, '1980-10-02', 'Gustavo', 'Alonso', 'M', '2016-01-01');
INSERT INTO employees
(emp_no, first_name, last_name, gender, hire_date)
VALUES (10003, 'Ingo', 'Mueller', 'M', '2016-01-01');
SELECT * FROM employees;
-- Things that can go wrong:
-- Insert the same emp_no again (forbidden by primary key)
INSERT INTO employees
VALUES (10001, '1953-09-02', 'Georgi', 'Facello', 'M', '1986-06-26');
-- Wrong type (int)
INSERT INTO employees
VALUES ('asdf', '1953-09-02', 'Georgi', 'Facello', 'M', '1986-06-26');
-- Wrong type (date)
INSERT INTO employees
VALUES (10004, 'hello', 'Georgi', 'Facello', 'M', '1986-06-26');
-- Wrong type (CHAR length)
INSERT INTO employees
VALUES (10004, '1953-09-02', 'Georgi', 'Facello', 'MM', '1986-06-26');
-- Missing value (no first name)
INSERT INTO employees
(emp_no, birth_date, last_name, gender, hire_date)
VALUES (10004, '1953-09-02', 'Facello', 'M', '1986-06-26');
-- Missing value with DEFAULT works!
INSERT INTO employees
(emp_no, birth_date, first_name, last_name, hire_date)
VALUES (10004, '1953-09-02', 'Georgi', 'Facello', '1986-06-26');
SELECT * FROM employees;
-- More on SELECT -------------------------------------------------------------
SELECT first_name, last_name FROM employees; -- projection
SELECT first_name AS fn, last_name AS ln FROM employees; -- rename columns
SELECT 'Hello', first_name, last_name FROM employees; -- select constants
SELECT emp_no, emp_no FROM employees; -- selection columns several times
SELECT emp_no AS no1, emp_no AS no2 FROM employees; -- rename
SELECT emp_no AS no1, emp_no * 2 AS no2 FROM employees; -- use expressions
SELECT emp_no AS no1, emp_no * 2 AS no2, emp_no + emp_no FROM employees; -- use anonymous expressions
SELECT first_name || ' ' || last_name FROM employees; -- use functions
SELECT first_name || ' ' || last_name AS name FROM employees; -- use functions
SELECT birth_date, birth_date + 5 FROM employees; -- special expressions for dates
SELECT AGE(birth_date) FROM employees; -- special functions for dates
-- CREATE Departments ---------------------------------------------------------
CREATE TABLE departments (
dept_no INT NOT NULL,
dept_name VARCHAR(40) NOT NULL,
PRIMARY KEY (dept_no)
);
SELECT * FROM departments;
INSERT INTO departments VALUES (1, 'Sales');
INSERT INTO departments (dept_name, dept_no) VALUES ('Human Resources', 2);
INSERT INTO departments VALUES (3, 'Marketing'), (4, 'Legal');
INSERT INTO departments
SELECT 5, 'Development'; -- use SELECT to generate new row
SELECT * FROM departments;
-- Create dept_emp, which connects employees with departments
CREATE TABLE dept_emp (
emp_no INT NOT NULL,
dept_no INT NOT NULL,
from_date DATE NOT NULL,
to_date DATE NOT NULL,
PRIMARY KEY (emp_no, dept_no, from_date)
);
SELECT * FROM dept_emp;
-- Use SELECT to connect every employee with a department
INSERT INTO dept_emp
SELECT emp_no, 5 AS dept_no, NOW(), '9999-01-01' FROM employees;
SELECT * FROM dept_emp;
-- Things that can go wrong:
-- Insert the same records again (forbidden by primary key)
INSERT INTO dept_emp
SELECT emp_no, 5 AS dept_no, NOW(), '9999-01-01' FROM employees;
-- Changing one of the three key attributes is enough
INSERT INTO dept_emp
SELECT emp_no, 5 AS dept_no, '1999-01-01', NOW() FROM employees;
-- Insert a record of a non-existing employee/department
-- currently works (we do not use no foreign keys yes!)
INSERT INTO dept_emp VALUES (10001, 6, '1980-01-01', '1999-01-01');
-- JOINS ----------------------------------------------------------------------
SELECT * FROM employees;
SELECT * FROM departments;
SELECT * FROM dept_emp;
SELECT * FROM employees, departments, dept_emp; -- start with a Cartesian product
SELECT *
FROM employees, departments, dept_emp
WHERE employees.emp_no = dept_emp.emp_no
AND dept_emp.dept_no = departments.dept_no; -- select only those rows that join
-- Project to most interesting columns
SELECT emp_no, first_name, last_name, dept_no, dept_name -- does not work: ambiguous columns!
FROM employees, departments, dept_emp
WHERE employees.emp_no = dept_emp.emp_no
AND dept_emp.dept_no = departments.dept_no;
SELECT employees.emp_no, first_name, last_name,
departments.dept_no, dept_name
from_date, to_date
FROM employees, departments, dept_emp -- better :)
WHERE employees.emp_no = dept_emp.emp_no
AND dept_emp.dept_no = departments.dept_no;
-- This returns too many records:
-- In our schema, time plays a role!
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date
FROM employees, departments, dept_emp
WHERE employees.emp_no = dept_emp.emp_no
AND dept_emp.dept_no = departments.dept_no
AND to_date > NOW(); -- only join on *current* dept_emp entries
-- Join variants
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date
FROM employees
CROSS JOIN departments -- make cartesian product explicit
CROSS JOIN dept_emp -- make cartesian product explicit
WHERE employees.emp_no = dept_emp.emp_no
AND dept_emp.dept_no = departments.dept_no
AND to_date > NOW();
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date
FROM employees
JOIN dept_emp ON employees.emp_no = dept_emp.emp_no -- make join explicit
JOIN departments ON dept_emp.dept_no = departments.dept_no -- make join explicit
WHERE
to_date > NOW();
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date
FROM employees
JOIN dept_emp USING (emp_no) -- join on equality
JOIN departments USING (dept_no) -- join on equality
WHERE
to_date > NOW();
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date
FROM employees
NATURAL JOIN dept_emp -- join on equality
NATURAL JOIN departments -- of columns with the same name
WHERE
to_date > NOW();
-- Aggregations and nesting ---------------------------------------------------
DROP TABLE employees;
DROP TABLE departments;
DROP TABLE dept_emp;
SELECT * FROM employees; -- does not work: table does not exist anymore!
-- (insert the data sets from our website here)
SELECT * FROM employees; -- should display something again
SELECT COUNT(*) FROM employees; -- 1024
SELECT MAX(emp_no) FROM employees;
-- some last names occur several times,
-- so number of distinct values is less than 1024
SELECT COUNT(DISTINCT last_name) FROM employees;
SELECT DISTINCT last_name FROM employees;
-- By sorting, we can see that SELECT DISTINCT actually removes duplicates
SELECT DISTINCT last_name
FROM employees
ORDER BY last_name DESC;
-- Side note: we can also sort by several columns
SELECT first_name, last_name
FROM employees
ORDER BY last_name ASC, first_name DESC; -- even in different directions
-- How to find how often each last name occurs?
SELECT last_name, COUNT(*)
FROM employees
GROUP BY last_name;
SELECT last_name, COUNT(*)
FROM employees
GROUP BY last_name
ORDER BY COUNT(*) DESC; -- common names on top
-- How could we show any first_name per group?
-- Apply an aggregate function
SELECT MAX(first_name) last_name, COUNT(*) as c
FROM employees
GROUP BY last_name
ORDER BY c DESC;
SELECT (ARRAY_AGG(first_name))[1], last_name, COUNT(*) as c
FROM employees
GROUP BY last_name
ORDER BY c DESC;
-- How to see only names occurring 4 times?
SELECT last_name, COUNT(*)
FROM employees
GROUP BY last_name
-- WHERE? No, WHERE is done *before* the grouping!
HAVING COUNT(*) = 4;
/*
Think of the WHERE predicate as a function applied on a single tuple at a time
which come from the FROM clause.
The COUNT colomn in this example does not exist yet when the result in the FROM clause is produced.
*/
-- Why does this work?
SELECT * FROM
(
SELECT last_name, COUNT(*) AS c
FROM employees
GROUP BY last_name
) AS r1
WHERE c = 4; -- We can use WHERE by using a subquery
-- How to select the most common name (without using the constant 4)?
-- Step 1: compute the number of occurrences of the most common name(s)
SELECT MAX(c)
FROM
(
SELECT DISTINCT COUNT(*) AS c
FROM employees
GROUP BY last_name
) AS r2; -- returns 4
-- Step 2: use that query as subquery
SELECT * FROM
(
SELECT last_name, COUNT(*) AS c
FROM employees
GROUP BY last_name
) AS r1
WHERE c = (
SELECT MAX(c)
FROM
(
SELECT DISTINCT COUNT(*) AS c
FROM employees
GROUP BY last_name
) AS r2
);
-- Using WITH clause
WITH name_counts AS
(
SELECT last_name, COUNT(*) as count
FROM employees
GROUP BY last_name
), max_name_count AS
(
SELECT MAX(count) as max_count
FROM name_counts --we can use name_counts here because it was defined before
)
SELECT last_name, count
FROM name_counts, max_name_count
WHERE count = max_count
ORDER BY max_count DESC;
-- Employees with the min salary per department
/*
Always try to reduce the complexity by dissecting the problem in smaller steps
*/
-- Step 1: employees, salary, department
SELECT emp_no, salary, dept_no
FROM employees
JOIN salaries USING(emp_no)
JOIN dept_emp USING(emp_no)
ORDER BY emp_no;
-- Step 2: min salary, department
SELECT MIN(salary) AS sal, dept_no
FROM employees
JOIN salaries USING(emp_no)
JOIN dept_emp USING(emp_no)
GROUP BY dept_no
ORDER BY sal;
-- Step 3: Join on common columns
WITH emp_sal_dept AS
(
SELECT emp_no, salary, dept_no
FROM employees as e
JOIN salaries as s USING(emp_no)
JOIN dept_emp as de USING(emp_no)
ORDER BY emp_no
), min_salaries AS
(
SELECT MIN(salary) as salary, dept_no
FROM emp_sal_dept
GROUP BY dept_no
ORDER BY salary
)
SELECT emp_no, dept_no, salary
FROM emp_sal_dept
JOIN min_salaries USING(dept_no, salary) -- pay attention, we join on both attributes
ORDER BY dept_no;
/*
The above query is not accurate because the salaries and employments have to and from dates associated
with them. We need to join only the employements and the salaries which coexisted at the same time for a particular employee.
Drawing for emp_no = 10004, brackets means start and the end of the interval:
salaries: [ sal_1][ sal_2 ]
dept_emp: [ dept_1 ][ dept_2 ]
Now image employee 10004 always got a raise so sal_1 < sal_2 < sal_3.
If we join without looking at the temporal relation we would get as the result:
emp_no | salary | dept_no
10004 | sal_1 | dept_1
10004 | sal_1 | dept_2 <--wrong
10004 | sal_2 | dept_1
10004 | sal_2 | dept_2
The sal_1 dept_2 combination never happend if we would look at the temporal relationship, the employee was earning sal_2
during his work at dept_2. This means that once when we compute the minimal salary per department the entry (sal_1, dept_2)
might pull the minimal salary down for dept_2 even though there was never an employee working for that department for that
salary
*/
-- Correct query
WITH emp_sal_dept AS
(
SELECT emp_no, salary, dept_no
FROM employees as e
JOIN salaries as s USING(emp_no)
JOIN dept_emp as de USING(emp_no)
WHERE (s.from_date, s.to_date) OVERLAPS (de.from_date, de.to_date) -- This eliminates incorrect entries
ORDER BY emp_no
), min_salaries AS
(
SELECT MIN(salary) as salary, dept_no
FROM emp_sal_dept
GROUP BY dept_no
ORDER BY salary
)
SELECT emp_no, dept_no, salary
FROM emp_sal_dept
JOIN min_salaries USING(dept_no, salary)
ORDER BY dept_no;
-- Correlated subquery in select
-- Show employees, their salary and the average salary for that department
-- Step 1: employee, salary, department:
SELECT emp_no,
salary,
dept_no
FROM employees AS e
JOIN salaries AS s USING(emp_no)
JOIN dept_emp AS de USING(emp_no)
WHERE (s.from_date,
s.to_date) OVERLAPS (de.from_date,
de.to_date)
ORDER BY emp_no;
-- Step 2: average salary for a particular department
WITH emp_sal_dept AS
( SELECT emp_no,
salary,
dept_no
FROM employees AS e
JOIN salaries AS s USING(emp_no)
JOIN dept_emp AS de USING(emp_no)
WHERE (s.from_date,
s.to_date) OVERLAPS (de.from_date,
de.to_date)
ORDER BY emp_no)
SELECT avg(salary) AS salary
FROM emp_sal_dept
WHERE dept_no = 'd005';
-- Step 3: add the average to the employee, salary, department table
WITH emp_sal_dept AS
( SELECT emp_no,
salary,
dept_no
FROM employees AS e
JOIN salaries AS s USING(emp_no)
JOIN dept_emp AS de USING(emp_no)
WHERE (s.from_date,
s.to_date) OVERLAPS (de.from_date,
de.to_date)
ORDER BY emp_no)
SELECT emp_no,
salary,
( SELECT avg(salary) AS avg_salary
FROM emp_sal_dept
WHERE emp_sal_dept.dept_no = emp_sal_dept_o.dept_no --reference to the outer relation
) , dept_no
FROM emp_sal_dept AS emp_sal_dept_o;
-- Alternatively, solve with a join
WITH emp_sal_dept AS
( SELECT emp_no,
salary,
dept_no
FROM employees AS e
JOIN salaries AS s USING(emp_no)
JOIN dept_emp AS de USING(emp_no)
WHERE (s.from_date,
s.to_date) OVERLAPS (de.from_date,
de.to_date)
ORDER BY emp_no),
avg_salaries AS
( SELECT dept_no,
avg(salary) AS avg_salary
FROM emp_sal_dept
GROUP BY dept_no)
SELECT emp_no,
salary,
avg_salary,
dept_no
FROM emp_sal_dept
JOIN avg_salaries USING(dept_no);
-- Correlated query in the WHERE
-- Oldest Managers per department
WITH max_age_dept AS
( SELECT max(age(birth_date)) AS max_age,
dept_no
FROM employees AS e
JOIN dept_manager AS dm USING(emp_no)
GROUP BY dept_no
ORDER BY dept_no)
SELECT emp_no,
age(birth_date)
FROM employees AS eo
JOIN dept_manager AS dmo USING(emp_no)
WHERE age(eo.birth_date) >=
( SELECT max_age
FROM max_age_dept
WHERE dept_no = dmo.dept_no );
-- Exercise: implement the above query with a join
/*
Try to write a correlated subquery in the FROM clause
Why does this not work?
*/
WITH emp_sal_dept AS
( SELECT emp_no,
salary,
dept_no
FROM employees AS e
JOIN salaries AS s USING(emp_no)
JOIN dept_emp AS de USING(emp_no)
WHERE (s.from_date,
s.to_date) OVERLAPS (de.from_date,
de.to_date)
ORDER BY emp_no)
SELECT emp_no,
salary,
avg_salary,
dept_no
FROM emp_sal_dept AS emp_sal_dept_o,
( SELECT avg(salary) AS avg_salary
FROM emp_sal_dept
WHERE emp_sal_dept.dept_no = emp_sal_dept_o.dept_no ) AS average_salary;
/*
Answer: the subquery has to be evaluated before the outer query,
so emp_sal_dept_o is not available to the subquery yet
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment