Skip to content

Instantly share code, notes, and snippets.

@Amir-Sak
Last active May 9, 2024 17:42
Show Gist options
  • Save Amir-Sak/84d48c040da4f609031bf5c46336ea76 to your computer and use it in GitHub Desktop.
Save Amir-Sak/84d48c040da4f609031bf5c46336ea76 to your computer and use it in GitHub Desktop.
sql project
-- SQL Project - Data Cleaning
-- Step 1: Selecting all data from the 'layoffs' table
SELECT *
FROM layoffs;
-- Remove Duplicates, Standardize Data, Handle Null Values or Blanks, and Remove any Unnecessary Columns
-- Creating a backup table for precautionary purposes
CREATE TABLE layoffs_test_backup LIKE layoffs;
-- Copying data from the 'layoffs' table to the backup table
INSERT INTO layoffs_test_backup
SELECT *
FROM layoffs;
-- -------------------------------------------------------------------
-- Task 1: Removing Duplicates
-- Checking for duplicate rows in the 'layoffs' table
WITH duplicate_cte AS (
SELECT *,
ROW_NUMBER() OVER (
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions
ORDER BY company -- Ordering can be adjusted as needed
) AS row_num
FROM world_layoffs.layoffs_test
)
-- Selecting duplicate rows
SELECT *
FROM duplicate_cte
WHERE row_num > 1;
-- Example: Checking duplicates for a specific company (e.g., Casper)
SELECT *
FROM world_layoffs.layoffs_test
WHERE company = 'Casper';
-- Deleting duplicate rows from the 'layoffs' table
SET SQL_SAFE_UPDATES = 0; -- Enabling deletion without WHERE clause (Use with caution)
DELETE FROM world_layoffs.layoffs_test
WHERE (company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) IN (
SELECT company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions
FROM duplicate_cte
WHERE row_num > 1
);
-- Resetting SQL_SAFE_UPDATES to its default value
SET SQL_SAFE_UPDATES = 1;
-- Verifying that duplicates are removed
SELECT *
FROM world_layoffs.layoffs_test
WHERE row_num > 1;
-- Creating a new table 'layoffs_test2' for further analysis
CREATE TABLE `layoffs_test2` (
`company` TEXT,
`location` TEXT,
`industry` TEXT,
`total_laid_off` BIGINT DEFAULT NULL,
`percentage_laid_off` TEXT,
`date` TEXT,
`stage` TEXT,
`country` TEXT,
`funds_raised_millions` INT DEFAULT NULL,
`row_num` INT
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- Populating 'layoffs_test2' table with cleaned data
INSERT INTO world_layoffs.layoffs_test2
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) AS row_num
FROM world_layoffs.layoffs_test;
-- Viewing cleaned data in 'layoffs_test2' table
SELECT *
FROM world_layoffs.layoffs_test2
WHERE row_num > 1;
-- Deleting unnecessary duplicate rows from 'layoffs_test2' table
SET SQL_SAFE_UPDATES = 0; -- Enabling deletion without WHERE clause (Use with caution)
DELETE FROM world_layoffs.layoffs_test2
WHERE row_num > 1;
SET SQL_SAFE_UPDATES = 1; -- Resetting SQL_SAFE_UPDATES to its default value
-- Verify that duplicates are removed
SELECT *
FROM world_layoffs.layoffs
WHERE row_num > 1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment