-
-
Save Amir-Sak/84d48c040da4f609031bf5c46336ea76 to your computer and use it in GitHub Desktop.
sql project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- SQL Project - Data Cleaning | |
-- Step 1: Selecting all data from the 'layoffs' table | |
SELECT * | |
FROM layoffs; | |
-- Remove Duplicates, Standardize Data, Handle Null Values or Blanks, and Remove any Unnecessary Columns | |
-- Creating a backup table for precautionary purposes | |
CREATE TABLE layoffs_test_backup LIKE layoffs; | |
-- Copying data from the 'layoffs' table to the backup table | |
INSERT INTO layoffs_test_backup | |
SELECT * | |
FROM layoffs; | |
-- ------------------------------------------------------------------- | |
-- Task 1: Removing Duplicates | |
-- Checking for duplicate rows in the 'layoffs' table | |
WITH duplicate_cte AS ( | |
SELECT *, | |
ROW_NUMBER() OVER ( | |
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions | |
ORDER BY company -- Ordering can be adjusted as needed | |
) AS row_num | |
FROM world_layoffs.layoffs_test | |
) | |
-- Selecting duplicate rows | |
SELECT * | |
FROM duplicate_cte | |
WHERE row_num > 1; | |
-- Example: Checking duplicates for a specific company (e.g., Casper) | |
SELECT * | |
FROM world_layoffs.layoffs_test | |
WHERE company = 'Casper'; | |
-- Deleting duplicate rows from the 'layoffs' table | |
SET SQL_SAFE_UPDATES = 0; -- Enabling deletion without WHERE clause (Use with caution) | |
DELETE FROM world_layoffs.layoffs_test | |
WHERE (company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) IN ( | |
SELECT company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions | |
FROM duplicate_cte | |
WHERE row_num > 1 | |
); | |
-- Resetting SQL_SAFE_UPDATES to its default value | |
SET SQL_SAFE_UPDATES = 1; | |
-- Verifying that duplicates are removed | |
SELECT * | |
FROM world_layoffs.layoffs_test | |
WHERE row_num > 1; | |
-- Creating a new table 'layoffs_test2' for further analysis | |
CREATE TABLE `layoffs_test2` ( | |
`company` TEXT, | |
`location` TEXT, | |
`industry` TEXT, | |
`total_laid_off` BIGINT DEFAULT NULL, | |
`percentage_laid_off` TEXT, | |
`date` TEXT, | |
`stage` TEXT, | |
`country` TEXT, | |
`funds_raised_millions` INT DEFAULT NULL, | |
`row_num` INT | |
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; | |
-- Populating 'layoffs_test2' table with cleaned data | |
INSERT INTO world_layoffs.layoffs_test2 | |
SELECT *, | |
ROW_NUMBER() OVER( | |
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) AS row_num | |
FROM world_layoffs.layoffs_test; | |
-- Viewing cleaned data in 'layoffs_test2' table | |
SELECT * | |
FROM world_layoffs.layoffs_test2 | |
WHERE row_num > 1; | |
-- Deleting unnecessary duplicate rows from 'layoffs_test2' table | |
SET SQL_SAFE_UPDATES = 0; -- Enabling deletion without WHERE clause (Use with caution) | |
DELETE FROM world_layoffs.layoffs_test2 | |
WHERE row_num > 1; | |
SET SQL_SAFE_UPDATES = 1; -- Resetting SQL_SAFE_UPDATES to its default value | |
-- Verify that duplicates are removed | |
SELECT * | |
FROM world_layoffs.layoffs | |
WHERE row_num > 1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment