Last active
July 25, 2019 06:58
-
-
Save lao-tseu-is-alive/30a36a305fe911b4404bb7bb35d12b78 to your computer and use it in GitHub Desktop.
Using CTE to cleanup records with identical firstname like "janis (1)" and "janis (2)" and combine the information in other attributes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* I got data like this in a table firstname | |
id firstname gender origin frequency | |
5220 janis (1) m latvian 0 | |
5221 janis (2) f english 0 | |
9841 shay (1) m irish, english 0 | |
9842 shay (2) m,f jewish 0 | |
10002 simone (1) f french 0 | |
10003 simone (2) m italian 0 | |
and I need to remove the (1) and group the information in gender, origin and frequency like this : | |
5221 janis f,m english, latvian 0 | |
9842 shay f,m irish, english, jewish 0 | |
10003 simone f,m french, italian 0 | |
here are the queries | |
*/ | |
WITH cte AS ( | |
SELECT max(id) as id, | |
trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom, | |
string_agg(DISTINCT genre, ',') as genre, | |
string_agg(DISTINCT langage, ',') as langage, | |
max(frequence) as frequence | |
FROM firstname | |
WHERE prenom ilike '% (%' | |
GROUP BY trim(both ' ' from substring(prenom, '^[^ (]+')) | |
) | |
UPDATE firstname | |
SET prenom = cte.prenom, | |
genre = cte.genre, | |
langage = cte.langage, | |
frequence = cte.frequence | |
FROM cte | |
WHERE firstname.id = cte.id; | |
-- now cleanup gender field and remove duplicates like m,m,f | |
WITH cte AS ( | |
SELECT f1.id | |
, f1.prenom | |
, (SELECT string_agg(DISTINCT genres.genr, ',') | |
FROM (SELECT f.id, genr | |
FROM french_firstname f, | |
Unnest(string_to_array(f.genre, ',')) AS genr | |
WHERE f.id = f1.id | |
ORDER BY genr | |
) | |
AS genres | |
GROUP BY genres.id) as new_genre | |
FROM french_firstname f1 | |
WHERE f1.prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom | |
FROM french_firstname | |
WHERE prenom ilike '% (%)' | |
) | |
) | |
UPDATE french_firstname | |
SET genre = cte.new_genre | |
FROM cte | |
WHERE french_firstname.id = cte.id; | |
-- finaly remove the other records | |
DELETE FROM firstname | |
WHERE prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom | |
FROM french_firstname | |
WHERE prenom ilike '% (%)' | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment