lao-tseu-is-alive/clean_firstname_data.sql

## clean_firstname_data.sql
/* I got data like this in a table firstname
id    firstname gender  origin  frequency
5220	janis (1)	  m	    latvian	0
5221	janis (2)	  f	    english	0
9841	shay (1)	  m	    irish, english	0
9842	shay (2)	  m,f	  jewish	0
10002	simone (1)	f	    french	0
10003	simone (2)	m	    italian	0

and I need to remove the (1) and group the information in gender, origin and frequency like this :
5221	janis	    f,m	    english, latvian	0
9842	shay	    f,m	    irish, english, jewish	0
10003	simone	  f,m	    french, italian	0
here are the queries
*/

WITH cte AS (
    SELECT max(id)                                          as id,
           trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom,
           string_agg(DISTINCT genre, ',')                  as genre,
           string_agg(DISTINCT langage, ',')                as langage,
           max(frequence)                                   as frequence
    FROM firstname
    WHERE prenom ilike '% (%'
    GROUP BY trim(both ' ' from substring(prenom, '^[^ (]+'))
)
UPDATE firstname
SET prenom    = cte.prenom,
    genre     = cte.genre,
    langage   = cte.langage,
    frequence = cte.frequence
FROM cte
WHERE firstname.id = cte.id;

-- now cleanup gender field and remove duplicates like m,m,f

WITH cte AS (
    SELECT f1.id
         , f1.prenom
         , (SELECT string_agg(DISTINCT genres.genr, ',')
            FROM (SELECT f.id, genr
                  FROM french_firstname f,
                       Unnest(string_to_array(f.genre, ',')) AS genr
                  WHERE f.id = f1.id
                  ORDER BY genr
                 )
                     AS genres
            GROUP BY genres.id) as new_genre

    FROM french_firstname f1
    WHERE f1.prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom
                        FROM french_firstname
                        WHERE prenom ilike '% (%)'
    )
)
UPDATE french_firstname
SET genre     = cte.new_genre
FROM cte
WHERE french_firstname.id = cte.id;

-- finaly remove the other records

DELETE FROM firstname
WHERE prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom
                 FROM french_firstname
                 WHERE prenom ilike '% (%)'
)
	/* I got data like this in a table firstname
	id firstname gender origin frequency
	5220 janis (1) m latvian 0
	5221 janis (2) f english 0
	9841 shay (1) m irish, english 0
	9842 shay (2) m,f jewish 0
	10002 simone (1) f french 0
	10003 simone (2) m italian 0

	and I need to remove the (1) and group the information in gender, origin and frequency like this :
	5221 janis f,m english, latvian 0
	9842 shay f,m irish, english, jewish 0
	10003 simone f,m french, italian 0
	here are the queries
	*/

	WITH cte AS (
	SELECT max(id) as id,
	trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom,
	string_agg(DISTINCT genre, ',') as genre,
	string_agg(DISTINCT langage, ',') as langage,
	max(frequence) as frequence
	FROM firstname
	WHERE prenom ilike '% (%'
	GROUP BY trim(both ' ' from substring(prenom, '^[^ (]+'))
	)
	UPDATE firstname
	SET prenom = cte.prenom,
	genre = cte.genre,
	langage = cte.langage,
	frequence = cte.frequence
	FROM cte
	WHERE firstname.id = cte.id;

	-- now cleanup gender field and remove duplicates like m,m,f

	WITH cte AS (
	SELECT f1.id
	, f1.prenom
	, (SELECT string_agg(DISTINCT genres.genr, ',')
	FROM (SELECT f.id, genr
	FROM french_firstname f,
	Unnest(string_to_array(f.genre, ',')) AS genr
	WHERE f.id = f1.id
	ORDER BY genr
	)
	AS genres
	GROUP BY genres.id) as new_genre

	FROM french_firstname f1
	WHERE f1.prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom
	FROM french_firstname
	WHERE prenom ilike '% (%)'
	)
	)
	UPDATE french_firstname
	SET genre = cte.new_genre
	FROM cte
	WHERE french_firstname.id = cte.id;

	-- finaly remove the other records

	DELETE FROM firstname
	WHERE prenom IN (SELECT trim(both ' ' from substring(prenom, '^[^ (]+')) as prenom
	FROM french_firstname
	WHERE prenom ilike '% (%)'
	)