Last active
December 19, 2023 19:15
-
-
Save kaleidawave/d51829a1209c87fa3fffb436a59c0a83 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Create the new table | |
CREATE TABLE words AS SELECT * FROM original_words; | |
-- Can use strlen instead | |
ALTER TABLE words DROP Count; | |
-- rogue row | |
DELETE FROM words WHERE NOT prefix(Definition, '"'); | |
-- Some columns have their POS leak into the definition | |
UPDATE words SET POS = POS || split_part(Definition, '"', 1) || '"', Definition = '"' || split_part(Definition, '"', 3) || '"', WHERE NOT suffix(POS, '"'); | |
-- Some didn't end in quotations which breaks the next update | |
UPDATE words SET Definition = Definition || '"' WHERE NOT suffix(Definition, '"'); | |
-- Remove quotations | |
UPDATE words SET POS = POS [2:-1], Definition = Definition [2:-1]; | |
-- Remove double spaces | |
UPDATE words SET Definition = regexp_replace(Definition, ' {2}', ' ', 'g'); | |
-- Remove duplicate rows | |
DELETE FROM words WHERE rowid IN ( SELECT MAX(rowid) FROM words GROUP BY Word, Definition HAVING COUNT(*) > 1 ); | |
-- Some characters | |
update words set word = regexp_replace(Word, '[^a-zA-Z-\x27 ]', '', 'g') where word not similar to '[a-zA-Z-\x27 ]+'; | |
-- Output | |
COPY words TO 'output.csv' (HEADER, DELIMITER ','); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment