Skip to content

Instantly share code, notes, and snippets.

@kaleidawave
Last active December 19, 2023 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaleidawave/d51829a1209c87fa3fffb436a59c0a83 to your computer and use it in GitHub Desktop.
Save kaleidawave/d51829a1209c87fa3fffb436a59c0a83 to your computer and use it in GitHub Desktop.
-- Create the new table
CREATE TABLE words AS SELECT * FROM original_words;
-- Can use strlen instead
ALTER TABLE words DROP Count;
-- rogue row
DELETE FROM words WHERE NOT prefix(Definition, '"');
-- Some columns have their POS leak into the definition
UPDATE words SET POS = POS || split_part(Definition, '"', 1) || '"', Definition = '"' || split_part(Definition, '"', 3) || '"', WHERE NOT suffix(POS, '"');
-- Some didn't end in quotations which breaks the next update
UPDATE words SET Definition = Definition || '"' WHERE NOT suffix(Definition, '"');
-- Remove quotations
UPDATE words SET POS = POS [2:-1], Definition = Definition [2:-1];
-- Remove double spaces
UPDATE words SET Definition = regexp_replace(Definition, ' {2}', ' ', 'g');
-- Remove duplicate rows
DELETE FROM words WHERE rowid IN ( SELECT MAX(rowid) FROM words GROUP BY Word, Definition HAVING COUNT(*) > 1 );
-- Some characters
update words set word = regexp_replace(Word, '[^a-zA-Z-\x27 ]', '', 'g') where word not similar to '[a-zA-Z-\x27 ]+';
-- Output
COPY words TO 'output.csv' (HEADER, DELIMITER ',');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment