Created
September 3, 2022 14:00
-
-
Save arildm/394bd95f06ba2e12686e430b1206e392 to your computer and use it in GitHub Desktop.
Script for generating a Swedish word list for Wordle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Generate a Swedish five-letter word list (and its alphabet) for Wordle.

Reads the SALDO lexicon from ``saldo.xml`` in the current directory and writes
two files: ``sv_5words.txt`` (one upper-case word per line) and
``sv_characters.txt`` (one lower-case character per line).
"""
import xml.etree.ElementTree as ET

# Get saldo.xml at https://spraakbanken.gu.se/resurser/saldo
SALDO_PATH = 'saldo.xml'
WORDS_PATH = 'sv_5words.txt'
CHARS_PATH = 'sv_characters.txt'

WORD_LENGTH = 5

# Part-of-speech tags to keep. Most importantly, proper nouns are NOT included.
# NOTE(review): the original comment listed "nouns, adjectives, verbs, adverbs
# and pronouns", but 'pp' looks like SALDO's tag for prepositions (pronouns
# would presumably be 'pn') -- confirm which set was intended.
ACCEPTED_POS = frozenset({'nn', 'av', 'vb', 'ab', 'pp'})

# Accented upper-case letters folded to their base letter. Å, Ä and Ö are not
# listed, so they pass through unchanged.
DIACRITICS = str.maketrans('ÉÈÊÑÜ', 'EEENU')


def collect_words(root, word_length=WORD_LENGTH):
    """Collect the accepted words, and the characters they use, from a lexicon.

    Args:
        root: Root element of the parsed lexicon. Every descendant
            ``FormRepresentation`` element is inspected.
        word_length: Required number of characters per word.

    Returns:
        A ``(words, chars)`` pair of sorted lists: the upper-cased words with
        diacritics folded, and the distinct lower-case characters they contain.
    """
    words = set()
    chars = set()
    for form in root.findall(".//FormRepresentation"):
        pos_el = form.find("./feat[@att='partOfSpeech']")
        if pos_el is None or pos_el.get('val') not in ACCEPTED_POS:
            continue

        word_el = form.find("./feat[@att='writtenForm']")
        word = word_el.get('val') if word_el is not None else None
        if word is None:
            # An entry without a written form has nothing to contribute; skip
            # it instead of crashing on the missing element/attribute.
            continue

        # Skip words with punctuation.
        if len(word) != word_length or '-' in word or ':' in word:
            continue

        # Upper-case and remove diacritics.
        word = word.upper().translate(DIACRITICS)
        if len(word) != word_length:
            # Upper-casing can lengthen a string (e.g. 'ß' becomes 'SS').
            continue

        words.add(word)
        chars.update(word.lower())

    # Plain code-point sort: returns äåö in this wrong order, but it doesn't
    # matter.
    return sorted(words), sorted(chars)


def write_lines(path, lines):
    """Write each item of ``lines`` to ``path`` on its own line, as UTF-8."""
    # Explicit encoding: the output contains non-ASCII letters and must not
    # depend on the platform's default locale encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)


def main():
    """Parse the SALDO lexicon and write the word and character lists."""
    root = ET.parse(SALDO_PATH).getroot()
    words, chars = collect_words(root)
    write_lines(WORDS_PATH, words)
    write_lines(CHARS_PATH, chars)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment