Skip to content

Instantly share code, notes, and snippets.

@arildm
Created September 3, 2022 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arildm/394bd95f06ba2e12686e430b1206e392 to your computer and use it in GitHub Desktop.
Save arildm/394bd95f06ba2e12686e430b1206e392 to your computer and use it in GitHub Desktop.
Script for generating a Swedish word list for Wordle
import xml.etree.ElementTree as ET
# Get saldo.xml at https://spraakbanken.gu.se/resurser/saldo
SALDO_PATH = 'saldo.xml'
WORDS_PATH = 'sv_5words.txt'
CHARS_PATH = 'sv_characters.txt'

# Only words of exactly this many characters are kept.
WORD_LENGTH = 5

# Part-of-speech tags to keep. (Most importantly, NOT proper nouns.)
# NOTE(review): the original comment described these as "nouns, adjectives,
# verbs, adverbs and pronouns", but in SALDO's tagset 'pp' appears to be
# prepositions ('pn' would be pronouns) -- confirm which was intended.
ALLOWED_POS = frozenset({'nn', 'av', 'vb', 'ab', 'pp'})

# Diacritics folded to their base letter (applied after upper-casing).
# Å, Ä and Ö are deliberately absent: they are kept as distinct letters.
DIACRITICS = str.maketrans({'É': 'E', 'È': 'E', 'Ê': 'E', 'Ñ': 'N', 'Ü': 'U'})


def collect_words(root, length=WORD_LENGTH, allowed_pos=ALLOWED_POS):
    """Collect the word list and its alphabet from a parsed SALDO tree.

    Args:
        root: An ElementTree element; every ``FormRepresentation``
            descendant is inspected for its ``partOfSpeech`` and
            ``writtenForm`` ``feat`` children.
        length: Required length of the written form (checked before
            upper-casing and diacritic folding).
        allowed_pos: Container of accepted ``partOfSpeech`` values.

    Returns:
        A ``(words, chars)`` tuple: ``words`` is the sorted list of unique
        upper-cased words, ``chars`` the sorted list of unique lower-cased
        characters occurring in those words.
    """
    words = set()
    chars = set()
    for form in root.findall(".//FormRepresentation"):
        pos_el = form.find("./feat[@att='partOfSpeech']")
        word_el = form.find("./feat[@att='writtenForm']")
        # Entries lacking either feature are skipped rather than crashing.
        if pos_el is None or word_el is None:
            continue
        if pos_el.get('val') not in allowed_pos:
            continue
        word = word_el.get('val')
        if word is None or len(word) != length:
            continue
        # Skip words with punctuation.
        if '-' in word or ':' in word:
            continue
        # Remove diacritics.
        word = word.upper().translate(DIACRITICS)
        words.add(word)
        chars.update(word.lower())
    # Plain code-point sort puts ä before å (not Swedish alphabetical
    # order); the original script noted this does not matter.
    return sorted(words), sorted(chars)


def main():
    """Read saldo.xml and write the word list and character list files."""
    root = ET.parse(SALDO_PATH).getroot()
    words, chars = collect_words(root)
    # Explicit UTF-8 so that Å/Ä/Ö are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(WORDS_PATH, 'w', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in words)
    with open(CHARS_PATH, 'w', encoding='utf-8') as f:
        f.writelines(char + '\n' for char in chars)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment