ZiTAL/clean_words_merged.py

## clean_words_merged.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from lxml import etree
import re

# Open in binary mode so the parser decodes the bytes according to the XML
# declaration instead of the platform's default text encoding, and use a
# context manager so the file handle is closed once parsing is done.
with open("words_merged.xml", "rb") as file:
    dom = etree.parse(file)

# NOTE(review): this selects <w> elements that are direct children of a
# <wordlist> element. Confirm that this matches the structure of the real
# input file; if it does not, nothing is selected and nothing is removed.
wl = dom.xpath('//wordlist/w')

# Remove every word that contains a character outside letter_allowed.
# The allowed letters are folded into one negated character class so that a
# single search finds any character that is not allowed.
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"  # matches any other character
vowels_allowed = "[AEIOUaeiou]+"

# Compile the patterns once, outside the loop.
_disallowed_re = re.compile(letter_allowed)
_vowel_re = re.compile(vowels_allowed)
# Raw string: "\-" is not a valid escape sequence in a normal string literal.
_punctuation_re = re.compile(r"['\-]+")

for w in wl:
    # An element without text (e.g. <w/>) has w.text set to None; treat it as
    # an empty word so it is dropped by the "no vowel" rule instead of raising
    # a TypeError inside the regex search.
    text = w.text or ""
    conditions = (
        _disallowed_re.search(text) is not None       # letter not allowed
        or len(text) == 1                              # length = 1
        or _vowel_re.search(text) is None              # there is no vowel
        or (len(text) == 2
            and _punctuation_re.search(text) is not None)  # length = 2 and has ' or -
    )
    if conditions:
        w.getparent().remove(w)  # remove the node

# Save the filtered tree into clean_words_merged.xml.
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")

# Rename clean_words_merged.xml to words_merged.xml and use it in place of the original

## clean_words_merged.xml
<?xml version='1.0' encoding='UTF-8'?>
<ws>
    <w f="1">ks</w>
    <w f="65">la</w>
</ws>

## words_merged.xml
<?xml version="1.0" encoding="UTF-8"?>
<ws>
    <w f="1">Veuchey</w>
    <w f="1">ks</w>
    <w f="65">la</w>
    <w f="65">ñ</w>
</ws>
	#!/usr/bin/python3
	# -*- coding: utf-8 -*-

	from lxml import etree
	import re

	# Open in binary mode so the parser decodes the bytes according to the XML
	# declaration instead of the platform's default text encoding, and use a
	# context manager so the file handle is closed once parsing is done.
	with open("words_merged.xml", "rb") as file:
	    dom = etree.parse(file)

	# NOTE(review): this selects <w> elements that are direct children of a
	# <wordlist> element. Confirm that this matches the structure of the real
	# input file; if it does not, nothing is selected and nothing is removed.
	wl = dom.xpath('//wordlist/w')

	# Remove every word that contains a character outside letter_allowed.
	# The allowed letters are folded into one negated character class so that a
	# single search finds any character that is not allowed.
	letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
	letter_allowed = "[^" + re.escape(letter_allowed) + "]+"  # matches any other character
	vowels_allowed = "[AEIOUaeiou]+"

	# Compile the patterns once, outside the loop.
	_disallowed_re = re.compile(letter_allowed)
	_vowel_re = re.compile(vowels_allowed)
	# Raw string: "\-" is not a valid escape sequence in a normal string literal.
	_punctuation_re = re.compile(r"['\-]+")

	for w in wl:
	    # An element without text (e.g. <w/>) has w.text set to None; treat it as
	    # an empty word so it is dropped by the "no vowel" rule instead of raising
	    # a TypeError inside the regex search.
	    text = w.text or ""
	    conditions = (
	        _disallowed_re.search(text) is not None       # letter not allowed
	        or len(text) == 1                              # length = 1
	        or _vowel_re.search(text) is None              # there is no vowel
	        or (len(text) == 2
	            and _punctuation_re.search(text) is not None)  # length = 2 and has ' or -
	    )
	    if conditions:
	        w.getparent().remove(w)  # remove the node

	# Save the filtered tree into clean_words_merged.xml.
	dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")

	# Rename clean_words_merged.xml to words_merged.xml and use it in place of the original
	<?xml version='1.0' encoding='UTF-8'?>
	<ws>
	<w f="1">ks</w>
	<w f="65">la</w>
	</ws>
	<?xml version="1.0" encoding="UTF-8"?>
	<ws>
	<w f="1">Veuchey</w>
	<w f="1">ks</w>
	<w f="65">la</w>
	<w f="65">ñ</w>
	</ws>