Skip to content

Instantly share code, notes, and snippets.

@ZiTAL
Last active September 15, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import re
# Characters a word is allowed to contain (appears to be the Esperanto
# alphabet plus the apostrophe and the hyphen).
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Compiled once, outside the loop.  One matching character is enough for
# search(), so the original trailing "+" quantifiers are not needed.
_DISALLOWED_RE = re.compile("[^" + re.escape(letter_allowed) + "]")
_VOWEL_RE = re.compile("[AEIOUaeiou]")
_PUNCTUATION_RE = re.compile(r"['\-]")


def is_rejected_word(text):
    """Return True when a word must be dropped from the word list.

    A word is rejected when any of the following holds:

    * it is empty or missing (``None`` is what lxml reports for ``<w/>``),
    * it contains a character that is not in ``letter_allowed``,
    * it is exactly one character long,
    * it contains no vowel,
    * it is two characters long and contains ``'`` or ``-``.
    """
    if not text:
        # An empty word has no vowel, so it was already rejected; None used
        # to crash re.search() with a TypeError.
        return True
    return (
        _DISALLOWED_RE.search(text) is not None
        or len(text) == 1
        or _VOWEL_RE.search(text) is None
        or (len(text) == 2 and _PUNCTUATION_RE.search(text) is not None)
    )


def clean_wordlist(source="words_merged.xml", target="clean_words_merged.xml"):
    """Copy *source* to *target* without the rejected ``<w>`` elements."""
    # Passing the path lets lxml open, decode (per the XML declaration) and
    # close the file itself instead of leaking a locale-decoded text handle.
    dom = etree.parse(source)
    # xpath() returns a plain list, so removing nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if is_rejected_word(w.text):
            w.getparent().remove(w)  # remove the node
    # save the output into clean_words_merged.xml
    dom.write(target, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    clean_wordlist()
# Afterwards, rename clean_words_merged.xml to words_merged.xml and use that file
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
@ZiTAL
Copy link
Author

ZiTAL commented Sep 15, 2020

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
from lxml import etree
import re

# Marks that are stripped when they wrap a word on both sides.
letters_to_remove = "'#"

_LEADING_MARK_RE = re.compile("^[" + re.escape(letters_to_remove) + "]")
_TRAILING_MARK_RE = re.compile("[" + re.escape(letters_to_remove) + "]$")


def strip_enclosing_marks(text):
    """Remove one leading and one trailing ``'``/``#`` from *text*.

    The marks are only removed when the text both starts AND ends with one
    of them; otherwise *text* is returned unchanged.  ``None`` (an empty
    XML element) and the empty string are returned as they are.
    """
    if not text:
        return text
    if _LEADING_MARK_RE.search(text) and _TRAILING_MARK_RE.search(text):
        text = _LEADING_MARK_RE.sub("", text)
        text = _TRAILING_MARK_RE.sub("", text)
    return text


def dedupe_wordlist(source="words_merged.xml", target="clean_words_merged.xml"):
    """Strip wrapping marks from every word, drop duplicates, write *target*.

    Of several ``<w>`` elements with the same text, the first one in
    document order is kept and the later ones are removed.
    """
    # Let lxml open, decode and close the file itself.
    dom = etree.parse(source)

    seen = set()
    for w in dom.xpath('//wordlist/w'):
        # remove the leading and trailing # and '
        stripped = strip_enclosing_marks(w.text)
        if stripped != w.text:
            w.text = stripped

        # remove the repeated words: a single pass with a set replaces the
        # original XPath query per removal and keeps the same survivor
        key = stripped or ""
        if key in seen:
            w.getparent().remove(w)
        else:
            seen.add(key)

    # save the output into clean_words_merged.xml
    dom.write(target, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    dedupe_wordlist()

# Afterwards, rename clean_words_merged.xml to words_merged.xml and use that file

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment