Last active
September 15, 2020 16:06
-
-
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Filter the <w> entries of words_merged.xml.

An entry is removed when its text
  * contains a character outside ``letter_allowed``,
  * is exactly one character long,
  * contains no vowel (this also covers empty entries), or
  * is two characters long and contains ``'`` or ``-``.

The result is written to clean_words_merged.xml.
"""
from lxml import etree
import re

# Passing the path (instead of an open text-mode handle) lets lxml open and
# close the file itself and honour the encoding declared in the XML prolog.
dom = etree.parse("words_merged.xml")
wl = dom.xpath('//wordlist/w')

# Remove all words that contain characters not listed in letter_allowed.
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
# Negated character class: matches any run of characters that are NOT allowed.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"
vowels_allowed = "[AEIOUaeiou]+"

# Compile once; the patterns do not change inside the loop.
disallowed_re = re.compile(letter_allowed)
vowel_re = re.compile(vowels_allowed)
apostrophe_or_hyphen_re = re.compile(r"['\-]+")

for w in wl:
    # w.text is None for an empty element such as <w/>; treat it as "".
    text = w.text or ""
    conditions = (
        disallowed_re.search(text) is not None  # letter not allowed
        or len(text) == 1  # length = 1
        or vowel_re.search(text) is None  # there is no vowel
        # length = 2 and has ' or -
        or (len(text) == 2 and apostrophe_or_hyphen_re.search(text) is not None)
    )
    if conditions:
        w.getparent().remove(w)  # remove the node

# Save the output into clean_words_merged.xml.
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
from lxml import etree
import re
# Normalise the <w> entries of words_merged.xml in two passes:
#   1. strip one leading and one trailing ' or # from words wrapped in them;
#   2. drop duplicate words, keeping the first occurrence in document order.
# The result is written to clean_words_merged.xml.

# Passing the path (instead of an open text-mode handle) lets lxml open and
# close the file itself and honour the encoding declared in the XML prolog.
dom = etree.parse("words_merged.xml")

# Remove the leading and trailing # and ' characters.
letters_to_remove = "'#"
# Build both patterns from letters_to_remove so the test and the substitution
# can never disagree about which characters are stripped.
strip_class = "[" + re.escape(letters_to_remove) + "]"
leading_re = re.compile("^" + strip_class)
trailing_re = re.compile(strip_class + "$")

wl = dom.xpath('//wordlist/w')
for w in wl:
    # w.text is None for an empty element such as <w/>; nothing to strip.
    text = w.text or ""
    # Only strip when the word both starts AND ends with one of the characters.
    if leading_re.search(text) is not None and trailing_re.search(text) is not None:
        text = leading_re.sub('', text)
        w.text = trailing_re.sub('', text)

# Remove duplicates: a single pass that keeps the first node seen for each
# text and removes every later one. Iterating the XPath result is safe while
# removing, because it is a plain list detached from the tree.
seen = set()
for w in dom.xpath('//wordlist/w'):
    text = w.text or ""
    if text in seen:
        w.getparent().remove(w)
    else:
        seen.add(text)

# Save the output into clean_words_merged.xml.
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"