Last active
September 15, 2020 16:06
-
-
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
from lxml import etree | |
import re | |
# Remove every word that contains a character outside the allowed alphabet
# (listed in letter_allowed), is a single character, has no vowel, or is a
# two-character word containing ' or -.
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
# Negated character class: matches any run of characters NOT listed above.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"
vowels_allowed = "[AEIOUaeiou]+"


def is_rejected_word(text):
    """Return True when *text* must be removed from the word list.

    A word is rejected when any of these holds:
      * it is empty or None (an element without text has no vowel either),
      * it contains a character outside the allowed alphabet,
      * it is exactly one character long,
      * it contains none of the vowels A, E, I, O, U (either case),
      * it is two characters long and contains ' or -.
    """
    if not text:
        # lxml reports the text of an empty element as None; re.search would
        # raise TypeError on it, so treat it like an empty (vowel-less) word.
        return True
    if re.search(letter_allowed, text) is not None:  # letter not allowed
        return True
    if len(text) == 1:  # length = 1
        return True
    if re.search(vowels_allowed, text) is None:  # there is no vowel
        return True
    # length = 2 and has ' or -
    return len(text) == 2 and re.search(r"['-]", text) is not None


def main():
    """Filter words_merged.xml and save the result as clean_words_merged.xml."""
    # Hand lxml the path instead of an open text-mode handle: lxml then opens
    # and closes the file itself and decodes it according to the XML
    # declaration rather than the locale's default text encoding.
    dom = etree.parse("words_merged.xml")
    for w in dom.xpath('//wordlist/w'):
        if is_rejected_word(w.text):
            w.getparent().remove(w)  # remove the node
    # save the output into clean_words_merged.xml
    dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version='1.0' encoding='UTF-8'?> | |
<ws> | |
<w f="1">ks</w> | |
<w f="65">la</w> | |
</ws> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<ws> | |
<w f="1">Veuchey</w> | |
<w f="1">ks</w> | |
<w f="65">la</w> | |
<w f="65">ñ</w> | |
</ws> |
kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako
beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D
Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# ', " edo - hizkiakin hasi ta bukatze dianei hoiek kendu: 'hitza' edo "hitza" -hitza- --> hitza
# ' hizkiakin haste dianei hoiek kendu: 'hitza --> hitza
from lxml import etree
import re
# REGEX
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
# Negated character class: matches any run of characters NOT listed above.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"
vowels_allowed = "[AEIOUaeiou]+"
# Two-letter words that are kept; compared against the lower-cased word.
# NOTE(review): 'hm' is listed here but contains no vowel, so the no-vowel
# rule below still removes it - confirm which behaviour is wanted.
biCharacter_allowed = [
    'aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu', 'da',
    'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha', 'hm', 'ho',
    'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni', 'nu', 'oj', 'ok',
    'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf',
    # These two were fused into the single entry 'ukŭa' by a missing comma.
    'uk', 'ŭa',
]


def is_rejected_word(text):
    """Return True when *text* must be removed from the word list.

    A word is rejected when any of these holds:
      * it is empty or None (an element without text has no vowel either),
      * it contains a character outside the allowed alphabet,
      * it is exactly one character long,
      * it contains none of the vowels A, E, I, O, U (either case),
      * it is two characters long and contains ' or -,
      * it is two characters long and its lower-cased form is not listed in
        biCharacter_allowed.
    """
    if not text:
        # lxml reports the text of an empty element as None; re.search would
        # raise TypeError on it, so treat it like an empty (vowel-less) word.
        return True
    if re.search(letter_allowed, text) is not None:  # letter not allowed
        return True
    if len(text) == 1:  # length = 1
        return True
    if re.search(vowels_allowed, text) is None:  # there is no vowel
        return True
    if len(text) == 2 and re.search(r"['-]", text) is not None:
        # length = 2 and has ' or - (kept as its own rule so that the
        # whitelist rule below can be dropped independently)
        return True
    # length = 2 and not an allowed bi-character word.  lower() must be
    # CALLED: comparing the bound method itself never matches any entry, which
    # removed every two-letter word.
    return len(text) == 2 and text.lower() not in biCharacter_allowed


def main():
    """Filter words_merged.xml and save the result as clean_words_merged.xml."""
    # Hand lxml the path instead of an open text-mode handle: lxml then opens
    # and closes the file itself and decodes it according to the XML
    # declaration rather than the locale's default text encoding.
    dom = etree.parse("words_merged.xml")
    for w in dom.xpath('//wordlist/w'):
        if is_rejected_word(w.text):
            w.getparent().remove(w)  # remove the node
    # save the output into clean_words_merged.xml
    dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta
arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
from lxml import etree
import re
# Marks stripped from the start and end of a word: ' and #
letters_to_remove = "'#"
# Both patterns are built from letters_to_remove so that the test and the
# substitution can never drift apart.
_LEADING_MARK = re.compile("^[" + re.escape(letters_to_remove) + "]")
_TRAILING_MARK = re.compile("[" + re.escape(letters_to_remove) + "]$")


def strip_wrapping_marks(text):
    """Remove one leading and one trailing mark when *text* has both.

    A mark is any character of letters_to_remove.  Text that carries a mark
    on one side only is returned unchanged, and so is empty or None text.
    """
    if not text:
        return text
    if _LEADING_MARK.search(text) is None or _TRAILING_MARK.search(text) is None:
        return text
    return _TRAILING_MARK.sub('', _LEADING_MARK.sub('', text))


def duplicate_indexes(texts):
    """Return the positions of repeated entries in *texts*.

    The first occurrence of each value is kept; every later occurrence is
    reported, in ascending order.
    """
    seen = set()
    repeated = []
    for index, text in enumerate(texts):
        if text in seen:
            repeated.append(index)
        else:
            seen.add(text)
    return repeated


def main():
    """Clean words_merged.xml and save the result as clean_words_merged.xml."""
    # Hand lxml the path instead of an open text-mode handle: lxml then opens
    # and closes the file itself and decodes it according to the XML
    # declaration rather than the locale's default text encoding.
    dom = etree.parse("words_merged.xml")
    words = dom.xpath('//wordlist/w')

    # strip the leading and trailing # and '
    for w in words:
        w.text = strip_wrapping_marks(w.text)

    # remove the duplicates: a single pass over the already-fetched nodes
    # replaces one XPath query per removed node.
    for index in duplicate_indexes([w.text for w in words]):
        node = words[index]
        node.getparent().remove(node)

    # save the output into clean_words_merged.xml
    dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
funtzioan aldatzen den WL aldagaia, funtzio barrukoa delako, ez kanpokoa, funtzio barruan gertatzen dena bertan geratzen delako, ez duelako kanpoko aldagaiekin zerikusirik, goian jarri dizut