-
-
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Drop unwanted <w> entries from words_merged.xml.

Reads words_merged.xml, removes every //wordlist/w element whose text
fails the checks in should_remove(), and writes the result to
clean_words_merged.xml.
"""
import re

INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = 'clean_words_merged.xml'

# Every character a word is allowed to contain.
ALLOWED_LETTERS = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Matches any single character that is NOT in ALLOWED_LETTERS.
DISALLOWED_LETTER_RE = re.compile("[^" + re.escape(ALLOWED_LETTERS) + "]")
VOWEL_RE = re.compile("[AEIOUaeiou]")
APOSTROPHE_OR_HYPHEN_RE = re.compile(r"['\-]")


def should_remove(text):
    """Return True when a word with this text must be removed.

    A word is removed when any of these holds:
      * it is empty or missing (the element has no text),
      * it contains a character outside ALLOWED_LETTERS,
      * it is exactly one character long,
      * it contains no vowel,
      * it is two characters long and contains ' or -.
    """
    if not text:
        # An element without text has w.text == None; treat it like an
        # empty word instead of crashing on len(None).
        return True
    if DISALLOWED_LETTER_RE.search(text) is not None:
        return True
    if len(text) == 1:
        return True
    if VOWEL_RE.search(text) is None:
        return True
    return len(text) == 2 and APOSTROPHE_OR_HYPHEN_RE.search(text) is not None


def main():
    """Filter the word list on disk."""
    # Imported here so should_remove() stays importable without lxml.
    from lxml import etree

    # Passing the path lets lxml open and close the file itself and honour
    # the encoding declared in the XML prolog.
    dom = etree.parse(INPUT_PATH)

    # xpath() returns a plain list, so removing nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
return ws/wl hoi funtzioen azken lerrotzat? Zer da bueltatu behar duna? Ite duna kasu hontan lerroak ezabatu dalako
barkatu "wl" da eta ez "ws" funtzioen azken lerroan jarri behar duzu return-a, funtzio barruan dagoena enkapsulatuta dagoelako, eta kanpoko ezerekin ez dutelako zerikusirik, beraz aldagaia parametro moduan pasatu behar diozu eta aldaketetik bueltatu, programan dagoen "wl" originala aldatzeko
Hola guztitan, ezta?
def removeMonoCharacters(wl):
    """Detach every one-character word from its parent and return the rest.

    USE WITH CARE!! Some languages DO HAVE monocharacter words.

    Args:
        wl: iterable of element nodes exposing ``.text`` and ``.getparent()``.

    Returns:
        A new list holding only the nodes that were kept, in their original
        order. Detached nodes are left out so that a later pass does not
        call ``getparent()`` on a node that no longer has a parent.
    """
    kept = []
    for w in wl:
        # An element without text has w.text == None; it is not a
        # one-character word, so it is kept.
        if len(w.text or "") == 1:
            w.getparent().remove(w)
        else:
            kept.append(w)
    return kept
bai, eta gero:
wl = removeMonoCharacters(wl)
Bukaerako bi puntu hoiek gabe esango nuke.
zeba berdindu behar da wl parametrodun funtzioai deia wl-kin?
funtzioan aldatzen den WL aldagaia, funtzio barrukoa delako, ez kanpokoa, funtzio barruan gertatzen dena bertan geratzen delako, ez duelako kanpoko aldagaiekin zerikusirik, goian jarri dizut
kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako
beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D
Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Drop unwanted <w> entries from words_merged.xml.

Reads words_merged.xml, removes every //wordlist/w element whose text
fails the checks in should_remove(), and writes the result to
clean_words_merged.xml.
"""
# TODO: for words that start and end with ', " or -, strip those marks:
#       'hitza' or "hitza" or -hitza- --> hitza
# TODO: for words that start with ', strip it: 'hitza --> hitza
import re

INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = 'clean_words_merged.xml'

# Every character a word is allowed to contain.
ALLOWED_LETTERS = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Matches any single character that is NOT in ALLOWED_LETTERS.
DISALLOWED_LETTER_RE = re.compile("[^" + re.escape(ALLOWED_LETTERS) + "]")
VOWEL_RE = re.compile("[AEIOUaeiou]")
APOSTROPHE_OR_HYPHEN_RE = re.compile(r"['\-]")

# The only two-letter words that are kept (compared in lower case).
# NOTE(review): 'hm' is listed here but is still removed by the no-vowel
# rule in should_remove(); confirm which of the two rules should win.
BI_CHARACTER_ALLOWED = frozenset([
    'aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu',
    'da', 'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha',
    'hm', 'ho', 'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni',
    'nu', 'oj', 'ok', 'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf', 'uk',
    'ŭa',
])


def should_remove(text):
    """Return True when a word with this text must be removed.

    A word is removed when any of these holds:
      * it is empty or missing (the element has no text),
      * it contains a character outside ALLOWED_LETTERS,
      * it is exactly one character long,
      * it contains no vowel,
      * it is two characters long and contains ' or -,
      * it is two characters long and its lower-case form is not in
        BI_CHARACTER_ALLOWED.
    """
    if not text:
        # An element without text has w.text == None; treat it like an
        # empty word instead of crashing on len(None).
        return True
    if DISALLOWED_LETTER_RE.search(text) is not None:
        return True
    if len(text) == 1:
        return True
    if VOWEL_RE.search(text) is None:
        return True
    if len(text) == 2:
        if APOSTROPHE_OR_HYPHEN_RE.search(text) is not None:
            return True
        # lower() must be *called*: comparing the bound method itself
        # against the whitelist is always "not in" and removes every
        # two-letter word.
        if text.lower() not in BI_CHARACTER_ALLOWED:
            return True
    return False


def main():
    """Filter the word list on disk."""
    # Imported here so should_remove() stays importable without lxml.
    from lxml import etree

    # Passing the path lets lxml open and close the file itself and honour
    # the encoding declared in the XML prolog.
    dom = etree.parse(INPUT_PATH)

    # xpath() returns a plain list, so removing nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta
arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Tidy the <w> entries of words_merged.xml.

Two passes over //wordlist/w:
  1. strip one leading and one trailing ' or # from words that have such a
     mark at BOTH ends;
  2. remove repeated words, keeping the first occurrence of each.

The result is written to clean_words_merged.xml.
"""
import sys
import re

INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = 'clean_words_merged.xml'

# Marks stripped from the start and end of a word.
letters_to_remove = "'#"
LEADING_MARK_RE = re.compile("^[" + letters_to_remove + "]{1}")
TRAILING_MARK_RE = re.compile("[" + letters_to_remove + "]{1}$")


def strip_wrapping_marks(text):
    """Return *text* without its wrapping marks.

    One leading and one trailing character from ``letters_to_remove`` are
    stripped, but only when the text both starts and ends with such a
    character; otherwise the text is returned unchanged.
    """
    if LEADING_MARK_RE.search(text) is None or TRAILING_MARK_RE.search(text) is None:
        return text
    text = LEADING_MARK_RE.sub('', text)
    return TRAILING_MARK_RE.sub('', text)


def drop_duplicates(nodes):
    """Detach repeated words from their parent, keeping the first of each.

    Args:
        nodes: iterable of element nodes exposing ``.text`` and
            ``.getparent()``, in document order.

    Returns:
        A list of the nodes that were kept, in their original order.
    """
    seen = set()
    kept = []
    for node in nodes:
        # Elements without text are grouped together with empty words.
        key = node.text or ""
        if key in seen:
            node.getparent().remove(node)
        else:
            seen.add(key)
            kept.append(node)
    return kept


def main():
    """Clean the word list on disk."""
    # Imported here so the helpers above stay importable without lxml.
    from lxml import etree

    # Passing the path lets lxml open and close the file itself and honour
    # the encoding declared in the XML prolog.
    dom = etree.parse(INPUT_PATH)

    # remove the leading and trailing # and '
    for w in dom.xpath('//wordlist/w'):
        if w.text is None:
            continue  # nothing to strip on an element without text
        stripped = strip_wrapping_marks(w.text)
        if stripped != w.text:
            w.text = stripped

    # remove duplicates (single pass instead of one XPath query per removal)
    drop_duplicates(dom.xpath('//wordlist/w'))

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
funtzioei ez diezu "wl" aldagaia pasatzen: