-
-
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Filter the <w> entries of words_merged.xml.

Every ``//wordlist/w`` node whose text fails one of the checks below is
detached from the tree; the result is written to clean_words_merged.xml.
"""
from lxml import etree
import re

# Parse straight from the path so lxml opens and closes the file itself
# (the previous open(..., "r") handle was never closed).
dom = etree.parse("words_merged.xml")
wl = dom.xpath('//wordlist/w')

# Remove all words that contain characters outside letter_allowed.
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
# Negated character class: matches any run of characters that are NOT allowed.
# re.escape keeps "-" literal inside the class.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"
vowels_allowed = "[AEIOUaeiou]+"

for w in wl:
    # An empty <w/> element has text None; treat it as an empty word so the
    # checks below do not raise (it is then dropped by the "no vowel" rule).
    text = w.text or ""
    conditions = (
        re.search(letter_allowed, text) is not None  # letter not allowed
        or len(text) == 1  # length = 1
        or re.search(vowels_allowed, text) is None  # there is no vowel
        # length = 2 and has ' or - (raw string: "\-" is not a valid str escape)
        or (len(text) == 2 and re.search(r"['\-]+", text) is not None)
    )
    if conditions:
        w.getparent().remove(w)  # remove the node

# save the output into clean_words_merged.xml
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
<?xml version='1.0' encoding='UTF-8'?> | |
<ws> | |
<w f="1">ks</w> | |
<w f="65">la</w> | |
</ws> |
<?xml version="1.0" encoding="UTF-8"?> | |
<ws> | |
<w f="1">Veuchey</w> | |
<w f="1">ks</w> | |
<w f="65">la</w> | |
<w f="65">ñ</w> | |
</ws> |
Hau izan da nere kasura moldatuta ta inglesezko iruzkinekin, beste batzukin lanian ai naiz ta (AnySoftKeyboarderako da):
from lxml import etree
import re
# we will unify all the letters into a REGEX
letter_allowed = ("A", "B", "D", "C", "Ĉ", "E", "F", "G", "Ĝ", "H", "Ĥ", "I", "J", "Ĵ", "K", "L", "N", "M", "O", "P", "R", "S", "Ŝ", "T", "U", "Ŭ", "V", "Z", "a", "b", "d", "c", "ĉ", "e", "f", "g", "ĝ", "h", "ĥ", "i", "j", "ĵ", "k", "l", "n", "m", "o", "p", "r", "s", "ŝ", "t", "u", "ŭ", "v", "z", "'", "-")
letter_allowed = ''.join(letter_allowed)
# REGEX to search any other letter. re.escape keeps "-" literal no matter
# where it ends up inside the character class.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"

# Parse by path so lxml opens and closes the file itself (the previous
# open(..., "r") handle was never closed).
dom = etree.parse("words_merged.xml")
wl = dom.xpath('//wordlist/w')

for w in wl:
    # w.text is None for an empty element; search an empty string instead.
    r = re.search(letter_allowed, w.text or "")  # search letters
    if r is not None:  # if it finds any other letter, remove the word
        w.getparent().remove(w)  # remove the node

# save the output into clean_words_merged.xml
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
Zuk in dezula aipatzea naezu, hasieran iruzkin batekin edo?
from lxml import etree
import re
# Parse by path so lxml opens and closes the file itself; the previous
# open(..., "r") handle was never closed and shadowed the name "file".
dom = etree.parse("words_merged.xml")
# Every <w> element under a <wordlist>; the filter functions below iterate this list.
wl = dom.xpath('//wordlist/w')
def removeForeignWords(words=None):
    """Detach every node whose text contains a character outside the allowed set.

    Args:
        words: iterable of element nodes exposing ``.text`` and
            ``.getparent()``. Defaults to the module-level ``wl`` list, which
            keeps the original zero-argument call working.

    Returns:
        A list of the nodes that are still attached after this pass, so it
        can be fed to the next filter.
    """
    if words is None:
        words = wl
    # we will unify all the letters into a REGEX
    letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
    # REGEX to search any other letter; re.escape keeps "-" literal in the class.
    foreign = re.compile("[^" + re.escape(letter_allowed) + "]")
    kept = []
    for w in words:
        parent = w.getparent()
        if parent is None:
            # Already detached by an earlier pass over the same list;
            # calling remove() on None is what raised AttributeError.
            continue
        # w.text is None for an empty element; search an empty string instead.
        if foreign.search(w.text or "") is not None:
            parent.remove(w)  # remove the node
        else:
            kept.append(w)
    return kept
def removeMonoCharacters(words=None):
    """Detach every node whose text is exactly one character long.

    USE WITH CARE!! Some languages DO HAVE monocharacter words.

    Args:
        words: iterable of element nodes exposing ``.text`` and
            ``.getparent()``. Defaults to the module-level ``wl`` list, which
            keeps the original zero-argument call working.

    Returns:
        A list of the nodes that are still attached after this pass.
    """
    if words is None:
        words = wl
    kept = []
    for w in words:
        parent = w.getparent()
        if parent is None:
            # Already detached by an earlier pass over the same list.
            continue
        # w.text is None for an empty element; len(None) would raise.
        if len(w.text or "") == 1:
            parent.remove(w)
        else:
            kept.append(w)
    return kept
def removeWordsNoVowel(words=None):
    """Detach every node whose text contains none of the letters AEIOUaeiou.

    Args:
        words: iterable of element nodes exposing ``.text`` and
            ``.getparent()``. Defaults to the module-level ``wl`` list, which
            keeps the original zero-argument call working.

    Returns:
        A list of the nodes that are still attached after this pass.
    """
    if words is None:
        words = wl
    vowel = re.compile("[AEIOUaeiou]")
    kept = []
    for w in words:
        parent = w.getparent()
        if parent is None:
            # The node was detached by an earlier pass over the same list;
            # this is the case that raised
            # "'NoneType' object has no attribute 'remove'".
            continue
        # w.text is None for an empty element; it has no vowel either.
        if vowel.search(w.text or "") is None:
            parent.remove(w)
        else:
            kept.append(w)
    return kept
# Run the three filters one after another. Each call takes no arguments and
# iterates the module-level wl list, detaching the <w> nodes that fail its check.
# NOTE(review): wl is built once and not refreshed between calls, so a node
# detached by an earlier filter is still present in the list for the later ones.
removeForeignWords()
removeMonoCharacters()
removeWordsNoVowel()
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml
# Rename clean_words_merged.xml into words_merged.xml and use it
Hau da garatueten scripta. Halare errore bat emateit removeWordsNoVowel()
funtzioko azkeneko lerroan, w.getparent().remove(w)
:
[porru@matejaro script_clean_words]$ /usr/sbin/python /home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py
Traceback (most recent call last):
File "/home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py", line 45, in <module>
removeWordsNoVowel()
File "/home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py", line 41, in removeWordsNoVowel
w.getparent().remove(w)
AttributeError: 'NoneType' object has no attribute 'remove'
funtzioei ez diezu "wl" aldagaia pasatzen:
# Sketch of the suggested signatures: each filter receives the node list as a
# parameter and returns it, instead of relying on a module-level variable.
# The first definition is removeForeignWords (it was a duplicate
# removeMonoCharacters, although the calls use removeForeignWords).
def removeForeignWords(ws):
    #...
    return ws


def removeMonoCharacters(ws):
    #...
    return ws


def removeWordsNoVowel(ws):
    #...
    return ws
#...
# Chain the filters: each call receives the list returned by the previous one
# and rebinds the same name to the result.
# NOTE(review): "ws" is not defined in this fragment — presumably it stands
# for the script's node list; confirm against the full script.
ws = removeForeignWords(ws)
ws = removeMonoCharacters(ws)
ws = removeWordsNoVowel(ws)
return ws/wl hoi funtzioen azken lerrotzat? Zer da bueltatu behar duna? Ite duna kasu hontan lerroak ezabatu dalako
barkatu "wl" da eta ez "ws" funtzioen azken lerroan jarri behar duzu return-a, funtzio barruan dagoena enkapsulatuta dagoelako, eta kanpoko ezerekin ez dutelako zerikusirik, beraz aldagaia parametro moduan pasatu behar diozu eta aldaketetik bueltatu, programan dagoen "wl" originala aldatzeko
Hola guztitan, ezta?
def removeMonoCharacters(wl):
    """Detach every node whose text is exactly one character long.

    USE WITH CARE!! Some languages DO HAVE monocharacter words.

    Args:
        wl: iterable of element nodes exposing ``.text`` and ``.getparent()``.

    Returns:
        A new list holding only the nodes that are still attached. Returning
        the input list unchanged would hand already-detached nodes to the next
        filter, whose ``getparent()`` is None.
    """
    kept = []
    for w in wl:
        parent = w.getparent()
        if parent is None:
            # Detached by an earlier filter; nothing left to remove.
            continue
        # w.text is None for an empty element; len(None) would raise.
        if len(w.text or "") == 1:
            parent.remove(w)
        else:
            kept.append(w)
    return kept
bai, eta gero:
wl = removeMonoCharacters(wl)
Bukaerako bi puntu hoiek gabe esango nuke.
zeba berdindu behar da wl
parametrodun funtzioai deia wl
kin?
funtzioan aldatzen den WL aldagaia, funtzio barrukoa delako, ez kanpokoa, funtzio barruan gertatzen dena bertan geratzen delako, ez duelako kanpoko aldagaiekin zerikusirik, goian jarri dizut
kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako
beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D
Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Not implemented below (wanted behaviour):
# - words that start and end with ', " or - should have those stripped:
#   'hitza' or "hitza" or -hitza- --> hitza
# - words that start with ' should have it stripped: 'hitza --> hitza
from lxml import etree
import re

# Parse by path so lxml opens and closes the file itself (the previous
# open(..., "r") handle was never closed).
dom = etree.parse("words_merged.xml")
wl = dom.xpath('//wordlist/w')

# REGEX
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
# Negated class: matches any run of characters that are NOT allowed.
letter_allowed = "[^" + re.escape(letter_allowed) + "]+"
vowels_allowed = "[AEIOUaeiou]+"
# Two-letter words that are kept. 'uk' and 'ŭa' are separate entries: without
# the comma Python concatenated them into the single string 'ukŭa'.
# NOTE(review): 'hm' is listed here but has no vowel, so the "no vowel" rule
# below still removes it — confirm which rule should win.
biCharacter_allowed = ['aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu', 'da', 'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha', 'hm', 'ho', 'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni', 'nu', 'oj', 'ok', 'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf', 'uk', 'ŭa']

for w in wl:
    # An empty <w/> element has text None; treat it as an empty word so the
    # checks below do not raise (it is then dropped by the "no vowel" rule).
    text = w.text or ""
    conditions = (
        re.search(letter_allowed, text) is not None  # letter not allowed
        or len(text) == 1  # length = 1
        or re.search(vowels_allowed, text) is None  # there is no vowel
        # length = 2 and has ' or - (raw string: "\-" is not a valid str escape)
        or (len(text) == 2 and re.search(r"['\-]+", text) is not None)
        # length = 2 and not an allowed bi-character word. lower() must be
        # CALLED: the bare method object is never "in" the list, which made
        # this rule delete every two-letter word.
        or (len(text) == 2 and text.lower() not in biCharacter_allowed)
    )
    if conditions:
        w.getparent().remove(w)  # remove the node

# save the output into clean_words_merged.xml
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta
arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
from lxml import etree
import re

# Parse by path so lxml opens and closes the file itself (the previous
# open(..., "r") handle was never closed).
dom = etree.parse("words_merged.xml")

# Strip one leading and one trailing # or ' from words that both start AND
# end with one of those characters.
letters_to_remove = "'#"
# Build both patterns from letters_to_remove so the test and the substitution
# can never disagree (the substitutions used a hard-coded copy of the class).
strip_class = "[" + re.escape(letters_to_remove) + "]"
leading = re.compile("^" + strip_class)
trailing = re.compile(strip_class + "$")

wl = dom.xpath('//wordlist/w')
for w in wl:
    text = w.text
    if not text:
        # Empty element (text is None or ""): nothing to strip.
        continue
    if leading.search(text) is not None and trailing.search(text) is not None:
        # Same order as before: drop the leading character, then the trailing one.
        w.text = trailing.sub('', leading.sub('', text))

# Remove duplicates: keep the first <w> with a given text (document order) and
# detach every later one. The list is re-queried because the stripping above
# may have produced new duplicates. A single pass with a set replaces the
# per-word XPath re-query loop and no longer shadows the builtin all().
seen = set()
wl = dom.xpath('//wordlist/w')
for w in wl:
    text = w.text
    if text in seen:
        w.getparent().remove(w)
    else:
        seen.add(text)

# save the output into clean_words_merged.xml
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")
# Rename clean_words_merged.xml into words_merged.xml and use it
Honek errore hau emateit:
Probatu nahiko bazenu, nundik ahal dizut pasa .xml fitxategia?