Skip to content

Instantly share code, notes, and snippets.

@ZiTAL
Last active September 15, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import re
file = open("words_merged.xml", "r")
dom = etree.parse(file)
wl = dom.xpath('//wordlist/w')
# Remove all words that have not allowed characters (specified in letter_allowed)
# we will unify all the letters into a REGEX
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-" # REGEX to search any other letter
letter_allowed = "[^"+re.escape(letter_allowed)+"]+"
vowels_allowed = "[AEIOUaeiou]+"
for w in wl:
conditions = (
re.search(letter_allowed, w.text) != None or # letter not allowed
len(w.text)==1 or # lengh = 1
re.search(vowels_allowed, w.text) == None or # there is no vowel
(len(w.text)==2 and re.search("['\-]+", w.text)) # length = 2 and has ' or -
)
if conditions:
w.getparent().remove(w) # remove the node
dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml
# Rename clean_words_merged.xml into words_merged.xml and use it
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
@Porrumentzio
Copy link

Porrumentzio commented Sep 1, 2020

# Tuple with legal characters
l = ("A", "B", "D", "C", "Ĉ", "E", "F", "G", "Ĝ", "H", "Ĥ", "I", "J", "Ĵ", "K", "L", "N", "M", "O", "P", "R", "S", "Ŝ", "T", "U", "Ŭ", "V", "Z", "a", "b", "d", "c", "ĉ", "e", "f", "g", "ĝ", "h", "ĥ", "i", "j", "ĵ", "k", "l", "n", "m", "o", "p", "r", "s", "ŝ", "t", "u", "ŭ", "v", "z", "'", "-")

from lxml import etree

f = open("words_merged.xml", "r")
dom = etree.parse(f)
wl = dom.xpath('//wordlist/w')

result = []

for w in wl:
	for i in w.text:
		if i not in l:
    			result.append(w.text) # zerrendan gehitu ordez, lerro hoi ezabatu beharko luke, baño hoi gerogo. Oain hitz garbiko karaktere bat l-n ez bado, hitz hoi zerrendan gehitzea naet

print(result)

Honek errore hau emateit:

    result.append(w.text) # zerrendan gehitu ordez, lerro hoi ezabatu beharko luke, baño hoi gerogo. Oain hitz garbiko karaktere bat l-n ez bado, hitz hoi zerrendan gehitzea naet
                                                                                                                                                                                  ^
TabError: inconsistent use of tabs and spaces in indentation

Probatu nahiko bazenu, nundik ahal dizut pasa .xml fitxategia?

@Porrumentzio
Copy link

Porrumentzio commented Sep 1, 2020

Hau izan da nere kasura moldatuta ta inglesezko iruzkinekin, beste batzukin lanian ai naiz ta (AnySoftKeyboarderako da):

from lxml import etree
import re

# we will unify all the letters into a REGEX
letter_allowed = ("A", "B", "D", "C", "Ĉ", "E", "F", "G", "Ĝ", "H", "Ĥ", "I", "J", "Ĵ", "K", "L", "N", "M", "O", "P", "R", "S", "Ŝ", "T", "U", "Ŭ", "V", "Z", "a", "b", "d", "c", "ĉ", "e", "f", "g", "ĝ", "h", "ĥ", "i", "j", "ĵ", "k", "l", "n", "m", "o", "p", "r", "s", "ŝ", "t", "u", "ŭ", "v", "z", "'", "-")
letter_allowed = ''.join(letter_allowed)
letter_allowed = "[^"+letter_allowed+"]+" # REGEX to search any other letter

file = open("words_merged.xml", "r")
dom = etree.parse(file)
wl = dom.xpath('//wordlist/w')

for w in wl:
	r = re.search(letter_allowed, w.text) # search letters
	if r != None: # if it finds any other letter, remove the it
		w.getparent().remove(w) # remove the node

dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml

# Rename clean_words_merged.xml into words_merged.xml and use it

Zuk in dezula aipatzea naezu, hasieran iruzkin batekin edo?

@Porrumentzio
Copy link

from lxml import etree
import re

file = open("words_merged.xml", "r")
dom = etree.parse(file)
wl = dom.xpath('//wordlist/w')

def removeForeignWords():
	# we will unify all the letters into a REGEX
	letter_allowed = ("A", "B", "D", "C", "Ĉ", "E", "F", "G", "Ĝ", "H", "Ĥ", "I", "J", "Ĵ", "K", "L", "N", "M", "O", "P", "R", "S", "Ŝ", "T", "U", "Ŭ", "V", "Z", "a", "b", "d", "c", "ĉ", "e", "f", "g", "ĝ", "h", "ĥ", "i", "j", "ĵ", "k", "l", "n", "m", "o", "p", "r", "s", "ŝ", "t", "u", "ŭ", "v", "z", "'", "-")
	letter_allowed = ''.join(letter_allowed)
	letter_allowed = "[^"+letter_allowed+"]+" # REGEX to search any other letter

	for w in wl:
		r = re.search(letter_allowed, w.text) # search letters
		if r != None: # if it finds any other letter, remove the it
			w.getparent().remove(w) # remove the node

def removeMonoCharacters():
	for w in wl:
		# Remove all monocharacter entries. USE WITH CARE!! Some languages DO HAVE monocharacter words
		if len(w.text)==1:
			w.getparent().remove(w)

def removeWordsNoVowel():

	vowels_allowed = ("A", "E", "I", "O", "U", "a", "e", "i", "o", "u")
	vowels_allowed = ''.join(vowels_allowed)
	vowels_allowed = "["+vowels_allowed+"]+"

	for w in wl:
		r = re.search(vowels_allowed, w.text) # search letters
		if r == None:
			w.getparent().remove(w)

removeForeignWords()
removeMonoCharacters()
removeWordsNoVowel()

dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml

# Rename clean_words_merged.xml into words_merged.xml and use it

Hau da garatueten scripta. Halare errore bat emateit removeWordsNoVowel() funtzioko azkeneko lerroan, w.getparent().remove(w):

[porru@matejaro script_clean_words]$ /usr/sbin/python /home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py
Traceback (most recent call last):
  File "/home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py", line 45, in <module>
    removeWordsNoVowel()
  File "/home/porru/Sorkuntza/GNU/ASK/script_clean_words/clean_words_merged.py", line 41, in removeWordsNoVowel
    w.getparent().remove(w)
AttributeError: 'NoneType' object has no attribute 'remove'

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

funtzioei ez diezu "wl" aldagaia pasatzen:

def removeMonoCharacters(ws):
    #...
    return ws

def removeMonoCharacters(ws):
    #...
    return ws

def removeWordsNoVowel(ws):
    #...
    return ws

#...

ws = removeForeignWords(ws)
ws = removeMonoCharacters(ws)
ws = removeWordsNoVowel(ws)

@Porrumentzio
Copy link

return ws/wl hoi funtzioen azken lerrotzat? Zer da bueltatu behar duna? Ite duna kasu hontan lerroak ezabatu dalako

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

barkatu "wl" da eta ez "ws" funtzioen azken lerroan jarri behar duzu return-a, funtzio barruan dagoena enkapsulatuta dagoelako, eta kanpoko ezerekin ez dutelako zerikusirik, beraz aldagaia parametro moduan pasatu behar diozu eta aldaketetik bueltatu, programan dagoen "wl" originala aldatzeko

@Porrumentzio
Copy link

Hola guztitan, ezta?

def removeMonoCharacters(wl):
	for w in wl:
		# Remove all monocharacter entries. USE WITH CARE!! Some languages DO HAVE monocharacter words
		if len(w.text)==1:
			w.getparent().remove(w)
	return(wl)

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

bai, eta gero:

wl = removeMonoCharacters(wl)

@Porrumentzio
Copy link

Porrumentzio commented Sep 2, 2020

Bukaerako bi puntu hoiek gabe esango nuke.

zeba berdindu behar da wl parametrodun funtzioai deia wlkin?

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

funtzioan aldatzen den WL aldagaia, funtzio barrukoa delako, ez kanpokoa, funtzio barruan gertatzen dena bertan geratzen delako, ez duelako kanpoko aldagaiekin zerikusirik, goian jarri dizut

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D

@Porrumentzio
Copy link

Porrumentzio commented Sep 2, 2020

Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.

#!/usr/bin/python3
# -*- coding: utf-8 -*-

# ', " edo - hizkiakin hasi ta bukatze dianei hoiek kendu: 'hitza' edo "hitza" -hitza-  --> hitza
# ' hizkiakin haste dianei hoiek kendu: 'hitza  --> hitza

from lxml import etree
import re

file = open("words_merged.xml", "r")
dom = etree.parse(file)
wl = dom.xpath('//wordlist/w')

# REGEX
letter_allowed = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"
letter_allowed = "[^"+re.escape(letter_allowed)+"]+"
vowels_allowed = "[AEIOUaeiou]+"
biCharacter_allowed = ['aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu', 'da', 'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha', 'hm', 'ho', 'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni', 'nu', 'oj', 'ok', 'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf', 'uk' 'ŭa']

for w in wl:
    conditions = (
        re.search(letter_allowed, w.text) != None or                    # letter not allowed
        len(w.text)==1 or                                               # lengh = 1
        re.search(vowels_allowed, w.text) == None or                    # there is no vowel
        (len(w.text)==2 and re.search("['\-]+", w.text)) or             # length = 2 and has ' or -
        (len(w.text)==2 and (w.text).lower not in biCharacter_allowed)  # length = 2 and not allowed bi-character
        )
    if conditions:
        w.getparent().remove(w) # remove the node

dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml

# Rename clean_words_merged.xml into words_merged.xml and use it

@Porrumentzio
Copy link

Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"

@ZiTAL
Copy link
Author

ZiTAL commented Sep 15, 2020

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
from lxml import etree
import re

file = open("words_merged.xml", "r")
dom = etree.parse(file)

# hasierako eta bukaerako # eta ' kendu

letters_to_remove = "'#"

wl = dom.xpath('//wordlist/w')
for w in wl:
	conditions = (
		re.search("^["+letters_to_remove+"]{1}", w.text) != None and re.search("["+letters_to_remove+"]{1}$", w.text) != None
		)
	if conditions:
		w.text = re.sub("^['#]{1}", '', w.text)
		w.text = re.sub("['#]{1}$", '', w.text)

# errepikatuak kendu

all = {};
repeated = [];
wl = dom.xpath('//wordlist/w')
for w in wl:
	text = w.text
	if text in all:
		all[text] = all[text] + 1
		if text not in repeated:
			repeated.append(text)
	else:
		all[text] = 1

for repeat in repeated:
	while True:
		expr = '//wordlist/w[.= $var]'
		result = dom.xpath(expr, var = repeat)
		result_len = len(result)
		if result_len > 1:
			remove_index = result_len - 1
			remove_node = result[remove_index]
			remove_node.getparent().remove(remove_node)
		else:
			break

dom.write('clean_words_merged.xml', pretty_print=True, xml_declaration=True, encoding="utf-8") # save the output into clean_words_merged.xml

# Rename clean_words_merged.xml into words_merged.xml and use it

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment