Skip to content

Instantly share code, notes, and snippets.

@ZiTAL
Last active September 15, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import re
# Characters a word may contain; any other character disqualifies the entry.
LETTERS_ALLOWED = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Patterns are compiled once instead of being rebuilt for every word.
FORBIDDEN_LETTER_RE = re.compile("[^" + re.escape(LETTERS_ALLOWED) + "]")
VOWEL_RE = re.compile("[AEIOUaeiou]")
PUNCTUATION_RE = re.compile(r"['\-]")


def should_remove(text):
    """Return True when a word-list entry has to be dropped.

    An entry is dropped when any of the following holds:
      * it is empty (``None`` or ``""``),
      * it contains a character outside ``LETTERS_ALLOWED``,
      * it is a single character,
      * it contains no vowel,
      * it is two characters long and contains ``'`` or ``-``.
    """
    # lxml reports an empty element as text=None; such an entry has no vowel,
    # so it is dropped instead of crashing the regex calls below.
    if not text:
        return True
    return (
        FORBIDDEN_LETTER_RE.search(text) is not None  # letter not allowed
        or len(text) == 1                              # length = 1
        or VOWEL_RE.search(text) is None               # there is no vowel
        # length = 2 and has ' or -
        or (len(text) == 2 and PUNCTUATION_RE.search(text) is not None)
    )


def main(source="words_merged.xml", target="clean_words_merged.xml"):
    """Read ``source``, drop every unwanted <w> node and write ``target``."""
    # Passing the path lets lxml open, decode and close the file itself, so
    # no handle leaks and the XML declaration decides the encoding.
    dom = etree.parse(source)
    # xpath() returns a plain list, so detaching nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node
    # save the output into clean_words_merged.xml
    dom.write(target, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
@Porrumentzio
Copy link

return ws/wl hoi funtzioen azken lerrotzat? Zer da bueltatu behar duna? Ite duna kasu hontan lerroak ezabatu dalako

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

barkatu, "wl" da eta ez "ws". Funtzioen azken lerroan jarri behar duzu return-a, funtzio barruan dagoena enkapsulatuta dagoelako eta kanpoko ezerekin ez dutelako zerikusirik; beraz, aldagaia parametro moduan pasatu behar diozu eta aldaketetik bueltatu, programan dagoen "wl" originala aldatzeko.

@Porrumentzio
Copy link

Hola guztitan, ezta?

def removeMonoCharacters(wl):
    """Detach every single-character entry and return the entries that remain.

    USE WITH CARE: some languages DO HAVE single-character words.

    Args:
        wl: iterable of element-like nodes exposing ``.text`` and
            ``.getparent()`` (for example the list returned by lxml's
            ``xpath()``).

    Returns:
        A new list with the nodes that were kept, in their original order.
        Removed nodes are left out so that a following filter never touches a
        node that no longer has a parent.
    """
    kept = []
    # Iterate over a snapshot so removals cannot disturb the iteration when
    # the caller passes a live container.
    for w in list(wl):
        text = w.text
        # An empty element has text=None: it is not a one-character word.
        if text is not None and len(text) == 1:
            parent = w.getparent()
            if parent is not None:  # already-detached nodes have no parent
                parent.remove(w)
        else:
            kept.append(w)
    return kept

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

bai, eta gero:

wl = removeMonoCharacters(wl)

@Porrumentzio
Copy link

Porrumentzio commented Sep 2, 2020

Bukaerako bi puntu hoiek gabe esango nuke.

zeba berdindu behar da wl parametrodun funtzioai deia wlkin?

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

funtzioan aldatzen den WL aldagaia, funtzio barrukoa delako, ez kanpokoa, funtzio barruan gertatzen dena bertan geratzen delako, ez duelako kanpoko aldagaiekin zerikusirik, goian jarri dizut

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D

@Porrumentzio
Copy link

Porrumentzio commented Sep 2, 2020

Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.

#!/usr/bin/python3
# -*- coding: utf-8 -*-

# TODO: for words that start and end with ', " or -, strip those characters: 'hitza' or "hitza" -hitza-  --> hitza
# TODO: for words that start with ', strip it: 'hitza  --> hitza

from lxml import etree
import re

# Characters a word may contain; any other character disqualifies the entry.
LETTERS_ALLOWED = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Patterns are compiled once instead of being rebuilt for every word.
FORBIDDEN_LETTER_RE = re.compile("[^" + re.escape(LETTERS_ALLOWED) + "]")
VOWEL_RE = re.compile("[AEIOUaeiou]")
PUNCTUATION_RE = re.compile(r"['\-]")

# The only two-letter words that are kept (compared in lower case).
# NOTE: the original list read `'uk' 'ŭa'` without a comma, which Python
# silently joins into the single string 'ukŭa'.
BI_CHARACTER_ALLOWED = frozenset([
    'aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu', 'da',
    'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha', 'hm', 'ho',
    'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni', 'nu', 'oj', 'ok',
    'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf', 'uk', 'ŭa',
])


def should_remove(text):
    """Return True when a word-list entry has to be dropped.

    An entry is dropped when any of the following holds:
      * it is empty (``None`` or ``""``),
      * it contains a character outside ``LETTERS_ALLOWED``,
      * it is a single character,
      * it is two characters long and contains ``'`` or ``-``,
      * it is two characters long and not in ``BI_CHARACTER_ALLOWED``,
      * it is longer than two characters and contains no vowel.
    """
    # lxml reports an empty element as text=None; drop it instead of crashing.
    if not text:
        return True
    if FORBIDDEN_LETTER_RE.search(text) is not None:  # letter not allowed
        return True
    if len(text) == 1:                                # length = 1
        return True
    if len(text) == 2:
        # length = 2 and has ' or -  (kept as its own rule so the whitelist
        # below can be taken out without losing it)
        if PUNCTUATION_RE.search(text) is not None:
            return True
        # length = 2 and not an allowed bi-character word. lower() must be
        # *called*; comparing the bound method itself never matches. The
        # whitelist is authoritative here, so the vowel-less entry 'hm'
        # survives.
        return text.lower() not in BI_CHARACTER_ALLOWED
    return VOWEL_RE.search(text) is None              # there is no vowel


def main(source="words_merged.xml", target="clean_words_merged.xml"):
    """Read ``source``, drop every unwanted <w> node and write ``target``."""
    # Passing the path lets lxml open, decode and close the file itself, so
    # no handle leaks and the XML declaration decides the encoding.
    dom = etree.parse(source)
    # xpath() returns a plain list, so detaching nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node
    # save the output into clean_words_merged.xml
    dom.write(target, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()

# Rename clean_words_merged.xml into words_merged.xml and use it

@Porrumentzio
Copy link

Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"

@ZiTAL
Copy link
Author

ZiTAL commented Sep 15, 2020

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
from lxml import etree
import re

# Characters stripped from a word when they wrap it on both sides.
WRAPPING_MARKS = "'#"


def strip_wrapping_marks(text, marks=WRAPPING_MARKS):
    """Strip one leading and one trailing mark from a wrapped word.

    The word is changed only when it both starts and ends with a character
    from ``marks`` ('hitza' --> hitza, #hitza# --> hitza); a word marked on
    one side only is returned untouched. Empty or ``None`` text is returned
    as is.
    """
    if not text:
        return text
    if text[0] in marks and text[-1] in marks:
        # A lone mark ("'") collapses to the empty string.
        return text[1:-1]
    return text


def main(source="words_merged.xml", target="clean_words_merged.xml"):
    """Clean the wrapped words of ``source`` and drop repeated entries.

    The first occurrence of every word is kept; later <w> nodes carrying the
    same (already stripped) text are removed. The result is written to
    ``target``.
    """
    # Passing the path lets lxml open, decode and close the file itself.
    dom = etree.parse(source)

    seen = set()
    # xpath() returns a plain list, so detaching nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        # remove the leading and trailing # and '
        cleaned = strip_wrapping_marks(w.text)
        if cleaned != w.text:
            w.text = cleaned

        # remove the repeated ones: one pass with a set instead of one XPath
        # query per removed node
        if cleaned in seen:
            w.getparent().remove(w)
        else:
            seen.add(cleaned)

    # save the output into clean_words_merged.xml
    dom.write(target, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()

# Rename clean_words_merged.xml into words_merged.xml and use it

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment