Skip to content

Instantly share code, notes, and snippets.

@ZiTAL
Last active September 15, 2020 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
Save ZiTAL/94696f22ee6fca6e20e7fc367af38d4e to your computer and use it in GitHub Desktop.
python3: read xml
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import re
INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = "clean_words_merged.xml"

# The only characters a word may contain.
LETTERS_ALLOWED = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Compiled once; every pattern is applied to each <w> element.
_FORBIDDEN_CHAR = re.compile("[^" + re.escape(LETTERS_ALLOWED) + "]")
_VOWEL = re.compile("[AEIOUaeiou]")
_APOSTROPHE_OR_HYPHEN = re.compile(r"['\-]")


def should_remove(text):
    """Return True when a word must be dropped from the word list.

    A word is dropped when any of these holds:

    * it is missing or empty (an element without text),
    * it is a single character,
    * it contains a character outside LETTERS_ALLOWED,
    * it contains no vowel,
    * it is two characters long and one of them is ' or -.
    """
    if not text:
        # An empty <w/> element has text None; there is nothing to keep.
        return True
    if len(text) == 1:
        return True
    if _FORBIDDEN_CHAR.search(text) is not None:
        return True
    if _VOWEL.search(text) is None:
        return True
    if len(text) == 2 and _APOSTROPHE_OR_HYPHEN.search(text) is not None:
        return True
    return False


def main():
    """Filter words_merged.xml and write clean_words_merged.xml."""
    # Passing the path lets lxml open and close the file itself.
    dom = etree.parse(INPUT_PATH)

    # xpath() returns a plain list, so removing nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()
# Rename clean_words_merged.xml into words_merged.xml and use it
<?xml version='1.0' encoding='UTF-8'?>
<ws>
<w f="1">ks</w>
<w f="65">la</w>
</ws>
<?xml version="1.0" encoding="UTF-8"?>
<ws>
<w f="1">Veuchey</w>
<w f="1">ks</w>
<w f="65">la</w>
<w f="65">ñ</w>
</ws>
@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

kodea sinplifikatu dizut eta badaezpada "-" eskapatu "-" regex-ak egiteko letra berezia delako

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

beno dena jarri dizut if batean or-ekin oraindik kodea laburragoa da :D

@Porrumentzio
Copy link

Porrumentzio commented Sep 2, 2020

Hasieran in nahi dituten ta nola ez dakiten bi baldintza daude.
Azpian ondo dabilen beste baldintza bat gehituet. Optimizatu daiteke baño nahiago det lenaoko funtzioaz aparte jarri, errexo kendu ahal izateko.

#!/usr/bin/python3
# -*- coding: utf-8 -*-

# TODO: for words that start and end with ', " or -, strip those characters: 'hitza' or "hitza" or -hitza-  --> hitza
# TODO: for words that start with ', strip it: 'hitza  --> hitza

from lxml import etree
import re

INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = "clean_words_merged.xml"

# The only characters a word may contain.
LETTERS_ALLOWED = "ABDCĈEFGĜHĤIJĴKLNMOPRSŜTUŬVZabdcĉefgĝhĥijĵklnmoprsŝtuŭvz'-"

# Two-character words that are kept (compared in lower case).
# NOTE: 'hm' has no vowel, so the vowel rule below still rejects it.
BI_CHARACTER_ALLOWED = frozenset({
    'aĉ', 'aĥ', 'aj', 'al', 'aŭ', 'ba', 'be', 'ci', 'ĉe', 'ĉi', 'ĉu',
    'da', 'de', 'do', 'du', 'eĉ', 'el', 'en', 'fi', 'fu', 'ĝi', 'ha',
    'hm', 'ho', 'hu', 'je', 'ke', 'la', 'li', 'lo', 'mi', 'ne', 'ni',
    'nu', 'oj', 'ok', 'ol', 'ri', 'se', 'si', 'ŝi', 'vi', 'uf', 'uk',
    'ŭa',
})

# Compiled once; every pattern is applied to each <w> element.
_FORBIDDEN_CHAR = re.compile("[^" + re.escape(LETTERS_ALLOWED) + "]")
_VOWEL = re.compile("[AEIOUaeiou]")
_APOSTROPHE_OR_HYPHEN = re.compile(r"['\-]")


def should_remove(text):
    """Return True when a word must be dropped from the word list.

    A word is dropped when any of these holds:

    * it is missing or empty (an element without text),
    * it is a single character,
    * it contains a character outside LETTERS_ALLOWED,
    * it contains no vowel,
    * it is two characters long and one of them is ' or -,
    * it is two characters long and not listed in BI_CHARACTER_ALLOWED.
    """
    if not text:
        # An empty <w/> element has text None; there is nothing to keep.
        return True
    if len(text) == 1:
        return True
    if _FORBIDDEN_CHAR.search(text) is not None:
        return True
    if _VOWEL.search(text) is None:
        return True
    if len(text) == 2 and _APOSTROPHE_OR_HYPHEN.search(text) is not None:
        return True
    # Kept as a separate rule so it can be dropped without touching the others.
    # lower() has to be called: the bare attribute is a method object and
    # would never be found in the whitelist.
    if len(text) == 2 and text.lower() not in BI_CHARACTER_ALLOWED:
        return True
    return False


def main():
    """Filter words_merged.xml and write clean_words_merged.xml."""
    # Passing the path lets lxml open and close the file itself.
    dom = etree.parse(INPUT_PATH)

    # xpath() returns a plain list, so removing nodes while looping is safe.
    for w in dom.xpath('//wordlist/w'):
        if should_remove(w.text):
            w.getparent().remove(w)  # remove the node

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()

# Rename clean_words_merged.xml into words_merged.xml and use it

@Porrumentzio
Copy link

Gehitueten azkenak ez dit ondo funtzionatu, zerrendan dauden hitzak ere ezabatu dizkit ta

@ZiTAL
Copy link
Author

ZiTAL commented Sep 2, 2020

arazoa "lower" ez dela propietatea, metodoa baizik, beraz erabili "lower()"

@ZiTAL
Copy link
Author

ZiTAL commented Sep 15, 2020

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
from lxml import etree
import re

INPUT_PATH = "words_merged.xml"
OUTPUT_PATH = "clean_words_merged.xml"

# Marks stripped from the start and end of a word.
LETTERS_TO_REMOVE = "'#"

# Both patterns are built from LETTERS_TO_REMOVE so the test and the
# substitution can never disagree about which marks count.
_LEADING_MARK = re.compile("^[" + re.escape(LETTERS_TO_REMOVE) + "]")
_TRAILING_MARK = re.compile("[" + re.escape(LETTERS_TO_REMOVE) + "]$")


def strip_wrapping_marks(text):
    """Return text without one leading and one trailing mark.

    The marks are the characters in LETTERS_TO_REMOVE. The word is changed
    only when it both starts and ends with a mark; a word marked on just one
    side is returned untouched. At most one character is removed per side.
    None (an element without text) and the empty string are returned as-is.
    """
    if not text:
        return text
    if _LEADING_MARK.search(text) is None or _TRAILING_MARK.search(text) is None:
        return text
    text = _LEADING_MARK.sub('', text, count=1)
    return _TRAILING_MARK.sub('', text, count=1)


def main():
    """Clean words_merged.xml and write clean_words_merged.xml."""
    # Passing the path lets lxml open and close the file itself.
    dom = etree.parse(INPUT_PATH)
    words = dom.xpath('//wordlist/w')

    # Remove the leading and trailing # and ' marks.
    for w in words:
        stripped = strip_wrapping_marks(w.text)
        if stripped != w.text:
            w.text = stripped

    # Remove repeated words: the first occurrence stays, later ones go.
    # `words` is a plain list, so removing nodes while looping is safe.
    seen = set()
    for w in words:
        if w.text in seen:
            w.getparent().remove(w)
        else:
            seen.add(w.text)

    # save the output into clean_words_merged.xml
    dom.write(OUTPUT_PATH, pretty_print=True, xml_declaration=True, encoding="utf-8")


if __name__ == "__main__":
    main()

# Rename clean_words_merged.xml into words_merged.xml and use it

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment