Created
March 31, 2017 14:39
-
-
Save litil/a0f248dbe8334f1f4874986868fc28f4 to your computer and use it in GitHub Desktop.
Convert a WoNeF database file into a Solr-formatted synonyms file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import re
import unicodedata
import xml.etree.ElementTree as ET
def extendDictEntry(dict, key, xmlSynonyms):
    """Add the text of each element in *xmlSynonyms* to the list at ``dict[key]``.

    Each element's text is stored as a UTF-8 encoded byte string, and a value
    already present in the list is not added again. Elements without text
    (``text is None``, e.g. an empty tag) are skipped instead of raising
    ``AttributeError``.

    Args:
        dict: Mapping from a literal to its list of synonyms; mutated in place.
        key: Entry of *dict* to extend. It must already exist whenever
            *xmlSynonyms* yields at least one element with text.
        xmlSynonyms: Iterable of elements exposing a ``text`` attribute.

    Returns:
        The same *dict* object that was passed in.

    Raises:
        KeyError: If *key* is missing from *dict* and there is text to add.
    """
    for child in xmlSynonyms:
        if child.text is None:
            # Empty element: there is nothing to register as a synonym.
            continue
        childText = child.text.encode('utf-8')
        # Look the entry up lazily so an empty iterable never touches *key*.
        entry = dict[key]
        if childText not in entry:
            entry.append(childText)
    return dict
def buildSynonymDictionary(path='wonef-fscore-0.1.xml'):
    """Build the synonyms dictionary from a WoNeF XML file.

    The root's children are scanned for direct ``SYNONYM`` child elements.
    Every element inside a ``SYNONYM`` becomes a key (its text, UTF-8
    encoded) whose value is a list that starts with the key itself and is
    then extended, via ``extendDictEntry``, with the text of all elements of
    that same ``SYNONYM``. A literal appearing in several ``SYNONYM``
    elements accumulates the members of all of them.

    Args:
        path: XML file to parse. Defaults to the file name the script has
            always read, so existing calls without arguments are unchanged.

    Returns:
        A dict mapping each literal to its list of synonyms.
    """
    root = ET.parse(path).getroot()
    synonyms = {}
    for synset in root:
        # findall() with a bare tag matches direct children only, which is
        # the same scope as comparing each child's tag to "SYNONYM".
        for synonym in synset.findall("SYNONYM"):
            for literal in synonym:
                if literal.text is None:
                    # An element without text cannot serve as a key.
                    continue
                currLiteralText = literal.text.encode('utf-8')
                # Create the entry on first sight, then merge in every
                # literal of this SYNONYM element.
                synonyms.setdefault(currLiteralText, [currLiteralText])
                extendDictEntry(synonyms, currLiteralText, synonym)
    return synonyms
def removeAccents(str):
    """Return *str* with combining accent marks removed, as UTF-8 bytes.

    The input is decomposed with Unicode NFD normalization and every
    character whose category is ``Mn`` (nonspacing mark) is dropped, so
    an accented letter is reduced to its base letter.

    Args:
        str: UTF-8 encoded byte string, or a text (unicode) string. Text
            input is accepted so callers no longer have to encode first.

    Returns:
        The accent-free result encoded as UTF-8 bytes, regardless of the
        input type.
    """
    # Only byte strings need decoding; text is normalized as is.
    text = str.decode('utf-8') if isinstance(str, bytes) else str
    decomposed = unicodedata.normalize('NFD', text)
    stripped = ''.join(c for c in decomposed
                       if unicodedata.category(c) != 'Mn')
    return stripped.encode('utf-8')
def writeSolrSynonymFile():
    """Write the synonyms dictionary to ``solr_synonym.txt`` in Solr format.

    The dictionary comes from ``buildSynonymDictionary()``. After a header
    comment, one line is written per key in the form
    ``key => synonym1, synonym2, ...``, with accents stripped from both sides
    by ``removeAccents``.

    Lines are assembled and written as byte strings, because the dictionary
    holds UTF-8 encoded bytes. This avoids mixing bytes with text literals,
    which raises ``TypeError`` on Python 3. An entry that raises
    ``UnicodeEncodeError`` is reported on stdout and skipped.
    """
    synonyms = buildSynonymDictionary()
    # NOTE: binary mode writes "\n" verbatim on every platform (no newline
    # translation); the context manager closes the file even on error.
    with open("solr_synonym.txt", "wb") as out:
        out.write(b"# Solr Synonmys File \n\n")
        for key in synonyms:
            try:
                line = (removeAccents(key) +
                        b" => " +
                        removeAccents(b", ".join(synonyms[key])) +
                        b"\n")
            except UnicodeEncodeError:
                # %r prints bytes and text alike without a second
                # encoding step that could itself fail.
                print("UnicodeEncodeError: %r - %r" % (key, synonyms[key]))
            else:
                out.write(line)
# Run the conversion only when this file is executed as a script, so that
# importing the module for its helper functions does not read the XML
# file or write the synonyms file as a side effect.
if __name__ == "__main__":
    writeSolrSynonymFile()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment