Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert a WoNeF database file into a Solr-formatted synonyms file
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import re
import unicodedata
# This method adds all given synonyms into the correct dictionary entry.
def extendDictEntry(dict, key, xmlSynonyms):
    """Append the text of each synonym element to ``dict[key]``, skipping duplicates.

    Args:
        dict: Mapping from a literal to its list of synonyms; it is mutated
            in place. (The name shadows the builtin; it is kept so existing
            callers keep working.)
        key: Entry of ``dict`` to extend. It must already be present.
        xmlSynonyms: Iterable of XML elements whose ``.text`` holds one
            synonym each.

    Returns:
        The same mapping object that was passed in.

    Raises:
        KeyError: If ``key`` is missing from ``dict`` and at least one
            element carries text.
    """
    for child in xmlSynonyms:
        text = child.text
        if text is None:
            # An element without text has no synonym to contribute, and
            # calling .encode() on None would abort the whole conversion.
            continue
        # Entries are stored as UTF-8 encoded byte strings.
        childText = text.encode('utf-8')
        entry = dict[key]
        if childText not in entry:
            entry.append(childText)
    return dict
# This method builds the synonyms dictionary from the WoNeF file.
def buildSynonymDictionary(path='wonef-fscore-0.1.xml'):
    """Build a literal -> synonyms mapping from a WoNeF XML file.

    Every child of the XML root is walked, and each of its direct children
    tagged ``SYNONYM`` is treated as one group of literals. Each literal of
    a group becomes a key; its value is a list that starts with the key
    itself and then collects, without duplicates, the literals of every
    group the key appears in. Keys and values are UTF-8 encoded byte strings.

    Args:
        path: XML source handed to ``ElementTree.parse``. Defaults to the
            file name this script has always read.

    Returns:
        dict mapping each literal to its list of synonyms.
    """
    root = ET.parse(path).getroot()
    synonyms = {}
    for synset in root:
        for group in synset:
            if group.tag != "SYNONYM":
                continue
            # Elements without text cannot be encoded; drop them once here
            # so they are neither used as a key nor passed to the helper.
            literals = [literal for literal in group if literal.text is not None]
            for literal in literals:
                currLiteralText = literal.text.encode('utf-8')
                # A new entry always starts with the literal itself.
                synonyms.setdefault(currLiteralText, [currLiteralText])
                # Add the texts of the whole group to that entry.
                extendDictEntry(synonyms, currLiteralText, literals)
    return synonyms
def removeAccents(str):
    """Strip combining accent marks and return the result as UTF-8 bytes.

    The input is decomposed with Unicode NFD normalization and every
    character in category ``Mn`` (nonspacing mark) is dropped, so that for
    example ``e`` followed by a combining acute accent becomes plain ``e``.

    Args:
        str: UTF-8 encoded byte string, or an already decoded text string.
            (The name shadows the builtin; it is kept so existing callers
            keep working.)

    Returns:
        The accent-free text, encoded as UTF-8 bytes.
    """
    # Only byte strings need decoding. Calling .decode() on text either
    # fails outright or triggers an implicit ASCII encode first.
    text = str.decode('utf-8') if isinstance(str, bytes) else str
    stripped = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    return stripped.encode('utf-8')
# This method writes the synonym file in the Solr format
def writeSolrSynonymFile():
    """Write the synonym dictionary to ``solr_synonym.txt`` in Solr format.

    The file starts with a comment header, followed by one line per
    dictionary key of the form ``key => synonym1, synonym2, ...`` with
    accents removed from both sides. An entry that raises
    ``UnicodeEncodeError`` is reported on stdout and skipped.
    """
    synonyms = buildSynonymDictionary()
    # Keys and values are UTF-8 byte strings, so the file is written in
    # binary mode and every literal below is a byte string. Mixing text and
    # bytes would raise TypeError on Python 3.
    # The context manager closes the file even if an entry fails unexpectedly.
    with open("solr_synonym.txt", "wb") as out:
        out.write(b"# Solr Synonmys File \n\n")
        for key in synonyms:
            values = synonyms[key]
            try:
                # The whole line is built before write() is called, so a
                # failing entry never leaves a partial line in the file.
                line = (
                    removeAccents(key)
                    + b" => "
                    + removeAccents(b", ".join(values))
                    + b"\n"
                )
                out.write(line)
            except UnicodeEncodeError:
                # %r keeps the report printable whatever bytes the entry holds.
                print("UnicodeEncodeError: %r - %r" % (key, values))
# Script entry point: the conversion runs whenever this module is executed
# or imported (there is no `if __name__ == "__main__":` guard).
writeSolrSynonymFile()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.