Scrapes a downloaded wiktionary dump
# -*- coding: utf-8 -*-
"""
Copyright (c) 2012 Jacob Silterra
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
"""
Read through dump of wiktionary database
Scrape words and their pronunciations
"""
import os
import sys
import bz2
import re
#etree is only used by _get_xml_data; lxml.etree would work here too
from xml.etree import ElementTree as etree
#Different phonetic alphabets
phon_alphs = ["enPR", "IPA", "X-SAMPA"]
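#For reference, a typical Wiktionary pronunciation line mixes dialect and
#alphabet templates roughly like this (illustrative, not taken from the dump):
#  * {{a|US}} {{IPA|/rɪˈsiːv/|lang=en}}, {{X-SAMPA|/rI"si:v/}}
#split_pron_text below pulls out the text between "ALPHABET|" and "}}"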
def open_file(file_path):
    if file_path.endswith("bz2"):
        f = bz2.BZ2File(file_path, 'r')
    else:
        f = open(file_path, 'r')
    return f
def _get_xml_data(file_path):
    #Unused: parsing the full tree runs out of memory on a complete dump,
    #which is why create_full_word_list scans line by line instead
    f = open_file(file_path)
    data = etree.parse(f)
    f.close()
    return data
def format_cur_word(cur_word):
    pronstr = cur_word['pronunciation']
    if pronstr is None:
        pronstr = "xxNonexx"
    pronstr = pronstr.replace("\n", "")
    outstr = cur_word['title'] + "\t" + pronstr
    return outstr
def create_full_word_list(file_path, keep_filter=lambda x: True):
    f = open_file(file_path)
    max_words = -1
    #We don't parse the full xml tree, since python seems to run
    #out of memory when we do that. Which means we're going to have
    #some ugly-looking in-page state tracking
    inpage = False
    ineng = False
    inpron = False
    numlines = 0
    cur_word = {}
    all_words = []
    for line in f:
        numlines += 1
        if max_words > 0 and len(all_words) >= max_words:
            break
        if "<page>" in line:
            inpage = True
            cur_word = {}
        if "</page>" in line:
            assert inpage
            has_spaces = " " in cur_word['title']
            if ineng and not has_spaces and keep_filter(cur_word['title']):
                all_words.append(cur_word)
            inpage = False
            ineng = False
            inpron = False
        if inpage:
            #The most obvious place to get the word
            #is actually the title of the page. Will get
            #some false positives this way
            if "<title>" in line:
                title = line.replace("<title>", "")
                title = title.replace("</title>", "")
                cur_word['title'] = title.strip().replace("\n", "")
                for phon_alph in phon_alphs:
                    cur_word[phon_alph] = None
            if "==English==" in line:
                #We are only taking english words
                ineng = True
                pronstr = None
            elif ineng and "==Pronunciation==" in line:
                inpron = True
                pronstr = ""
            elif ineng and inpron and "==" in line:
                #Any subsequent heading closes the pronunciation block
                pron_props = split_pron_text(pronstr.strip())
                cur_word.update(pron_props)
                pronstr = None
                inpron = False
            elif inpron:
                pronstr += line + "\n"
    return all_words
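#Shape of the dump pages the scanner above keys on (abridged, illustrative):
#  <page>
#    <title>receive</title>
#    ...==English==...
#    ...===Pronunciation===
#    * {{a|US}} {{IPA|/rɪˈsiːv/|lang=en}}
#    ...the next ==heading== closes the pronunciation block...
#  </page>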
def get_out_keys():
    keys = ['word', 'ie', 'ei', 'after_c']
    keys.extend(phon_alphs)
    return keys
def format_wordprops(props):
    keys = get_out_keys()
    outstr = props[keys[0]]
    for key in keys[1:]:
        outstr += "\t" + str(props[key])
    return outstr
def split_pron_text(pron_string):
    props = {alph: None for alph in phon_alphs}
    if "This entry needs pronunciation information" in pron_string:
        return props
    #Pronunciation entries are bulleted, so splitting on "*"
    #gives one token per entry line
    pron_toks = pron_string.split("*")
    for tok in pron_toks:
        #Only use US dialect if specified
        if "{{a|US" not in tok and "{{a|" in tok:
            continue
        tok = tok.replace("{{a|US}}", "")
        tok = tok.replace("\n", "")
        #Different alphabets are comma separated
        phon_toks = tok.split(",")
        for phon_tok in phon_toks:
            for phon_alph in phon_alphs:
                phon_loc = phon_tok.find(phon_alph)
                if phon_loc >= 0:
                    #Skip past "ALPHABET|" and read up to the closing "}}"
                    strt_loc = phon_loc + len(phon_alph) + 1
                    end_loc = phon_tok.find("}}", strt_loc)
                    pron = phon_tok[strt_loc:end_loc]
                    pron = re.sub("lang=.{0,2}", "", pron)
                    pron = pron.strip("|")
                    pron = pron.strip("/")
                    props[phon_alph] = pron
    return props
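#Illustrative round trip (hypothetical input in the dump's style):
#  split_pron_text('* {{a|US}} {{IPA|/rɪˈsiːv/|lang=en}}, {{X-SAMPA|/rI"si:v/}}')
#  -> {'enPR': None, 'IPA': 'rɪˈsiːv', 'X-SAMPA': 'rI"si:v'}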
def parse_word_list_file(word_list_path):
    #Parse strings back to python data types
    fi = open(word_list_path, 'r')
    keys = fi.readline().split("\t")
    keys = [key.strip() for key in keys]
    word_list = []
    for line in fi:
        word_props = {}
        ind = 0
        for tok in line.split("\t"):
            val = tok.strip()
            if val == 'None':
                val = None
            elif val == 'True':
                val = True
            elif val == 'False':
                val = False
            if keys[ind] == "word":
                val = val.lower()
            word_props[keys[ind]] = val
            ind += 1
        word_list.append(word_props)
    fi.close()
    return word_list
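#The word list file is tab separated: one header row from get_out_keys(),
#then one row per word. An illustrative row (values hypothetical):
#  word      ie     ei    after_c  enPR  IPA      X-SAMPA
#  receive   False  True  True     None  rɪˈsiːv  rI"si:v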
def create_word_list(db_file_path, word_list_path):
    #Parse the wiki db, extracting english words and pronunciations only
    def keep_func(word):
        temp = word.lower()
        return 'ie' in temp or 'ei' in temp
    #Generate the word list file, since it doesn't exist yet
    word_list = create_full_word_list(db_file_path, keep_func)
    fi = open(word_list_path, 'w')
    header_line = "\t".join(get_out_keys())
    fi.write(header_line + "\n")
    for word_props in word_list:
        word = word_props['title'].lower()
        word_props['word'] = word
        word_props['ie'] = 'ie' in word
        word_props['ei'] = 'ei' in word
        word_props['after_c'] = 'cie' in word or 'cei' in word
        out_line = format_wordprops(word_props)
        fi.write(out_line + "\n")
    fi.close()
    return word_list
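#Typical flow (illustrative, mirroring __main__ below): the first run scans
#the dump and caches the filtered list; later runs reload the cache.
#  create_word_list(db_file_path, 'ie_ei_only.txt')    #slow, reads the dump
#  word_list = parse_word_list_file('ie_ei_only.txt')  #fast, reads the cache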
if __name__ == "__main__":
db_file_path = "enwiktionary-latest-pages-articles.xml.bz2.xml.bz2"
filtered_word_list_path = 'ie_ei_only.txt'
try:
word_list = parse_word_list_file(filtered_word_list_path)
except IOError:
print "Print word list %s not found, generating" % (filtered_word_list_path)
word_list = create_word_list(db_file_path, filtered_word_list_path)
#Compile some statistics on words, mostly checking
#to see how much pronunciation info we have
num_words = len(word_list)
have_any = 0
pron_counts = {x:0 for x in phon_alphs}
trouble = 0
for word in word_list:
have_pron = False
for alph in phon_alphs:
if alph in word and word[alph] is not None:
pron_counts[alph] += 1
have_pron = True
if have_pron:
have_any += 1
if word['ie'] and word['ei']:
trouble += 1
def unambiguous_ipa(word):
good = word['IPA'] is not None
good = good and (word['ei'] is not word['ie'])
return good
have_ipa = filter(unambiguous_ipa, word_list)
#print "%d words, %d have some pronunciation info" % (num_words, have_any)
#print pron_counts
#Note: As of Aug. 18, results:
#18967 words, 761 have some pronunciation info
#{'IPA': 753, 'X-SAMPA': 182, 'enPR': 65}
#Clearly IPA is the most popular
summary = []
#We think this is the right sound, although there
#may be others which are close
long_a_sound = u"eɪ"
    for word in have_ipa:
        reason = None
        cur_sum = {'word': word['word'].lower()}
        #If the phoneme is not in the string at all, we
        #know that it can't be a long a sound
        ipa_str = unicode(word['IPA'], 'utf-8')
        if word['ei'] and word['after_c']:
            follows = True
            reason = "ei after c"
        if word['ie'] and not word['after_c'] and long_a_sound not in ipa_str:
            follows = True
            reason = "ie, not after c, no weigh sound"
        if word['ie'] and word['after_c']:
            follows = False
            reason = "ie after c"
        if reason is None:
            #Okay, easy cases done. At this point we need to check
            #whether the 'ei' or 'ie' has a long a sound. If the word
            #has 'ei' and that syllable has a long a, or if the word
            #has 'ie' and does NOT have that sound, it follows the rule;
            #otherwise it breaks it
            if word['ei']:
                sp = 'ei'
                reason = 'ei'
            elif word['ie']:
                sp = 'ie'
                reason = 'ie'
            sp_loc = word['word'].find(sp)
            #Mapping characters onto pronunciation is tricky
            #We take the ratio of the length of the IPA string
            #to the written spelling to get an idea of how many pronunciation
            #characters are used per written character
            pron_chars_per_glyph = 1.0 * len(ipa_str) / len(word['word'])
            pron_loc = int(sp_loc * pron_chars_per_glyph)
            start = max(0, pron_loc - 1)
            end = min(start + len(long_a_sound) + 3, len(ipa_str))
            sub_pron_str = ipa_str[start:end]
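            #Worked example (hypothetical): for "receive" (7 letters) with
            #ipa_str u"rɪˈsiːv" (7 chars), pron_chars_per_glyph = 1.0;
            #'ei' sits at sp_loc 3, so pron_loc = 3 and we look for the
            #long a in ipa_str[2:7]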
            if long_a_sound in sub_pron_str:
                follows = word['ei']
                reason += " with weigh"
            else:
                follows = word['ie']
                reason += " with no weigh"
        cur_sum['follows'] = follows
        cur_sum['reason'] = reason
        summary.append(cur_sum)
#print "%d follow the rule up to 'after c', out of %d" % (tot_ie + tot_cei,
# len(have_ipa) )
#print "%d have a long a sound" % (len(have_a))
#Check integrity
follow_rule = filter(lambda x: x['follows'], summary)
break_rule = filter(lambda x: not x['follows'], summary)
total_checked = len(have_ipa)
num_follow = len(follow_rule)
num_break = len(break_rule)
print "Total: %d Follow rule: %d Break rule: %d Error: %d" % (total_checked,
num_follow, num_break, total_checked - num_follow - num_break)
for follow in follow_rule:
assert follow not in break_rule
keys = ['word','follows','reason']
summary = sorted(summary, key=lambda x: x['word'])
for el in summary:
assert 'ie' in el['word'] or 'ei' in el['word']
line = '\t'.join(["%s" % el[key] for key in keys])
print line
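    #Each printed summary line is word<TAB>follows<TAB>reason, e.g.
    #(values illustrative): receive<TAB>True<TAB>ei after c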