Scrapes a downloaded wiktionary dump
# -*- coding: utf-8 -*-
"""
Copyright (c) 2012 Jacob Silterra

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
""" | |
Read through dump of wiktionary database | |
Scrape words and their pronunciations | |
""" | |
import os
import sys
import bz2
import re
import xml.etree.ElementTree as etree

#Different phonetic alphabets
phon_alphs = ["enPR", "IPA", "X-SAMPA"]
def open_file(file_path):
    if file_path.endswith("bz2"):
        f = bz2.BZ2File(file_path, 'r')
    else:
        f = open(file_path, 'r')
    return f
def _get_xml_data(file_path):
    #Unused helper: parses the entire XML tree at once,
    #which is too memory-hungry for a full dump. Kept for reference.
    f = open_file(file_path)
    data = etree.parse(f)
    f.close()
    return data
def format_cur_word(cur_word):
    #Debug helper, only referenced from a commented-out print below;
    #expects a 'pronunciation' key that the rest of this script does not set
    pronstr = cur_word['pronunciation']
    if pronstr is None:
        pronstr = "xxNonexx"
    pronstr = pronstr.replace("\n", "")
    outstr = cur_word['title'] + "\t" + pronstr
    return outstr
def create_full_word_list(file_path, keep_filter=lambda x: True):
    f = open_file(file_path)
    #max_lines = 50# int(sys.argv[1])
    max_words = -1
    #print "max_lines: %d" % (max_lines)
    #We don't parse the full xml tree; python seems to run
    #out of memory when we do that.
    #Which means we're going to have some ugly in-page state tracking.
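    #A sketch of the simplified, hypothetical page layout the
    #line-scanner below assumes (real dump pages carry many more tags):
    #  <page>
    #    <title>weigh</title>
    #    <text>
    #      ==English==
    #      ===Pronunciation===
    #      * {{a|US}} {{IPA|/weɪ/|lang=en}}
    #    </text>
    #  </page>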
    inpage = False
    ineng = False
    inpron = False
    numlines = 0
    cur_word = {}
    all_words = []
    for line in f:
        numlines += 1
        #if max_lines > 0 and numlines >= max_lines:
        #    break
        if max_words > 0 and len(all_words) >= max_words:
            break
        if "<page>" in line:
            inpage = True
            cur_word = {}
        if "</page>" in line:
            assert inpage
            has_spaces = " " in cur_word['title']
            if ineng and not has_spaces and keep_filter(cur_word['title']):
                all_words.append(cur_word)
                #print format_cur_word(cur_word)
            inpage = False
            ineng = False
            inpron = False
        if inpage:
            #The most obvious place to get the word
            #is actually the title of the page. We will get
            #some false positives this way.
            if "<title>" in line:
                title = line.replace("<title>", "")
                title = title.replace("</title>", "")
                cur_word['title'] = title.strip().replace("\n", "")
                for phon_alph in phon_alphs:
                    cur_word[phon_alph] = None
            if "==English==" in line:
                #We are only taking english words
                ineng = True
                pronstr = None
            elif ineng and "==Pronunciation==" in line:
                inpron = True
                pronstr = ""
            elif ineng and inpron and "==" in line:
                #A new section header ends the pronunciation block
                pron_props = split_pron_text(pronstr.strip())
                cur_word.update(pron_props)
                pronstr = None
                inpron = False
            elif inpron:
                #Lines read from the file already end in "\n"
                pronstr += line
    return all_words
def get_out_keys():
    keys = ['word', 'ie', 'ei', 'after_c']
    keys.extend(phon_alphs)
    return keys
def format_wordprops(props):
    keys = get_out_keys()
    outstr = props[keys[0]]
    for key in keys[1:]:
        outstr += "\t" + str(props[key])
    return outstr
def split_pron_text(pron_string):
    props = {alph: None for alph in phon_alphs}
    if "This entry needs pronunciation information" in pron_string:
        return props
    #Pronunciations are bulleted, so this splits the text
    #into separate lines
    pron_toks = pron_string.split("*")
    for tok in pron_toks:
        #Only use US dialect if a dialect is specified
        if "{{a|US" not in tok and "{{a|" in tok:
            continue
        tok = tok.replace("{{a|US}}", "")
        tok = tok.replace("\n", "")
        #Different alphabets are comma separated
        phon_toks = tok.split(",")
        for phon_tok in phon_toks:
            for phon_alph in phon_alphs:
                phon_loc = phon_tok.find(phon_alph)
                if phon_loc >= 0:
                    #Skip past "ALPHABET|" to the pronunciation itself
                    strt_loc = phon_loc + len(phon_alph) + 1
                    end_loc = phon_tok.find("}}", strt_loc)
                    pron = phon_tok[strt_loc:end_loc]
                    pron = re.sub("lang=.{0,2}", "", pron)
                    pron = pron.strip("|")
                    pron = pron.strip("/")
                    props[phon_alph] = pron
    return props
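#A quick illustration of split_pron_text on a made-up wikitext line
#(hypothetical input; real entries vary):
#  split_pron_text("* {{a|US}} {{IPA|/weɪ/|lang=en}}")
#returns
#  {'enPR': None, 'IPA': 'weɪ', 'X-SAMPA': None}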
def parse_word_list_file(word_list_path):
    #Parse strings back to python data types
    fi = open(word_list_path, 'r')
    keys = fi.readline().split("\t")
    keys = [key.strip() for key in keys]
    word_list = []
    for line in fi:
        word_props = {}
        ind = 0
        for tok in line.split("\t"):
            val = tok.strip()
            #Compare the stripped value, since the last column
            #keeps its trailing newline
            if val == 'None':
                val = None
            elif val == 'True':
                val = True
            elif val == 'False':
                val = False
            if keys[ind] == "word":
                val = val.lower()
            word_props[keys[ind]] = val
            ind += 1
        word_list.append(word_props)
    fi.close()
    return word_list
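#The word list file is a TSV whose header matches get_out_keys();
#a hypothetical row (invented values for illustration) looks like:
#  word    ie      ei      after_c  enPR  IPA   X-SAMPA
#  weigh   False   True    False    None  weɪ   None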
def create_word_list(db_file_path, word_list_path):
    #Parse wiki db, extract english words and pronunciations only
    def keep_func(word):
        temp = word.lower()
        return 'ie' in temp or 'ei' in temp
    #Generate this temporary file, apparently it doesn't exist
    word_list = create_full_word_list(db_file_path, keep_func)
    fi = open(word_list_path, 'w')
    header_line = "\t".join(get_out_keys())
    fi.write(header_line + "\n")
    for word_props in word_list:
        word = word_props['title'].lower()
        word_props['word'] = word
        word_props['ie'] = 'ie' in word
        word_props['ei'] = 'ei' in word
        word_props['after_c'] = 'cie' in word or 'cei' in word
        out_line = format_wordprops(word_props)
        #print out_line
        fi.write(out_line + "\n")
    fi.close()
    return word_list
if __name__ == "__main__":
    db_file_path = "enwiktionary-latest-pages-articles.xml.bz2"
    filtered_word_list_path = 'ie_ei_only.txt'
    try:
        word_list = parse_word_list_file(filtered_word_list_path)
    except IOError:
        print "Word list %s not found, generating" % (filtered_word_list_path)
        word_list = create_word_list(db_file_path, filtered_word_list_path)
    #Compile some statistics on words, mostly checking
    #to see how much pronunciation info we have
    num_words = len(word_list)
    have_any = 0
    pron_counts = {x: 0 for x in phon_alphs}
    trouble = 0
    for word in word_list:
        have_pron = False
        for alph in phon_alphs:
            if alph in word and word[alph] is not None:
                pron_counts[alph] += 1
                have_pron = True
        if have_pron:
            have_any += 1
        if word['ie'] and word['ei']:
            trouble += 1

    def unambiguous_ipa(word):
        good = word['IPA'] is not None
        good = good and (word['ei'] is not word['ie'])
        return good
    have_ipa = filter(unambiguous_ipa, word_list)
    #print "%d words, %d have some pronunciation info" % (num_words, have_any)
    #print pron_counts
    #Note: As of Aug. 18, results:
    #18967 words, 761 have some pronunciation info
    #{'IPA': 753, 'X-SAMPA': 182, 'enPR': 65}
    #Clearly IPA is the most popular
    summary = []
    #We think this is the right sound, although there
    #may be others which are close
    long_a_sound = u"eɪ"
    for word in have_ipa:
        reason = None
        cur_sum = {'word': word['word'].lower()}
        #If the phoneme is not in the string at all, we
        #know that it can't be a long a sound
        ipa_str = unicode(word['IPA'], 'utf-8')
        if word['ei'] and word['after_c']:
            follows = True
            reason = "ei after c"
        if word['ie'] and not word['after_c'] and long_a_sound not in ipa_str:
            follows = True
            reason = "ie, not after c, no weigh sound"
        if word['ie'] and word['after_c']:
            follows = False
            reason = "ie after c"
        if reason is None:
            #Okay, easy cases done. At this point we need to check
            #whether the 'ei' or 'ie' has a long a sound. If the word
            #has 'ei' and that syllable has a long a, or if the word
            #has 'ie' and does NOT have that sound, it follows the rule;
            #otherwise it breaks it.
            if word['ei']:
                sp = 'ei'
                reason = 'ei'
            elif word['ie']:
                sp = 'ie'
                reason = 'ie'
            sp_loc = word['word'].find(sp)
            #Mapping characters onto pronunciation is tricky.
            #We take the ratio of the length of the IPA string
            #to the written spelling to get an idea of how many pronunciation
            #characters are used per written character
            pron_chars_per_glyph = 1.0 * len(ipa_str) / len(word['word'])
            pron_loc = int(sp_loc * pron_chars_per_glyph)
            start = max(0, pron_loc - 1)
            end = min(start + len(long_a_sound) + 3, len(ipa_str))
            sub_pron_str = ipa_str[start:end]
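            #Worked example, assuming 'beige' (IPA /beɪʒ/) is in the list:
            #  4 IPA chars / 5 letters -> 0.8 pron chars per glyph;
            #  'ei' starts at index 1, so pron_loc = int(1 * 0.8) = 0,
            #  sub_pron_str = ipa_str[0:4] = 'beɪʒ', which contains 'eɪ',
            #  so 'beige' follows the rule ('ei' with the weigh sound).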
            ## word_out = word['word'].lower()
            ## word_out = word_out.replace(sp, sp.upper())
            ## print "%s: %s" % (word_out, sub_pron_str)
            if long_a_sound in sub_pron_str:
                follows = word['ei']
                reason += " with weigh"
            else:
                follows = word['ie']
                reason += " with no weigh"
        cur_sum['follows'] = follows
        cur_sum['reason'] = reason
        summary.append(cur_sum)
#print "%d follow the rule up to 'after c', out of %d" % (tot_ie + tot_cei, | |
# len(have_ipa) ) | |
#print "%d have a long a sound" % (len(have_a)) | |
#Check integrity | |
follow_rule = filter(lambda x: x['follows'], summary) | |
break_rule = filter(lambda x: not x['follows'], summary) | |
total_checked = len(have_ipa) | |
num_follow = len(follow_rule) | |
num_break = len(break_rule) | |
print "Total: %d Follow rule: %d Break rule: %d Error: %d" % (total_checked, | |
num_follow, num_break, total_checked - num_follow - num_break) | |
for follow in follow_rule: | |
assert follow not in break_rule | |
keys = ['word','follows','reason'] | |
summary = sorted(summary, key=lambda x: x['word']) | |
for el in summary: | |
assert 'ie' in el['word'] or 'ei' in el['word'] | |
line = '\t'.join(["%s" % el[key] for key in keys]) | |
print line | |