Scrapes a downloaded wiktionary dump
# -*- coding: utf-8 -*-
"""
Copyright (c) 2012 Jacob Silterra

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
""" | |
Read through dump of wiktionary database | |
Scrape words and their pronunciations | |
""" | |
import os
import sys
import bz2
import re
import xml.etree.ElementTree as etree

#Different phonetic alphabets
phon_alphs = ["enPR", "IPA", "X-SAMPA"]
def open_file(file_path):
    if file_path.endswith("bz2"):
        f = bz2.BZ2File(file_path, 'r')
    else:
        f = open(file_path, 'r')
    return f
def _get_xml_data(file_path):
    #Unused helper: parses the entire XML tree at once,
    #which is too memory-hungry for a full dump. Kept for reference.
    f = open_file(file_path)
    data = etree.parse(f)
    f.close()
    return data
def format_cur_word(cur_word):
    #Debug helper, only referenced from a commented-out print below;
    #expects a 'pronunciation' key that the rest of this script does not set
    pronstr = cur_word['pronunciation']
    if pronstr is None:
        pronstr = "xxNonexx"
    pronstr = pronstr.replace("\n", "")
    outstr = cur_word['title'] + "\t" + pronstr
    return outstr
def create_full_word_list(file_path, keep_filter=lambda x: True):
    f = open_file(file_path)
    #max_lines = 50# int(sys.argv[1])
    max_words = -1
    #print "max_lines: %d" % (max_lines)
    #We don't parse the full xml tree; python seems to run
    #out of memory when we do that.
    #Which means we're going to have some ugly in-page state tracking.
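    #A sketch of the simplified, hypothetical page layout the
    #line-scanner below assumes (real dump pages carry many more tags):
    #  <page>
    #    <title>weigh</title>
    #    <text>
    #      ==English==
    #      ===Pronunciation===
    #      * {{a|US}} {{IPA|/weɪ/|lang=en}}
    #    </text>
    #  </page>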
    inpage = False
    ineng = False
    inpron = False
    numlines = 0
    cur_word = {}
    all_words = []
    for line in f:
        numlines += 1
        #if max_lines > 0 and numlines >= max_lines:
        #    break
        if max_words > 0 and len(all_words) >= max_words:
            break
        if "<page>" in line:
            inpage = True
            cur_word = {}
        if "</page>" in line:
            assert inpage
            has_spaces = " " in cur_word['title']
            if ineng and not has_spaces and keep_filter(cur_word['title']):
                all_words.append(cur_word)
                #print format_cur_word(cur_word)
            inpage = False
            ineng = False
            inpron = False
        if inpage:
            #The most obvious place to get the word
            #is actually the title of the page. We will get
            #some false positives this way.
            if "<title>" in line:
                title = line.replace("<title>", "")
                title = title.replace("</title>", "")
                cur_word['title'] = title.strip().replace("\n", "")
                for phon_alph in phon_alphs:
                    cur_word[phon_alph] = None
            if "==English==" in line:
                #We are only taking english words
                ineng = True
                pronstr = None
            elif ineng and "==Pronunciation==" in line:
                inpron = True
                pronstr = ""
            elif ineng and inpron and "==" in line:
                #A new section header ends the pronunciation block
                pron_props = split_pron_text(pronstr.strip())
                cur_word.update(pron_props)
                pronstr = None
                inpron = False
            elif inpron:
                #Lines read from the file already end in "\n"
                pronstr += line
    return all_words
def get_out_keys():
    keys = ['word', 'ie', 'ei', 'after_c']
    keys.extend(phon_alphs)
    return keys
def format_wordprops(props):
    keys = get_out_keys()
    outstr = props[keys[0]]
    for key in keys[1:]:
        outstr += "\t" + str(props[key])
    return outstr
def split_pron_text(pron_string):
    props = {alph: None for alph in phon_alphs}
    if "This entry needs pronunciation information" in pron_string:
        return props
    #Pronunciations are bulleted, so this splits the text
    #into separate lines
    pron_toks = pron_string.split("*")
    for tok in pron_toks:
        #Only use US dialect if a dialect is specified
        if "{{a|US" not in tok and "{{a|" in tok:
            continue
        tok = tok.replace("{{a|US}}", "")
        tok = tok.replace("\n", "")
        #Different alphabets are comma separated
        phon_toks = tok.split(",")
        for phon_tok in phon_toks:
            for phon_alph in phon_alphs:
                phon_loc = phon_tok.find(phon_alph)
                if phon_loc >= 0:
                    #Skip past "ALPHABET|" to the pronunciation itself
                    strt_loc = phon_loc + len(phon_alph) + 1
                    end_loc = phon_tok.find("}}", strt_loc)
                    pron = phon_tok[strt_loc:end_loc]
                    pron = re.sub("lang=.{0,2}", "", pron)
                    pron = pron.strip("|")
                    pron = pron.strip("/")
                    props[phon_alph] = pron
    return props
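#A quick illustration of split_pron_text on a made-up wikitext line
#(hypothetical input; real entries vary):
#  split_pron_text("* {{a|US}} {{IPA|/weɪ/|lang=en}}")
#returns
#  {'enPR': None, 'IPA': 'weɪ', 'X-SAMPA': None}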
def parse_word_list_file(word_list_path):
    #Parse strings back to python data types
    fi = open(word_list_path, 'r')
    keys = fi.readline().split("\t")
    keys = [key.strip() for key in keys]
    word_list = []
    for line in fi:
        word_props = {}
        ind = 0
        for tok in line.split("\t"):
            val = tok.strip()
            #Compare the stripped value, since the last column
            #keeps its trailing newline
            if val == 'None':
                val = None
            elif val == 'True':
                val = True
            elif val == 'False':
                val = False
            if keys[ind] == "word":
                val = val.lower()
            word_props[keys[ind]] = val
            ind += 1
        word_list.append(word_props)
    fi.close()
    return word_list
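#The word list file is a TSV whose header matches get_out_keys();
#a hypothetical row (invented values for illustration) looks like:
#  word    ie      ei      after_c  enPR  IPA   X-SAMPA
#  weigh   False   True    False    None  weɪ   None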
def create_word_list(db_file_path, word_list_path):
    #Parse wiki db, extract english words and pronunciations only
    def keep_func(word):
        temp = word.lower()
        return 'ie' in temp or 'ei' in temp
    #Generate this temporary file, apparently it doesn't exist
    word_list = create_full_word_list(db_file_path, keep_func)
    fi = open(word_list_path, 'w')
    header_line = "\t".join(get_out_keys())
    fi.write(header_line + "\n")
    for word_props in word_list:
        word = word_props['title'].lower()
        word_props['word'] = word
        word_props['ie'] = 'ie' in word
        word_props['ei'] = 'ei' in word
        word_props['after_c'] = 'cie' in word or 'cei' in word
        out_line = format_wordprops(word_props)
        #print out_line
        fi.write(out_line + "\n")
    fi.close()
    return word_list
if __name__ == "__main__":
    db_file_path = "enwiktionary-latest-pages-articles.xml.bz2"
    filtered_word_list_path = 'ie_ei_only.txt'
    try:
        word_list = parse_word_list_file(filtered_word_list_path)
    except IOError:
        print "Word list %s not found, generating" % (filtered_word_list_path)
        word_list = create_word_list(db_file_path, filtered_word_list_path)
    #Compile some statistics on words, mostly checking
    #to see how much pronunciation info we have
    num_words = len(word_list)
    have_any = 0
    pron_counts = {x: 0 for x in phon_alphs}
    trouble = 0
    for word in word_list:
        have_pron = False
        for alph in phon_alphs:
            if alph in word and word[alph] is not None:
                pron_counts[alph] += 1
                have_pron = True
        if have_pron:
            have_any += 1
        if word['ie'] and word['ei']:
            trouble += 1

    def unambiguous_ipa(word):
        good = word['IPA'] is not None
        good = good and (word['ei'] is not word['ie'])
        return good
    have_ipa = filter(unambiguous_ipa, word_list)
    #print "%d words, %d have some pronunciation info" % (num_words, have_any)
    #print pron_counts
    #Note: As of Aug. 18, results:
    #18967 words, 761 have some pronunciation info
    #{'IPA': 753, 'X-SAMPA': 182, 'enPR': 65}
    #Clearly IPA is the most popular
    summary = []
    #We think this is the right sound, although there
    #may be others which are close
    long_a_sound = u"eɪ"
    for word in have_ipa:
        reason = None
        cur_sum = {'word': word['word'].lower()}
        #If the phoneme is not in the string at all, we
        #know that it can't be a long a sound
        ipa_str = unicode(word['IPA'], 'utf-8')
        if word['ei'] and word['after_c']:
            follows = True
            reason = "ei after c"
        if word['ie'] and not word['after_c'] and long_a_sound not in ipa_str:
            follows = True
            reason = "ie, not after c, no weigh sound"
        if word['ie'] and word['after_c']:
            follows = False
            reason = "ie after c"
        if reason is None:
            #Okay, easy cases done. At this point we need to check
            #whether the 'ei' or 'ie' has a long a sound. If the word
            #has 'ei' and that syllable has a long a, or if the word
            #has 'ie' and does NOT have that sound, it follows the rule;
            #otherwise it breaks it.
            if word['ei']:
                sp = 'ei'
                reason = 'ei'
            elif word['ie']:
                sp = 'ie'
                reason = 'ie'
            sp_loc = word['word'].find(sp)
            #Mapping characters onto pronunciation is tricky.
            #We take the ratio of the length of the IPA string
            #to the written spelling to get an idea of how many pronunciation
            #characters are used per written character
            pron_chars_per_glyph = 1.0 * len(ipa_str) / len(word['word'])
            pron_loc = int(sp_loc * pron_chars_per_glyph)
            start = max(0, pron_loc - 1)
            end = min(start + len(long_a_sound) + 3, len(ipa_str))
            sub_pron_str = ipa_str[start:end]
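            #Worked example, assuming 'beige' (IPA /beɪʒ/) is in the list:
            #  4 IPA chars / 5 letters -> 0.8 pron chars per glyph;
            #  'ei' starts at index 1, so pron_loc = int(1 * 0.8) = 0,
            #  sub_pron_str = ipa_str[0:4] = 'beɪʒ', which contains 'eɪ',
            #  so 'beige' follows the rule ('ei' with the weigh sound).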
            ## word_out = word['word'].lower()
            ## word_out = word_out.replace(sp, sp.upper())
            ## print "%s: %s" % (word_out, sub_pron_str)
            if long_a_sound in sub_pron_str:
                follows = word['ei']
                reason += " with weigh"
            else:
                follows = word['ie']
                reason += " with no weigh"
        cur_sum['follows'] = follows
        cur_sum['reason'] = reason
        summary.append(cur_sum)
#print "%d follow the rule up to 'after c', out of %d" % (tot_ie + tot_cei, | |
# len(have_ipa) ) | |
#print "%d have a long a sound" % (len(have_a)) | |
#Check integrity | |
follow_rule = filter(lambda x: x['follows'], summary) | |
break_rule = filter(lambda x: not x['follows'], summary) | |
total_checked = len(have_ipa) | |
num_follow = len(follow_rule) | |
num_break = len(break_rule) | |
print "Total: %d Follow rule: %d Break rule: %d Error: %d" % (total_checked, | |
num_follow, num_break, total_checked - num_follow - num_break) | |
for follow in follow_rule: | |
assert follow not in break_rule | |
keys = ['word','follows','reason'] | |
summary = sorted(summary, key=lambda x: x['word']) | |
for el in summary: | |
assert 'ie' in el['word'] or 'ei' in el['word'] | |
line = '\t'.join(["%s" % el[key] for key in keys]) | |
print line | |