jiffyclub/aiclass_nlp2-2.py

## aiclass_nlp2-2.py
"""
This code is for part two of the AI Class optional Natural Language Processing
programming assignment. See youtu.be/KuSg1wcty3s for the complete problem
definition.

We are given a sentence that spans several lines and has been shredded
vertically so that there are 2 characters in each column. The columns are
arranged in random order and it's our job to assemble them into the correct
order.

In this code I use a simple dictionary method to reconstruct the original.
The code first identifies the first column by looking for a capital letter,
then tries to add on columns, only accepting a column as a match if every
word on every row matches a word in the dictionary or the beginning of a word
in the dictionary.

This technique is effective but slow since the dictonary is large and frequently
searched. (I used as a dictionary the list of words in /usr/share/dict/words.
This may be in different locations on different systems.)

Contrast this method with the letter-by-letter grammar method in
gist.github.com/1499949, which is much faster but unable to recognize
real English words.

I think it could be effective to combine the grammar and dictionary methods.
Many potential strings could be quickly ruled invalid by grammar rules and
only after passing those rules would a word be subjected to the more
time consuming dictionary tests.


The jumbled sentence is:

|de|  | f|Cl|nf|ed|au| i|ti|  |ma|ha|or|nn|ou| S|on|nd|on|
|ry|  |is|th|is| b|eo|as|  |  |f |wh| o|ic| t|, |  |he|h |
|ab|  |la|pr|od|ge|ob| m|an|  |s |is|el|ti|ng|il|d |ua|c |
|he|  |ea|of|ho| m| t|et|ha|  | t|od|ds|e |ki| c|t |ng|br|
|wo|m,|to|yo|hi|ve|u | t|ob|  |pr|d |s |us| s|ul|le|ol|e |
| t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er|
|ry|d |un|Th|" |io|eo|n,|is|  |bl|f |pu|Co|ic| o|he|at|mm|
|hi|  |  |in|  |  | t|  |  |  |  |ye|  |ar|  |s |  |  |. |

Reconstructed it reads:

Claude Shannon founded information
theory, which is the basis of
probabilistic language models and
of the code breaking methods that
you would use to solve this problem,
with the paper titled "A Mathematical
Theory of Communication," published
in this year.

Author
------
@jiffyclub
git.io/jiffyclub

"""

import string

S = [
'de|  | f|Cl|nf|ed|au| i|ti|  |ma|ha|or|nn|ou| S|on|nd|on',
'ry|  |is|th|is| b|eo|as|  |  |f |wh| o|ic| t|, |  |he|h ',
'ab|  |la|pr|od|ge|ob| m|an|  |s |is|el|ti|ng|il|d |ua|c ',
'he|  |ea|of|ho| m| t|et|ha|  | t|od|ds|e |ki| c|t |ng|br',
'wo|m,|to|yo|hi|ve|u | t|ob|  |pr|d |s |us| s|ul|le|ol|e ',
' t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er',
'ry|d |un|Th|" |io|eo|n,|is|  |bl|f |pu|Co|ic| o|he|at|mm',
'hi|  |  |in|  |  | t|  |  |  |  |ye|  |ar|  |s |  |  |. ']

VOWELS = 'aeiouAEIOU'
CONSONANTS = 'bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ'

# this may be in another location on other computers
WORD_DICT = '/usr/share/dict/words'

def read_words(words_file):
    """
    Read a list of words from a text file and return it as a Python list.

    Parameters
    ----------
    words_file : str
        Path to a text file containing words.

    Returns
    -------
    word_list : list of str
        List of words read from file.

    """
    f = open(words_file, 'r')

    words = f.readlines()

    f.close()

    return [w.strip() for w in words]  # strip new lines


def find_first_column(l):
    """
    The first letter of the first row should be a capital
    letter. Find the index of the column that meets this requirement.

    (This is using the prior knowledge that only one column will meet
    this condition.)

    Parameters
    ----------
    l : list of str
        First row of paragraph as a list of two character strings.

    Returns
    -------
    ind : int
        Index of best first column.

    """
    for i,s in enumerate(l):
        if s[0] in string.uppercase:
            return i

def check_string(s, words, known_words):
    """
    Check that a string contains allowed words or the beginnings of words.

    Parameters
    ----------
    s : str
        String to check.

    words : list of str
        List of allowed words.

    known_words : list of str
        List of words already confirmed in the paragraph. New words are
        added here.

    Returns
    -------
    okay : bool
        True if string contains allowed words, False otherwise.

    """
    split = s.split()

    for word in split:
        # assume True, prove otherwise
        is_word = True

        # remove punctuation, if word is pure punctuation just continue
        temp_word = [c for c in word if c not in string.punctuation]

        if not temp_word:
            continue
        else:
            word = reduce(lambda x, y: x + y, temp_word)

        # if we've already seen this word continue on
        if word in known_words:
            continue

        # treat the last bit of the string separately
        if not s.endswith(word):
            # dictionary doesn't have capitalized words, except for
            # proper names
            #
            # if nothing matched it could be because the word
            # is plural or conjugated, which wouldn't show up in the
            # dictionary.
            if not word in words and not word.lower() in words:
                if word.endswith('s'):
                    if not word[:-1] in words or not word[:-1].lower() in words:
                        is_word = False

                elif word.endswith('ed'):
                    if not word[:-2] in words or not word[:-2].lower() in words:
                        is_word = False

                else:
                    is_word = False

        # this word is at the end of the input string so could be just
        # a portion of a word
        else:
            if not word in words and not word.lower() in words:
                word_starts = [True for w in words
                                if w.startswith(word)
                                    or w.startswith(word.lower())]

                if not word_starts:
                    # if nothing matched it could be because the word
                    # is plural or conjugated, which wouldn't show up in the
                    # dictionary.
                    if word.endswith('s'):
                        word_starts = [True for w in words
                                        if w.startswith(word[:-1])
                                            or w.startswith(word[:-1].lower())]

                    elif word.endswith('ed'):
                        word_starts = [True for w in words
                                        if w.startswith(word[:-2])
                                            or w.startswith(word[:-2].lower())]

                if not word_starts:
                    is_word = False

        # if any word isn't in the dictionary then return false
        if not is_word:
            return is_word

    return is_word


def check_column(decoded, paragraph, col, words, known_words):
    """
    Check whether a given column of the paragraph fits in with what's
    already been added to the decoded paragraph.

    Parameters
    ----------
    decoded : list of str
        List of paragraph lines.

    paragraph : list of lists
        List of lists containing paragraph lines split into two
        character chunks.

    col : int
        Column to add to `decoded`.

    words : list of str
        List of allowed words.

    known_words : list of str
        List of words already confirmed in the paragraph.

    Returns
    -------
    correct : bool
        True if column `col` is consistent with what has already
        been decoded, False otherwise.

    """
    correct = [check_string(decoded[i] + paragraph[i][col], words, known_words)
                for i in range(len(decoded))]

    if correct.count(True) != len(decoded):
        return False

    else:
        # add good words the known_words
        for i in range(len(decoded)):
            s = decoded[i] + paragraph[i][col]

            split = s.split()

            for w in split:
                if not s.endswith(w) and not w in known_words:
                    known_words.append(w)

        return True


def add_column(decoded, paragraph, col):
    """
    Add a column from the paragraph to the decoded container.
    Also removes the column from use.

    Parameters
    ----------
    decoded : list of str
        List of paragraph lines.

    paragraph : list of lists
        List of lists containing paragraph lines split into two
        character chunks.

    col : int
        Column to add to `decoded`.

    Returns
    -------
    decoded : list of str
        Paragraph lines with column appended.

    paragraph : list of lists
        Line chunks with column `col` removed.

    """
    for i, line in enumerate(decoded):
        decoded[i] = line + paragraph[i].pop(col)

    return decoded, paragraph


def main():
    """
    Start with the column identified by `find_first_column` and then
    successively add columns that pass all of the rule tests.

    """
    # get list of allowed words
    words = read_words(WORD_DICT)

    # turn S into list of lists
    paragraph = [s.split('|') for s in S]

    # make a container for decoded paragraph
    decoded = [''] * len(paragraph)

    # this will contain the indices of the columns in their corrected
    # order.
    order = []

    # this will contain words that are confirmed part of the decoded
    # paragraph, to save looking in the big dictionary
    known_words = []

    # find the first column and remove it from paragraph
    order.append(find_first_column(paragraph[0]))

    decoded, paragraph = add_column(decoded, paragraph, order[-1])

    # now go through columns finding one that goes after the last
    # column in order
    while True:
        matched = False

        for col in range(len(paragraph[0])):
            if check_column(decoded, paragraph, col, words, known_words):
                order.append(col)

                decoded, paragraph = add_column(decoded, paragraph, order[-1])

                matched = True

                break

        if not matched:
            print decoded
            raise StandardError("No column matched your rules!")

        # check if we've matched all columns
        if len(paragraph[0]) == 0:
            break

    for line in decoded:
        print line


if __name__ == '__main__':
    main()
	"""
	This code is for part two of the AI Class optional Natural Language Processing
	programming assignment. See youtu.be/KuSg1wcty3s for the complete problem
	definition.

	We are given a sentence that spans several lines and has been shredded
	vertically so that there are 2 characters in each column. The columns are
	arranged in random order and it's our job to assemble them into the correct
	order.

	In this code I use a simple dictionary method to reconstruct the original.
	The code first identifies the first column by looking for a capital letter,
	then tries to add on columns, only accepting a column as a match if every
	word on every row matches a word in the dictionary or the beginning of a word
	in the dictionary.

	This technique is effective but slow since the dictonary is large and frequently
	searched. (I used as a dictionary the list of words in /usr/share/dict/words.
	This may be in different locations on different systems.)

	Contrast this method with the letter-by-letter grammar method in
	gist.github.com/1499949, which is much faster but unable to recognize
	real English words.

	I think it could be effective to combine the grammar and dictionary methods.
	Many potential strings could be quickly ruled invalid by grammar rules and
	only after passing those rules would a word be subjected to the more
	time consuming dictionary tests.


	The jumbled sentence is:

	\|de\| \| f\|Cl\|nf\|ed\|au\| i\|ti\| \|ma\|ha\|or\|nn\|ou\| S\|on\|nd\|on\|
	\|ry\| \|is\|th\|is\| b\|eo\|as\| \| \|f \|wh\| o\|ic\| t\|, \| \|he\|h \|
	\|ab\| \|la\|pr\|od\|ge\|ob\| m\|an\| \|s \|is\|el\|ti\|ng\|il\|d \|ua\|c \|
	\|he\| \|ea\|of\|ho\| m\| t\|et\|ha\| \| t\|od\|ds\|e \|ki\| c\|t \|ng\|br\|
	\|wo\|m,\|to\|yo\|hi\|ve\|u \| t\|ob\| \|pr\|d \|s \|us\| s\|ul\|le\|ol\|e \|
	\| t\|ca\| t\|wi\| M\|d \|th\|"A\|ma\|l \|he\| p\|at\|ap\|it\|he\|ti\|le\|er\|
	\|ry\|d \|un\|Th\|" \|io\|eo\|n,\|is\| \|bl\|f \|pu\|Co\|ic\| o\|he\|at\|mm\|
	\|hi\| \| \|in\| \| \| t\| \| \| \| \|ye\| \|ar\| \|s \| \| \|. \|

	Reconstructed it reads:

	Claude Shannon founded information
	theory, which is the basis of
	probabilistic language models and
	of the code breaking methods that
	you would use to solve this problem,
	with the paper titled "A Mathematical
	Theory of Communication," published
	in this year.

	Author
	------
	@jiffyclub
	git.io/jiffyclub

	"""

	import string

	S = [
	'de\| \| f\|Cl\|nf\|ed\|au\| i\|ti\| \|ma\|ha\|or\|nn\|ou\| S\|on\|nd\|on',
	'ry\| \|is\|th\|is\| b\|eo\|as\| \| \|f \|wh\| o\|ic\| t\|, \| \|he\|h ',
	'ab\| \|la\|pr\|od\|ge\|ob\| m\|an\| \|s \|is\|el\|ti\|ng\|il\|d \|ua\|c ',
	'he\| \|ea\|of\|ho\| m\| t\|et\|ha\| \| t\|od\|ds\|e \|ki\| c\|t \|ng\|br',
	'wo\|m,\|to\|yo\|hi\|ve\|u \| t\|ob\| \|pr\|d \|s \|us\| s\|ul\|le\|ol\|e ',
	' t\|ca\| t\|wi\| M\|d \|th\|"A\|ma\|l \|he\| p\|at\|ap\|it\|he\|ti\|le\|er',
	'ry\|d \|un\|Th\|" \|io\|eo\|n,\|is\| \|bl\|f \|pu\|Co\|ic\| o\|he\|at\|mm',
	'hi\| \| \|in\| \| \| t\| \| \| \| \|ye\| \|ar\| \|s \| \| \|. ']

	VOWELS = 'aeiouAEIOU'
	CONSONANTS = 'bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ'

	# this may be in another location on other computers
	WORD_DICT = '/usr/share/dict/words'

	def read_words(words_file):
	"""
	Read a list of words from a text file and return it as a Python list.

	Parameters
	----------
	words_file : str
	Path to a text file containing words.

	Returns
	-------
	word_list : list of str
	List of words read from file.

	"""
	f = open(words_file, 'r')

	words = f.readlines()

	f.close()

	return [w.strip() for w in words] # strip new lines


	def find_first_column(l):
	"""
	The first letter of the first row should be a capital
	letter. Find the index of the column that meets this requirement.

	(This is using the prior knowledge that only one column will meet
	this condition.)

	Parameters
	----------
	l : list of str
	First row of paragraph as a list of two character strings.

	Returns
	-------
	ind : int
	Index of best first column.

	"""
	for i,s in enumerate(l):
	if s[0] in string.uppercase:
	return i

	def check_string(s, words, known_words):
	"""
	Check that a string contains allowed words or the beginnings of words.

	Parameters
	----------
	s : str
	String to check.

	words : list of str
	List of allowed words.

	known_words : list of str
	List of words already confirmed in the paragraph. New words are
	added here.

	Returns
	-------
	okay : bool
	True if string contains allowed words, False otherwise.

	"""
	split = s.split()

	for word in split:
	# assume True, prove otherwise
	is_word = True

	# remove punctuation, if word is pure punctuation just continue
	temp_word = [c for c in word if c not in string.punctuation]

	if not temp_word:
	continue
	else:
	word = reduce(lambda x, y: x + y, temp_word)

	# if we've already seen this word continue on
	if word in known_words:
	continue

	# treat the last bit of the string separately
	if not s.endswith(word):
	# dictionary doesn't have capitalized words, except for
	# proper names
	#
	# if nothing matched it could be because the word
	# is plural or conjugated, which wouldn't show up in the
	# dictionary.
	if not word in words and not word.lower() in words:
	if word.endswith('s'):
	if not word[:-1] in words or not word[:-1].lower() in words:
	is_word = False

	elif word.endswith('ed'):
	if not word[:-2] in words or not word[:-2].lower() in words:
	is_word = False

	else:
	is_word = False

	# this word is at the end of the input string so could be just
	# a portion of a word
	else:
	if not word in words and not word.lower() in words:
	word_starts = [True for w in words
	if w.startswith(word)
	or w.startswith(word.lower())]

	if not word_starts:
	# if nothing matched it could be because the word
	# is plural or conjugated, which wouldn't show up in the
	# dictionary.
	if word.endswith('s'):
	word_starts = [True for w in words
	if w.startswith(word[:-1])
	or w.startswith(word[:-1].lower())]

	elif word.endswith('ed'):
	word_starts = [True for w in words
	if w.startswith(word[:-2])
	or w.startswith(word[:-2].lower())]

	if not word_starts:
	is_word = False

	# if any word isn't in the dictionary then return false
	if not is_word:
	return is_word

	return is_word


	def check_column(decoded, paragraph, col, words, known_words):
	"""
	Check whether a given column of the paragraph fits in with what's
	already been added to the decoded paragraph.

	Parameters
	----------
	decoded : list of str
	List of paragraph lines.

	paragraph : list of lists
	List of lists containing paragraph lines split into two
	character chunks.

	col : int
	Column to add to `decoded`.

	words : list of str
	List of allowed words.

	known_words : list of str
	List of words already confirmed in the paragraph.

	Returns
	-------
	correct : bool
	True if column `col` is consistent with what has already
	been decoded, False otherwise.

	"""
	correct = [check_string(decoded[i] + paragraph[i][col], words, known_words)
	for i in range(len(decoded))]

	if correct.count(True) != len(decoded):
	return False

	else:
	# add good words the known_words
	for i in range(len(decoded)):
	s = decoded[i] + paragraph[i][col]

	split = s.split()

	for w in split:
	if not s.endswith(w) and not w in known_words:
	known_words.append(w)

	return True


	def add_column(decoded, paragraph, col):
	"""
	Add a column from the paragraph to the decoded container.
	Also removes the column from use.

	Parameters
	----------
	decoded : list of str
	List of paragraph lines.

	paragraph : list of lists
	List of lists containing paragraph lines split into two
	character chunks.

	col : int
	Column to add to `decoded`.

	Returns
	-------
	decoded : list of str
	Paragraph lines with column appended.

	paragraph : list of lists
	Line chunks with column `col` removed.

	"""
	for i, line in enumerate(decoded):
	decoded[i] = line + paragraph[i].pop(col)

	return decoded, paragraph


	def main():
	"""
	Start with the column identified by `find_first_column` and then
	successively add columns that pass all of the rule tests.

	"""
	# get list of allowed words
	words = read_words(WORD_DICT)

	# turn S into list of lists
	paragraph = [s.split('\|') for s in S]

	# make a container for decoded paragraph
	decoded = [''] * len(paragraph)

	# this will contain the indices of the columns in their corrected
	# order.
	order = []

	# this will contain words that are confirmed part of the decoded
	# paragraph, to save looking in the big dictionary
	known_words = []

	# find the first column and remove it from paragraph
	order.append(find_first_column(paragraph[0]))

	decoded, paragraph = add_column(decoded, paragraph, order[-1])

	# now go through columns finding one that goes after the last
	# column in order
	while True:
	matched = False

	for col in range(len(paragraph[0])):
	if check_column(decoded, paragraph, col, words, known_words):
	order.append(col)

	decoded, paragraph = add_column(decoded, paragraph, order[-1])

	matched = True

	break

	if not matched:
	print decoded
	raise StandardError("No column matched your rules!")

	# check if we've matched all columns
	if len(paragraph[0]) == 0:
	break

	for line in decoded:
	print line


	if __name__ == '__main__':
	main()