Skip to content

Instantly share code, notes, and snippets.

@jiffyclub
Created December 20, 2011 02:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jiffyclub/1499952 to your computer and use it in GitHub Desktop.
Save jiffyclub/1499952 to your computer and use it in GitHub Desktop.
Python, dictionary based solution for part two of www.ai-class.com Natural Language Processing problem.
"""
This code is for part two of the AI Class optional Natural Language Processing
programming assignment. See youtu.be/KuSg1wcty3s for the complete problem
definition.
We are given a sentence that spans several lines and has been shredded
vertically so that there are 2 characters in each column. The columns are
arranged in random order and it's our job to assemble them into the correct
order.
In this code I use a simple dictionary method to reconstruct the original.
The code first identifies the first column by looking for a capital letter,
then tries to add on columns, only accepting a column as a match if every
word on every row matches a word in the dictionary or the beginning of a word
in the dictionary.
This technique is effective but slow since the dictonary is large and frequently
searched. (I used as a dictionary the list of words in /usr/share/dict/words.
This may be in different locations on different systems.)
Contrast this method with the letter-by-letter grammar method in
gist.github.com/1499949, which is much faster but unable to recognize
real English words.
I think it could be effective to combine the grammar and dictionary methods.
Many potential strings could be quickly ruled invalid by grammar rules and
only after passing those rules would a word be subjected to the more
time consuming dictionary tests.
The jumbled sentence is:
|de| | f|Cl|nf|ed|au| i|ti| |ma|ha|or|nn|ou| S|on|nd|on|
|ry| |is|th|is| b|eo|as| | |f |wh| o|ic| t|, | |he|h |
|ab| |la|pr|od|ge|ob| m|an| |s |is|el|ti|ng|il|d |ua|c |
|he| |ea|of|ho| m| t|et|ha| | t|od|ds|e |ki| c|t |ng|br|
|wo|m,|to|yo|hi|ve|u | t|ob| |pr|d |s |us| s|ul|le|ol|e |
| t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er|
|ry|d |un|Th|" |io|eo|n,|is| |bl|f |pu|Co|ic| o|he|at|mm|
|hi| | |in| | | t| | | | |ye| |ar| |s | | |. |
Reconstructed it reads:
Claude Shannon founded information
theory, which is the basis of
probabilistic language models and
of the code breaking methods that
you would use to solve this problem,
with the paper titled "A Mathematical
Theory of Communication," published
in this year.
Author
------
@jiffyclub
git.io/jiffyclub
"""
import string
S = [
'de| | f|Cl|nf|ed|au| i|ti| |ma|ha|or|nn|ou| S|on|nd|on',
'ry| |is|th|is| b|eo|as| | |f |wh| o|ic| t|, | |he|h ',
'ab| |la|pr|od|ge|ob| m|an| |s |is|el|ti|ng|il|d |ua|c ',
'he| |ea|of|ho| m| t|et|ha| | t|od|ds|e |ki| c|t |ng|br',
'wo|m,|to|yo|hi|ve|u | t|ob| |pr|d |s |us| s|ul|le|ol|e ',
' t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er',
'ry|d |un|Th|" |io|eo|n,|is| |bl|f |pu|Co|ic| o|he|at|mm',
'hi| | |in| | | t| | | | |ye| |ar| |s | | |. ']
VOWELS = 'aeiouAEIOU'
CONSONANTS = 'bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ'
# this may be in another location on other computers
WORD_DICT = '/usr/share/dict/words'
def read_words(words_file):
"""
Read a list of words from a text file and return it as a Python list.
Parameters
----------
words_file : str
Path to a text file containing words.
Returns
-------
word_list : list of str
List of words read from file.
"""
f = open(words_file, 'r')
words = f.readlines()
f.close()
return [w.strip() for w in words] # strip new lines
def find_first_column(l):
"""
The first letter of the first row should be a capital
letter. Find the index of the column that meets this requirement.
(This is using the prior knowledge that only one column will meet
this condition.)
Parameters
----------
l : list of str
First row of paragraph as a list of two character strings.
Returns
-------
ind : int
Index of best first column.
"""
for i,s in enumerate(l):
if s[0] in string.uppercase:
return i
def check_string(s, words, known_words):
"""
Check that a string contains allowed words or the beginnings of words.
Parameters
----------
s : str
String to check.
words : list of str
List of allowed words.
known_words : list of str
List of words already confirmed in the paragraph. New words are
added here.
Returns
-------
okay : bool
True if string contains allowed words, False otherwise.
"""
split = s.split()
for word in split:
# assume True, prove otherwise
is_word = True
# remove punctuation, if word is pure punctuation just continue
temp_word = [c for c in word if c not in string.punctuation]
if not temp_word:
continue
else:
word = reduce(lambda x, y: x + y, temp_word)
# if we've already seen this word continue on
if word in known_words:
continue
# treat the last bit of the string separately
if not s.endswith(word):
# dictionary doesn't have capitalized words, except for
# proper names
#
# if nothing matched it could be because the word
# is plural or conjugated, which wouldn't show up in the
# dictionary.
if not word in words and not word.lower() in words:
if word.endswith('s'):
if not word[:-1] in words or not word[:-1].lower() in words:
is_word = False
elif word.endswith('ed'):
if not word[:-2] in words or not word[:-2].lower() in words:
is_word = False
else:
is_word = False
# this word is at the end of the input string so could be just
# a portion of a word
else:
if not word in words and not word.lower() in words:
word_starts = [True for w in words
if w.startswith(word)
or w.startswith(word.lower())]
if not word_starts:
# if nothing matched it could be because the word
# is plural or conjugated, which wouldn't show up in the
# dictionary.
if word.endswith('s'):
word_starts = [True for w in words
if w.startswith(word[:-1])
or w.startswith(word[:-1].lower())]
elif word.endswith('ed'):
word_starts = [True for w in words
if w.startswith(word[:-2])
or w.startswith(word[:-2].lower())]
if not word_starts:
is_word = False
# if any word isn't in the dictionary then return false
if not is_word:
return is_word
return is_word
def check_column(decoded, paragraph, col, words, known_words):
"""
Check whether a given column of the paragraph fits in with what's
already been added to the decoded paragraph.
Parameters
----------
decoded : list of str
List of paragraph lines.
paragraph : list of lists
List of lists containing paragraph lines split into two
character chunks.
col : int
Column to add to `decoded`.
words : list of str
List of allowed words.
known_words : list of str
List of words already confirmed in the paragraph.
Returns
-------
correct : bool
True if column `col` is consistent with what has already
been decoded, False otherwise.
"""
correct = [check_string(decoded[i] + paragraph[i][col], words, known_words)
for i in range(len(decoded))]
if correct.count(True) != len(decoded):
return False
else:
# add good words the known_words
for i in range(len(decoded)):
s = decoded[i] + paragraph[i][col]
split = s.split()
for w in split:
if not s.endswith(w) and not w in known_words:
known_words.append(w)
return True
def add_column(decoded, paragraph, col):
"""
Add a column from the paragraph to the decoded container.
Also removes the column from use.
Parameters
----------
decoded : list of str
List of paragraph lines.
paragraph : list of lists
List of lists containing paragraph lines split into two
character chunks.
col : int
Column to add to `decoded`.
Returns
-------
decoded : list of str
Paragraph lines with column appended.
paragraph : list of lists
Line chunks with column `col` removed.
"""
for i, line in enumerate(decoded):
decoded[i] = line + paragraph[i].pop(col)
return decoded, paragraph
def main():
"""
Start with the column identified by `find_first_column` and then
successively add columns that pass all of the rule tests.
"""
# get list of allowed words
words = read_words(WORD_DICT)
# turn S into list of lists
paragraph = [s.split('|') for s in S]
# make a container for decoded paragraph
decoded = [''] * len(paragraph)
# this will contain the indices of the columns in their corrected
# order.
order = []
# this will contain words that are confirmed part of the decoded
# paragraph, to save looking in the big dictionary
known_words = []
# find the first column and remove it from paragraph
order.append(find_first_column(paragraph[0]))
decoded, paragraph = add_column(decoded, paragraph, order[-1])
# now go through columns finding one that goes after the last
# column in order
while True:
matched = False
for col in range(len(paragraph[0])):
if check_column(decoded, paragraph, col, words, known_words):
order.append(col)
decoded, paragraph = add_column(decoded, paragraph, order[-1])
matched = True
break
if not matched:
print decoded
raise StandardError("No column matched your rules!")
# check if we've matched all columns
if len(paragraph[0]) == 0:
break
for line in decoded:
print line
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment