Skip to content

Instantly share code, notes, and snippets.

@dovinmu
Created July 13, 2018 00:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dovinmu/e034a019566e4c93cfe126a3994b1959 to your computer and use it in GitHub Desktop.
Save dovinmu/e034a019566e4c93cfe126a3994b1959 to your computer and use it in GitHub Desktop.
def loadWordSet():
    """Load an English word list into a set.

    The system dictionary at /usr/share/dict/american-english is tried
    first. If it cannot be read, a notice is printed and the file
    'words_alpha.txt' in the current working directory is tried instead.
    If that cannot be read either, download instructions are printed.

    Returns:
        set[str]: one whitespace-stripped entry per line read. The set is
        empty when neither file could be read.
    """
    s = set()
    try:
        with open('/usr/share/dict/american-english', 'r') as f:
            s.update(line.strip() for line in f)
    # Only I/O and decoding failures trigger the fallback; a bare except
    # would also swallow KeyboardInterrupt and genuine programming errors.
    except (OSError, UnicodeError):
        print("Could not find built-in dictionary")
    else:
        return s

    # rely on https://github.com/dwyl/english-words.git
    try:
        with open('words_alpha.txt') as f:
            s.update(line.strip() for line in f)
    except (OSError, UnicodeError):
        print("You'll need to download the file 'words_alpha.txt' from https://github.com/dwyl/english-words into this folder")
    return s
def getPlaintext(url, timeout=30):
    """Download *url* over HTTP(S) and return the response body as text.

    Args:
        url: address of the document to fetch.
        timeout: seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the script forever.

    Returns:
        str: the decoded response body.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (4xx/5xx), so that an error page is never
            mistaken for the text of a book.
    """
    # Imported lazily so that callers who only read local files do not
    # need the third-party 'requests' package installed.
    import requests

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text
def getFileText(fname, encoding=None):
    """Return the full contents of the text file *fname* as one string.

    Args:
        fname: path of the file to read.
        encoding: text encoding handed to open(). The default, None, keeps
            the platform's default decoding; pass e.g. 'utf-8' when the
            file's encoding is known.

    Returns:
        str: everything in the file.
    """
    with open(fname, encoding=encoding) as f:
        return f.read()
def getParagraphList(text):
    """Split *text* into a list of paragraphs.

    A paragraph is a run of consecutive lines that are each at least two
    characters long; any shorter line (an empty line, or a lone '\\r' left
    over from Windows line endings) ends the current paragraph. The lines
    of a paragraph are stripped and joined with single spaces, and every
    paragraph keeps one trailing space.

    The assumption that paragraphs will start with letters seems to hold
    up well to the real world, so no further cleaning is attempted.

    Args:
        text: the raw document as a single string.

    Returns:
        list[str]: the paragraphs in document order.
    """
    result = []
    parts = []
    for line in text.split('\n'):
        if len(line) < 2:
            if parts:
                result.append(' '.join(parts) + ' ')
                parts = []
            continue
        parts.append(line.strip())
    # Flush the last paragraph: a text that does not end with a blank line
    # would otherwise silently lose its final paragraph.
    if parts:
        result.append(' '.join(parts) + ' ')
    return result
def getWordList(text):
    """Return the words of *text* in order, with punctuation and digits removed.

    The text is first split into paragraphs with getParagraphList(). Quotes,
    digits and ASCII punctuation are deleted from each paragraph (not
    replaced by spaces, so "well-known" becomes "wellknown"), and the
    result is split on single spaces. Empty pieces are dropped.

    Args:
        text: the raw document as a single string.

    Returns:
        list[str]: the cleaned, non-empty words.
    """
    # One translation table deletes every unwanted character in a single
    # pass. The backslash is written as '\\' because '\;' is not a valid
    # escape sequence.
    strip_table = str.maketrans(
        '', '', '\'"1234567890-=!@#$%^&*()_+{}[]|\\;:,.<>/?~`')
    result = []
    for par in getParagraphList(text):
        result.extend(
            word for word in par.translate(strip_table).split(' ') if word)
    return result
def _iterAcrostics(initials, wordset, min_word_len, max_word_len):
    """Yield (start, length) for each dictionary word spelled by *initials*.

    At every start position the longest candidate is tried first. A match
    consumes its letters, so reported words never overlap; without a match
    the scan moves on by one letter.
    """
    # A zero-length match would never advance the scan, so require >= 1.
    shortest = max(1, min_word_len)
    total = len(initials)
    start = 0
    while start + shortest <= total:
        longest = min(max_word_len, total - start)
        for length in range(longest, shortest - 1, -1):
            if initials[start:start + length] in wordset:
                yield start, length
                start += length
                break
        else:
            start += 1


def findAcrostic(text, wordset, level='paragraph', min_word_len=4, max_word_len=20):
    """Print every acrostic formed by the first letters of paragraphs or words.

    The first character of each unit (paragraph or word) is lower-cased and
    the resulting letter sequence is searched for members of *wordset*.
    For each hit a header line is printed with the word in upper case and
    the 0-based indices of its first and last unit, followed by the units
    themselves: paragraphs abbreviated to their first 100 characters, or
    the words with their initial capitalised.

    Args:
        text: the raw document as a single string.
        wordset: container of lower-case dictionary words to look for.
        level: 'paragraph' to use the units from getParagraphList(), 'word'
            to use those from getWordList(). Any other value prints nothing.
        min_word_len: shortest acrostic to report.
        max_word_len: longest acrostic to report.

    Returns:
        None. Results are written to standard output.
    """
    if level == 'paragraph':
        # Drop too-short paragraphs before indexing so that positions in
        # the initials string and positions in `units` always agree.
        units = [par for par in getParagraphList(text) if len(par) >= 2]
    elif level == 'word':
        units = getWordList(text)
    else:
        return

    # [:1] keeps exactly one character per unit even for the rare letters
    # whose lower-case form is longer than one code point.
    initials = ''.join(unit[0].lower()[:1] for unit in units)

    for start, length in _iterAcrostics(initials, wordset, min_word_len, max_word_len):
        stop = start + length
        # print the word and the index range of the units that spell it
        print(initials[start:stop].upper(), start, '-', stop - 1)
        if level == 'paragraph':
            # print the paragraph sequence that makes the acrostic, abbreviated
            for par in units[start:stop]:
                print(par[0], par[1:100] + '...', end='\n')
        else:
            # print the word sequence that makes the acrostic
            for word in units[start:stop]:
                print(word[0].upper() + word[1:], end=' ')
        print('\n')
def gutenbergAcrostic(url, wordset):
    """Search one document for paragraph-level and word-level acrostics.

    Args:
        url: where the text comes from. A value starting with 'http://' or
            'https://' is downloaded with getPlaintext(); anything else is
            treated as a local file path and read with getFileText().
        wordset: container of lower-case dictionary words to look for.

    Returns:
        None. A heading and the findings for each level are printed.
    """
    # Decide by URL scheme rather than by a leading '/', so that relative
    # and Windows-style paths are read from disk instead of being sent to
    # the HTTP client.
    if url.lower().startswith(('http://', 'https://')):
        text = getPlaintext(url)
    else:
        text = getFileText(url)

    print('\t\t paragraphs'.upper())
    findAcrostic(text, wordset, level='paragraph', min_word_len=4)
    # Initials of consecutive words spell short dictionary words by chance
    # far more often, hence the higher minimum length here.
    print('\t\t words'.upper())
    findAcrostic(text, wordset, level='word', min_word_len=7)
if __name__ == "__main__":
    # Demo: scan a few Project Gutenberg books for acrostics.
    wordset = loadWordSet()
    if not wordset:
        # Without any dictionary words nothing can ever match, so skip the
        # downloads instead of printing empty result sections.
        raise SystemExit("No dictionary words loaded; nothing to search for.")

    books = (
        ('A Christmas Carol', 'http://www.gutenberg.org/cache/epub/46/pg46.txt'),
        ('Pride and Prejudice', 'http://www.gutenberg.org/files/1342/1342-0.txt'),
        ("Alice's Adventures in Wonderland", 'http://www.gutenberg.org/files/11/11-0.txt'),
    )
    for name, url in books:
        print('\n\t\t' + name.upper())
        gutenbergAcrostic(url, wordset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment