# dovinmu/acrostic_finder.py

## acrostic_finder.py
def loadWordSet():
    """Load the dictionary words used to recognise acrostics.

    The system word list ``/usr/share/dict/american-english`` is tried first.
    If it cannot be read, ``words_alpha.txt`` in the current working directory
    (from https://github.com/dwyl/english-words) is tried instead.

    Returns:
        set: the whitespace-stripped lines that were read. When neither file
        is readable a hint is printed and the set is returned as-is (empty
        unless part of the first file had already been read).
    """
    s = set()
    try:
        with open('/usr/share/dict/american-english', 'r') as f:
            s.update(line.strip() for line in f)
    except (OSError, UnicodeDecodeError):
        # Only I/O and decoding failures mean "dictionary unavailable"; a bare
        # except would also hide KeyboardInterrupt and programming errors.
        print("Could not find built-in dictionary")
        # rely on https://github.com/dwyl/english-words.git
        try:
            with open('words_alpha.txt') as f:
                s.update(line.strip() for line in f)
        except (OSError, UnicodeDecodeError):
            print("You'll need to download the file 'words_alpha.txt' from https://github.com/dwyl/english-words into this folder")
    return s

def getPlaintext(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: address to fetch.
        timeout: seconds to wait for the server before giving up; handed
            straight to ``requests.get``.

    Returns:
        str: the decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: on connection failure, on
            timeout, or when the server answers with an HTTP error status.
    """
    # Local import: requests is only needed when text is fetched remotely.
    import requests
    # Without a timeout requests may block indefinitely on a stalled server.
    r = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of returning an error page as if it were the text.
    r.raise_for_status()
    return r.text

def getFileText(fname):
    """Return the entire contents of the text file at ``fname`` as one string."""
    with open(fname) as source:
        contents = source.read()
    return contents

def getParagraphList(text):
    """Split ``text`` into paragraphs separated by blank lines.

    A line shorter than two characters (empty, or a lone carriage return left
    over from CRLF line endings) closes the current paragraph. Every other
    line is stripped and appended to the paragraph followed by one space, so
    each returned paragraph ends with a trailing space.

    The text is not cleaned otherwise: the assumption that paragraphs start
    with letters seems to hold up well on real-world input.

    Args:
        text: the full text as a single string.

    Returns:
        list of str: the paragraphs in order of appearance.
    """
    result = []
    parts = []
    for line in text.split('\n'):
        if len(line) < 2:
            if parts:
                result.append(' '.join(parts) + ' ')
                parts = []
            continue
        parts.append(line.strip())
    # Flush the final paragraph: text that does not end with a blank line
    # would otherwise lose its last paragraph.
    if parts:
        result.append(' '.join(parts) + ' ')
    return result

def getWordList(text):
    """Return the words of ``text`` with digits and ASCII punctuation removed.

    The text is split into paragraphs with ``getParagraphList``; the listed
    characters are deleted from each paragraph (not replaced by a space, so
    ``well-known`` becomes ``wellknown``) and the remainder is split on single
    spaces. Empty pieces are discarded.

    Args:
        text: the full text as a single string.

    Returns:
        list of str: the cleaned words in reading order.
    """
    # The backslash is written as an explicit ``\\``; the former ``\;`` was an
    # invalid escape sequence that newer Python versions warn about.
    strip_table = str.maketrans(
        '', '',
        '\'"1234567890-=!@#$%^&*()_+{}[]|\\;:,.<>/?~`')
    result = []
    for par in getParagraphList(text):
        # One translate pass deletes every unwanted character at once.
        cleaned = par.translate(strip_table)
        result.extend(word for word in cleaned.split(' ') if word)
    return result


def findAcrostic(text, wordset, level='paragraph', min_word_len=4, max_word_len=20):
    """Print dictionary words spelled by the initials of consecutive units.

    Args:
        text: the full text as a single string.
        wordset: container of lower-case words; membership is tested with
            ``in``.
        level: ``'paragraph'`` to use the first character of each paragraph
            (paragraphs shorter than two characters are ignored), ``'word'``
            to use the first character of each cleaned word. Any other value
            makes the function do nothing.
        min_word_len: shortest acrostic to report.
        max_word_len: longest acrostic to report; also the buffer capacity.

    Returns:
        None. For every hit the word is printed in upper case followed by the
        indices of the first and last unit that spell it, then the units
        themselves (paragraphs abbreviated to 100 characters).
    """
    if level == 'paragraph':
        paragraphs = getParagraphList(text)
        # Keep the original paragraph index next to each usable paragraph so
        # skipped ones cannot shift the reported positions.
        units = [(i, par) for i, par in enumerate(paragraphs) if len(par) >= 2]
        _scanInitials(units, wordset, min_word_len, max_word_len, _showParagraph)
    elif level == 'word':
        units = list(enumerate(getWordList(text)))
        _scanInitials(units, wordset, min_word_len, max_word_len, _showWord)


def _showParagraph(par):
    # Print the acrostic letter set apart from an abbreviated rest of the paragraph.
    print(par[0], par[1:100] + '...', end='\n')


def _showWord(word):
    # Print the word with its initial capitalised, keeping all words on one line.
    print(word[0].upper() + word[1:], end=' ')


def _scanInitials(units, wordset, min_word_len, max_word_len, show):
    # Slide a buffer of initials over ``units`` ((index, text) pairs) and
    # report every buffer prefix found in ``wordset``, calling ``show`` on
    # each unit of a hit.
    charbuf = ''
    members = []  # the (index, text) pairs behind charbuf, one per character
    shortest = max(min_word_len, 1)  # a zero-length "word" has no units to show
    for idx, unit in units:
        # lower() can expand one character into several; keep exactly one so
        # buffer positions stay aligned with ``members``.
        charbuf += unit[0].lower()[:1]
        members.append((idx, unit))

        # check the first j characters of the buffer, longest candidate first
        for j in range(min(len(charbuf), max_word_len), shortest - 1, -1):
            if charbuf[:j] in wordset:
                print(charbuf[:j].upper(), members[0][0], '-', members[j - 1][0])
                for _, found in members[:j]:
                    show(found)
                print('\n')
                # matches are non-overlapping: consume the initials just used
                charbuf = charbuf[j:]
                members = members[j:]
                break
        # allow the buffer to overflow by dropping its oldest initial
        if len(charbuf) > max_word_len:
            charbuf = charbuf[1:]
            members = members[1:]

def gutenbergAcrostic(url, wordset):
    """Search one text for acrostics at paragraph level and at word level.

    Args:
        url: where the text comes from. A value starting with ``http://`` or
            ``https://`` is downloaded with ``getPlaintext``; anything else
            (absolute or relative path) is read as a local file with
            ``getFileText``.
        wordset: container of dictionary words passed on to ``findAcrostic``.

    Returns:
        None. Results are printed: paragraph acrostics of at least 4 letters,
        then word acrostics of at least 7 letters.
    """
    # Decide by URL scheme rather than by a leading '/', so relative and
    # Windows paths are treated as files instead of being sent to requests.
    if url.lstrip().lower().startswith(('http://', 'https://')):
        text = getPlaintext(url)
    else:
        text = getFileText(url)
    print('\t\t  paragraphs'.upper())
    findAcrostic(text, wordset, level='paragraph', min_word_len=4)
    print('\t\t  words'.upper())
    findAcrostic(text, wordset, level='word', min_word_len=7)

if __name__ == "__main__":
    # Sample texts to scan when the file is run as a script, in display order.
    books = (
        ('A Christmas Carol', 'http://www.gutenberg.org/cache/epub/46/pg46.txt'),
        ('Pride and Prejudice', 'http://www.gutenberg.org/files/1342/1342-0.txt'),
        ("Alice's Adventures in Wonderland", 'http://www.gutenberg.org/files/11/11-0.txt'),
    )
    # The dictionary is loaded once and shared by every scan.
    wordset = loadWordSet()
    for title, link in books:
        print('\n\t\t' + title.upper())
        gutenbergAcrostic(link, wordset)
	def loadWordSet():
		"""Load the dictionary words used to recognise acrostics.

		The system word list ``/usr/share/dict/american-english`` is tried
		first. If it cannot be read, ``words_alpha.txt`` in the current working
		directory (from https://github.com/dwyl/english-words) is tried instead.

		Returns:
			set: the whitespace-stripped lines that were read. When neither
			file is readable a hint is printed and the set is returned as-is.
		"""
		s = set()
		try:
			with open('/usr/share/dict/american-english', 'r') as f:
				s.update(line.strip() for line in f)
		except (OSError, UnicodeDecodeError):
			# Only I/O and decoding failures mean "dictionary unavailable"; a
			# bare except would also hide KeyboardInterrupt and real bugs.
			print("Could not find built-in dictionary")
			# rely on https://github.com/dwyl/english-words.git
			try:
				with open('words_alpha.txt') as f:
					s.update(line.strip() for line in f)
			except (OSError, UnicodeDecodeError):
				print("You'll need to download the file 'words_alpha.txt' from https://github.com/dwyl/english-words into this folder")
		return s

	def getPlaintext(url, timeout=30):
		"""Download ``url`` with an HTTP GET and return the body as text.

		Args:
			url: address to fetch.
			timeout: seconds to wait for the server before giving up; handed
				straight to ``requests.get``.

		Returns:
			str: the decoded response body (``Response.text``).

		Raises:
			requests.exceptions.RequestException: on connection failure, on
				timeout, or when the server answers with an HTTP error status.
		"""
		# Local import: requests is only needed when text is fetched remotely.
		import requests
		# Without a timeout requests may block indefinitely on a stalled server.
		r = requests.get(url, timeout=timeout)
		# Fail on 4xx/5xx instead of returning an error page as if it were the text.
		r.raise_for_status()
		return r.text

	def getFileText(fname):
		"""Return the entire contents of the text file at ``fname`` as one string."""
		with open(fname) as source:
			contents = source.read()
		return contents

	def getParagraphList(text):
		"""Split ``text`` into paragraphs separated by blank lines.

		A line shorter than two characters (empty, or a lone carriage return
		left over from CRLF line endings) closes the current paragraph. Every
		other line is stripped and appended to the paragraph followed by one
		space, so each returned paragraph ends with a trailing space.

		The text is not cleaned otherwise: the assumption that paragraphs start
		with letters seems to hold up well on real-world input.

		Args:
			text: the full text as a single string.

		Returns:
			list of str: the paragraphs in order of appearance.
		"""
		result = []
		parts = []
		for line in text.split('\n'):
			if len(line) < 2:
				if parts:
					result.append(' '.join(parts) + ' ')
					parts = []
				continue
			parts.append(line.strip())
		# Flush the final paragraph: text that does not end with a blank line
		# would otherwise lose its last paragraph.
		if parts:
			result.append(' '.join(parts) + ' ')
		return result

	def getWordList(text):
		"""Return the words of ``text`` with digits and ASCII punctuation removed.

		The text is split into paragraphs with ``getParagraphList``; the listed
		characters are deleted from each paragraph (not replaced by a space)
		and the remainder is split on single spaces. Empty pieces are discarded.

		Args:
			text: the full text as a single string.

		Returns:
			list of str: the cleaned words in reading order.
		"""
		# The backslash is written as an explicit ``\\``; the former ``\|`` and
		# ``\;`` were invalid escape sequences that newer Python versions warn
		# about. The resulting character set is unchanged.
		strip_table = str.maketrans(
			'', '',
			'\'"1234567890-=!@#$%^&*()_+{}[]\\|;:,.<>/?~`')
		result = []
		for par in getParagraphList(text):
			# One translate pass deletes every unwanted character at once.
			cleaned = par.translate(strip_table)
			result.extend(word for word in cleaned.split(' ') if word)
		return result


	def findAcrostic(text, wordset, level='paragraph', min_word_len=4, max_word_len=20):
		"""Print dictionary words spelled by the initials of consecutive units.

		Args:
			text: the full text as a single string.
			wordset: container of lower-case words; membership is tested
				with ``in``.
			level: ``'paragraph'`` to use the first character of each paragraph
				(paragraphs shorter than two characters are ignored), ``'word'``
				to use the first character of each cleaned word. Any other
				value makes the function do nothing.
			min_word_len: shortest acrostic to report.
			max_word_len: longest acrostic to report; also the buffer capacity.

		Returns:
			None. For every hit the word is printed in upper case followed by
			the indices of the first and last unit that spell it, then the
			units themselves (paragraphs abbreviated to 100 characters).
		"""
		if level == 'paragraph':
			paragraphs = getParagraphList(text)
			# Keep the original paragraph index next to each usable paragraph
			# so skipped ones cannot shift the reported positions.
			units = [(i, par) for i, par in enumerate(paragraphs) if len(par) >= 2]
			_scanInitials(units, wordset, min_word_len, max_word_len, _showParagraph)
		elif level == 'word':
			units = list(enumerate(getWordList(text)))
			_scanInitials(units, wordset, min_word_len, max_word_len, _showWord)


	def _showParagraph(par):
		# Print the acrostic letter set apart from an abbreviated rest of the paragraph.
		print(par[0], par[1:100] + '...', end='\n')


	def _showWord(word):
		# Print the word with its initial capitalised, keeping all words on one line.
		print(word[0].upper() + word[1:], end=' ')


	def _scanInitials(units, wordset, min_word_len, max_word_len, show):
		# Slide a buffer of initials over ``units`` ((index, text) pairs) and
		# report every buffer prefix found in ``wordset``, calling ``show`` on
		# each unit of a hit.
		charbuf = ''
		members = []  # the (index, text) pairs behind charbuf, one per character
		shortest = max(min_word_len, 1)  # a zero-length "word" has no units to show
		for idx, unit in units:
			# lower() can expand one character into several; keep exactly one
			# so buffer positions stay aligned with ``members``.
			charbuf += unit[0].lower()[:1]
			members.append((idx, unit))

			# check the first j characters of the buffer, longest candidate first
			for j in range(min(len(charbuf), max_word_len), shortest - 1, -1):
				if charbuf[:j] in wordset:
					print(charbuf[:j].upper(), members[0][0], '-', members[j - 1][0])
					for _, found in members[:j]:
						show(found)
					print('\n')
					# matches are non-overlapping: consume the initials just used
					charbuf = charbuf[j:]
					members = members[j:]
					break
			# allow the buffer to overflow by dropping its oldest initial
			if len(charbuf) > max_word_len:
				charbuf = charbuf[1:]
				members = members[1:]

	def gutenbergAcrostic(url, wordset):
		"""Search one text for acrostics at paragraph level and at word level.

		Args:
			url: where the text comes from. A value starting with ``http://``
				or ``https://`` is downloaded with ``getPlaintext``; anything
				else (absolute or relative path) is read as a local file with
				``getFileText``.
			wordset: container of dictionary words passed on to ``findAcrostic``.

		Returns:
			None. Results are printed: paragraph acrostics of at least 4
			letters, then word acrostics of at least 7 letters.
		"""
		# Decide by URL scheme rather than by a leading '/', so relative and
		# Windows paths are treated as files instead of being sent to requests.
		if url.lstrip().lower().startswith(('http://', 'https://')):
			text = getPlaintext(url)
		else:
			text = getFileText(url)
		print('\t\t paragraphs'.upper())
		findAcrostic(text, wordset, level='paragraph', min_word_len=4)
		print('\t\t words'.upper())
		findAcrostic(text, wordset, level='word', min_word_len=7)

	if __name__ == "__main__":
		# Sample texts to scan when the file is run as a script, in display order.
		books = (
			('A Christmas Carol', 'http://www.gutenberg.org/cache/epub/46/pg46.txt'),
			('Pride and Prejudice', 'http://www.gutenberg.org/files/1342/1342-0.txt'),
			("Alice's Adventures in Wonderland", 'http://www.gutenberg.org/files/11/11-0.txt'),
		)
		# The dictionary is loaded once and shared by every scan.
		wordset = loadWordSet()
		for title, link in books:
			print('\n\t\t' + title.upper())
			gutenbergAcrostic(link, wordset)