# drscotthawley/text_shortener.py

## text_shortener.py
#!/usr/bin/env python

# Replaces lengthy words/phrases with shorter variants

# Author: Scott Hawley

import pandas as pd
import re
import os

def parse_garbl(df=None):
    """Scrape Gary B. Larson's word-shortening lists from garbl.info.

    Reads the first HTML table on each of the three style-manual pages
    (words, phrases, redundant) and keeps its first two columns. Every cell
    is reduced to a single option: parenthetical remarks are removed,
    everything from the first comma onwards is dropped, and
    "either x or y" becomes "x". Currently only the first of every possible
    re-mapping option is kept.

    Args:
        df: Optional DataFrame to extend; the scraped rows are placed after
            its rows. Defaults to None (start from nothing).

    Returns:
        pandas.DataFrame: the rows of ``df`` (if given) followed by the
        cleaned rows of every page. Row labels are kept as scraped, so they
        may repeat.
    """
    print("Scraping word-shortening lists by Gary B. Larson")
    urls = [
        r'https://garbl.info/stylemanual/words.htm',
        r'https://garbl.info/stylemanual/phrases.htm',
        r'https://garbl.info/stylemanual/redundant.htm',
    ]
    # (pattern, replacement) pairs, applied in this order to both columns.
    cleanups = [
        (r"\s*\(.*\)\s*", ""),          # strip parenthetical anything
        (r",.*", ""),                   # use only the first option, for now
        (r"either (.*) or .*", "\\1"),  # grab x from 'either x or y'
    ]
    frames = [] if df is None else [df]
    for url in urls:
        tables = pd.read_html(url)
        # 1st two columns; skip the first two rows and the last row.
        # Copy so the clean-up never writes into a view of the parsed table.
        page = tables[0].iloc[2:-1, 0:2].copy()
        for col in (0, 1):
            cleaned = page.iloc[:, col]
            for pattern, repl in cleanups:
                # regex=True is required: since pandas 2.0 the pattern is
                # treated as a literal string by default.
                cleaned = cleaned.str.replace(pattern, repl, regex=True)
            page.iloc[:, col] = cleaned
        # Drop the "Back to top" navigation rows. na=True flags rows whose
        # first cell is missing as well, so those are dropped too.
        is_noise = page.iloc[:, 0].str.contains("Back to top", regex=False, na=True)
        frames.append(page[~is_noise])
    # DataFrame.append was removed in pandas 2.0; concat replaces it.
    return pd.concat(frames)

def parse_brockway():
    """Scrape Laura Hale Brockway's phrase-replacement list from PR Daily.

    Takes the first two columns of the second HTML table on the article
    page and, in every cell, drops everything from the first comma onwards
    so that only the first option remains.

    Returns:
        pandas.DataFrame: the two cleaned columns, with the labels
        ``pd.read_html`` gave them.
    """
    print("Scraping Laura Hale Brockway's list from PR Daily")
    url = 'http://m.prdaily.com/Main/Articles/20_phrases_you_can_replace_with_one_word__11285.aspx'
    tables = pd.read_html(url)
    # Copy so the assignments below never write into a view of the table.
    df = tables[1].iloc[:, 0:2].copy()
    for col in (0, 1):
        # strip anything after a comma (use only the first option, for now).
        # regex=True is required: since pandas 2.0 the pattern is treated as
        # a literal string by default.
        df.iloc[:, col] = df.iloc[:, col].str.replace(r",.*", "", regex=True)
    return df


def apply_dict(text, df):
    """Apply every shortening in ``df`` to ``text`` and return the result.

    This is where the mapping happens. Rows are applied in order, each one
    working on the output of the previous one. A row is used only when its
    'from' entry is strictly longer than its 'to' entry. Matching is
    case-sensitive and restricted to whole words/phrases, so "assist" is not
    rewritten inside "assistant". Every replacement made is reported on
    stdout. Currently it makes replacements without asking for the user's
    consent.

    Args:
        text: The string to shorten.
        df: DataFrame with string columns 'from' and 'to'.

    Returns:
        str: ``text`` with all applicable replacements made.
    """
    for source, target in zip(df['from'], df['to']):
        if len(source) <= len(target):
            continue  # the "shortening" would not actually save anything
        # Lookarounds instead of \b so that phrases which begin or end with
        # punctuation can still match.
        pattern = re.compile(r"(?<!\w)" + re.escape(source) + r"(?!\w)")
        if pattern.search(text):
            print("Replacing", source, '-->', target)
            # A callable replacement keeps any backslash in `target` literal.
            text = pattern.sub(lambda _match, repl=target: repl, text)
    return text


# Build the 'dictionary' of translations (as a Pandas Dataframe).
# Both sources are labelled "from"/"to" *before* stacking so that differing
# column labels cannot spread the rows over more than two columns.
# (DataFrame.append was removed in pandas 2.0; pd.concat replaces it.)
df = pd.concat(
    [source.set_axis(["from", "to"], axis=1)
     for source in (parse_garbl(), parse_brockway())],
    ignore_index=True,
)
# A missing cell would become the literal string "nan" in the conversion
# below and could then be written into the text, so drop such rows.
df = df.dropna().reset_index(drop=True)
df = df.astype(str)                 # just in case
print(df)

# specify some input text, either as a file or grab text online
filename = 'my_essay.txt'
if os.path.isfile(filename):
    print("Reading from",filename)
    with open(filename, 'r', encoding='utf-8') as infile:
        old_text = infile.read()
else:
    # Grab some text from online, e.g. "Collected works of William Hazlitt"
    import requests
    url = 'https://www.gutenberg.org/files/55932/55932-0.txt'
    print("Grabbing some text from",url)
    r = requests.get(url, timeout=60)
    # Fail loudly instead of "shortening" an HTTP error page.
    r.raise_for_status()
    old_text = r.text


# Now apply the shortening
new_text = apply_dict(old_text, df)
print("\n")
print("Before processing, text length =",len(old_text),"characters")
print("After processing, text length =",len(new_text),"characters")

# Save to new text file. An explicit encoding keeps non-ASCII characters from
# failing on platforms whose default encoding cannot represent them.
with open("my_essay_out.txt", "w", encoding='utf-8') as outfile:
    outfile.write(new_text)
# EOF
	#!/usr/bin/env python

	# Replaces lengthy words/phrases with shorter variants

	# Author: Scott Hawley

	import pandas as pd
	import re
	import os

	def parse_garbl(df=None):
	    """Scrape Gary B. Larson's word-shortening lists from garbl.info.

	    Reads the first HTML table on each of the three style-manual pages
	    (words, phrases, redundant) and keeps its first two columns. Every
	    cell is reduced to a single option: parenthetical remarks are removed,
	    everything from the first comma onwards is dropped, and
	    "either x or y" becomes "x". Currently only the first of every
	    possible re-mapping option is kept.

	    Args:
	        df: Optional DataFrame to extend; the scraped rows are placed
	            after its rows. Defaults to None (start from nothing).

	    Returns:
	        pandas.DataFrame: the rows of ``df`` (if given) followed by the
	        cleaned rows of every page. Row labels are kept as scraped, so
	        they may repeat.
	    """
	    print("Scraping word-shortening lists by Gary B. Larson")
	    urls = [
	        r'https://garbl.info/stylemanual/words.htm',
	        r'https://garbl.info/stylemanual/phrases.htm',
	        r'https://garbl.info/stylemanual/redundant.htm',
	    ]
	    # (pattern, replacement) pairs, applied in this order to both columns.
	    cleanups = [
	        (r"\s*\(.*\)\s*", ""),          # strip parenthetical anything
	        (r",.*", ""),                   # use only the first option, for now
	        (r"either (.*) or .*", "\\1"),  # grab x from 'either x or y'
	    ]
	    frames = [] if df is None else [df]
	    for url in urls:
	        tables = pd.read_html(url)
	        # 1st two columns; skip the first two rows and the last row.
	        # Copy so the clean-up never writes into a view of the parsed table.
	        page = tables[0].iloc[2:-1, 0:2].copy()
	        for col in (0, 1):
	            cleaned = page.iloc[:, col]
	            for pattern, repl in cleanups:
	                # regex=True is required: since pandas 2.0 the pattern is
	                # treated as a literal string by default.
	                cleaned = cleaned.str.replace(pattern, repl, regex=True)
	            page.iloc[:, col] = cleaned
	        # Drop the "Back to top" navigation rows. na=True flags rows whose
	        # first cell is missing as well, so those are dropped too.
	        is_noise = page.iloc[:, 0].str.contains("Back to top", regex=False, na=True)
	        frames.append(page[~is_noise])
	    # DataFrame.append was removed in pandas 2.0; concat replaces it.
	    return pd.concat(frames)

	def parse_brockway():
	    """Scrape Laura Hale Brockway's phrase-replacement list from PR Daily.

	    Takes the first two columns of the second HTML table on the article
	    page and, in every cell, drops everything from the first comma onwards
	    so that only the first option remains.

	    Returns:
	        pandas.DataFrame: the two cleaned columns, with the labels
	        ``pd.read_html`` gave them.
	    """
	    print("Scraping Laura Hale Brockway's list from PR Daily")
	    url = 'http://m.prdaily.com/Main/Articles/20_phrases_you_can_replace_with_one_word__11285.aspx'
	    tables = pd.read_html(url)
	    # Copy so the assignments below never write into a view of the table.
	    df = tables[1].iloc[:, 0:2].copy()
	    for col in (0, 1):
	        # strip anything after a comma (use only the first option, for now).
	        # regex=True is required: since pandas 2.0 the pattern is treated
	        # as a literal string by default.
	        df.iloc[:, col] = df.iloc[:, col].str.replace(r",.*", "", regex=True)
	    return df


	def apply_dict(text, df):
	    """Apply every shortening in ``df`` to ``text`` and return the result.

	    This is where the mapping happens. Rows are applied in order, each one
	    working on the output of the previous one. A row is used only when its
	    'from' entry is strictly longer than its 'to' entry. Matching is
	    case-sensitive and restricted to whole words/phrases, so "assist" is
	    not rewritten inside "assistant". Every replacement made is reported
	    on stdout. Currently it makes replacements without asking for the
	    user's consent.

	    Args:
	        text: The string to shorten.
	        df: DataFrame with string columns 'from' and 'to'.

	    Returns:
	        str: ``text`` with all applicable replacements made.
	    """
	    for source, target in zip(df['from'], df['to']):
	        if len(source) <= len(target):
	            continue  # the "shortening" would not actually save anything
	        # Lookarounds instead of \b so that phrases which begin or end
	        # with punctuation can still match.
	        pattern = re.compile(r"(?<!\w)" + re.escape(source) + r"(?!\w)")
	        if pattern.search(text):
	            print("Replacing", source, '-->', target)
	            # A callable replacement keeps any backslash in `target` literal.
	            text = pattern.sub(lambda _match, repl=target: repl, text)
	    return text


	# Build the 'dictionary' of translations (as a Pandas Dataframe).
	# Both sources are labelled "from"/"to" *before* stacking so that differing
	# column labels cannot spread the rows over more than two columns.
	# (DataFrame.append was removed in pandas 2.0; pd.concat replaces it.)
	df = pd.concat(
	    [source.set_axis(["from", "to"], axis=1)
	     for source in (parse_garbl(), parse_brockway())],
	    ignore_index=True,
	)
	# A missing cell would become the literal string "nan" in the conversion
	# below and could then be written into the text, so drop such rows.
	df = df.dropna().reset_index(drop=True)
	df = df.astype(str)                 # just in case
	print(df)

	# specify some input text, either as a file or grab text online
	filename = 'my_essay.txt'
	if os.path.isfile(filename):
	    print("Reading from",filename)
	    with open(filename, 'r', encoding='utf-8') as infile:
	        old_text = infile.read()
	else:
	    # Grab some text from online, e.g. "Collected works of William Hazlitt"
	    import requests
	    url = 'https://www.gutenberg.org/files/55932/55932-0.txt'
	    print("Grabbing some text from",url)
	    r = requests.get(url, timeout=60)
	    # Fail loudly instead of "shortening" an HTTP error page.
	    r.raise_for_status()
	    old_text = r.text


	# Now apply the shortening
	new_text = apply_dict(old_text, df)
	print("\n")
	print("Before processing, text length =",len(old_text),"characters")
	print("After processing, text length =",len(new_text),"characters")

	# Save to new text file. An explicit encoding keeps non-ASCII characters
	# from failing on platforms whose default encoding cannot represent them.
	with open("my_essay_out.txt", "w", encoding='utf-8') as outfile:
	    outfile.write(new_text)
	# EOF