Created
October 25, 2015 20:44
-
-
Save hartez/108f59c39f539032890f to your computer and use it in GitHub Desktop.
Python script for counting words in emails
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as bsoup | |
import pandas as pd | |
import numpy as np | |
import humanfriendly | |
# Load the exported message bodies; row 0 of the CSV supplies the column names
df = pd.read_csv('../bodytext.csv', header=0)

# Drop mail sent from my own address, then copy the result so the columns
# added to `emails` later are written to a frame independent of `df`
emails = (
    df
    .query('FromEmail != "[my email address]"')
    .copy()
)
def wordCount(row):
    """Return the word count of one email row.

    Reads the row's 'Format' and 'Body' entries: bodies whose format is
    'Html' are counted by htmlWordCount, everything else by textWordCount.
    """
    is_html = row['Format'] == 'Html'
    body = row['Body']
    counter = htmlWordCount if is_html else textWordCount
    return counter(body)
def textWordCount(text):
    """Return the number of whitespace-separated words in ``text``.

    Any value that is not a ``str`` counts as zero words.
    """
    if isinstance(text, str):
        # split() with no separator splits on runs of whitespace and
        # discards empty pieces, so padding never adds phantom words
        return len(text.split())
    return 0
def htmlWordCount(text):
    """Return the number of words in the text content of an HTML body.

    The markup is parsed with the 'html.parser' backend; 'style', 'script',
    'head' and 'title' elements are removed before the text is extracted,
    so their contents do not contribute to the count.

    Any value that is not a ``str`` counts as zero words.
    """
    if not isinstance(text, str):
        return 0

    soup = bsoup(text, 'html.parser')

    # Detach the non-content elements first; a plain loop is used because
    # extract() is called purely for its side effect on the tree.
    for tag in soup(['style', 'script', 'head', 'title']):
        tag.extract()

    # Join the remaining text nodes with a space so that words from
    # adjacent elements do not fuse into one token.
    stripped = soup.get_text(" ", strip=True)
    return textWordCount(stripped)
# Reading speed used to turn word counts into reading time
averageWordsPerMinute = 350

# Word count per message: axis=1 hands wordCount one row at a time
emails['WordCount'] = emails.apply(wordCount, axis=1)

# Minutes needed to read each message at the speed above
emails['MinutesToRead'] = emails['WordCount'].div(averageWordsPerMinute)

# Minutes needed to read every message
totalMinutes = emails['MinutesToRead'].sum()

# Render the total as a human-readable timespan; format_timespan takes
# seconds, hence the conversion from minutes
timeToRead = humanfriendly.format_timespan(totalMinutes * 60)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When I run this code in Python, it shows an error on line 1: "No module named bs4". Please help.