Skip to content

Instantly share code, notes, and snippets.

@apoorvalal
Last active May 5, 2022 19:57
Show Gist options
  • Save apoorvalal/e67ab79491835de8af2fd1286154ae7b to your computer and use it in GitHub Desktop.
Save apoorvalal/e67ab79491835de8af2fd1286154ae7b to your computer and use it in GitHub Desktop.
Example of text parsing using the Gettysburg Address.
# %%
from bs4 import BeautifulSoup
from urllib import request
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
# One-time setup: uncomment and run the two lines below once to download the NLTK stopword list
# import nltk
# nltk.download("stopwords")
# %% scrape
# Download the page; the context manager closes the HTTP response once the
# body has been read.
with request.urlopen('http://avalon.law.yale.edu/19th_century/gettyb.asp') as response:
    html = response.read()
# Name the parser explicitly: a bare BeautifulSoup(markup) call picks whichever
# parser happens to be installed (and warns about it), so results could differ
# between machines.
soup = BeautifulSoup(html, 'html.parser')
# %% bag o'words
# first child node of the first <p> element on the page
# NOTE(review): assumes that node holds the text to analyse - confirm against the page markup
text = soup.p.contents[0]
# drop punctuation (anything that is neither a word character nor whitespace)
text = re.sub(r'[^\w\s]', '', text)
# lowercase BEFORE filtering, then make list by splitting on whitespace
textlist = text.lower().split()
# %% drop stopwords
# The NLTK English stopword list is lowercase, so tokens must already be
# lowercased or capitalised stopwords ("The", "We", ...) slip through.
# Build the set once instead of reloading the list for every token.
stop_words = set(stopwords.words('english'))
textlist = [w for w in textlist if w not in stop_words]
# %% stem (Porter stemming, not lemmatization)
ps = PorterStemmer()
textlist = [ps.stem(w) for w in textlist]
# %% count words
word_counts = Counter(textlist)
# bare expression so the ten most frequent stems are displayed when the cell is run interactively
word_counts.most_common(10)
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment