# apoorvalal/simple_text_parse.py

## simple_text_parse.py
# %%
from bs4 import BeautifulSoup
from urllib import request

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter

# run once
# import nltk
# nltk.download("stopwords")

# %% scrape
# Download the raw page bytes. The response is used as a context manager so
# the underlying connection is closed as soon as the body has been read.
with request.urlopen(
    'http://avalon.law.yale.edu/19th_century/gettyb.asp'
) as response:
    url = response.read()  # NOTE: despite the name, this holds the page bytes
# Name the parser explicitly: with no argument bs4 picks whichever parser is
# installed (and warns), so the parse tree could differ between machines.
soup = BeautifulSoup(url, 'html.parser')

# %% bag o'words
# first child node of the first <p> tag in the document
text = soup.p.contents[0]
# drop punctuation (every character that is neither word char nor whitespace)
text = re.sub(r'[^\w\s]', '', text)
# make list by splitting on whitespace
textlist = text.split()

# %% drop stopwords
# Build the stopword set once: evaluating stopwords.words('english') inside
# the comprehension rebuilt the list for every token, and list membership is
# a linear scan.
stop_words = set(stopwords.words('english'))
# Lowercase BEFORE the membership test. The NLTK English stopword list is
# lowercase, so testing the original casing let capitalized stopwords
# ("The", "We", "It", ...) slip through the filter.
textlist = [w for w in (tok.lower() for tok in textlist) if w not in stop_words]

# %% stem
# PorterStemmer performs rule-based suffix stripping (stemming); it is not a
# dictionary-based lemmatizer, so the output tokens may not be real words.
ps = PorterStemmer()
textlist = [ps.stem(w) for w in textlist]

# %% count words
word_counts = Counter(textlist)
# Ten most frequent stems. Left as a bare expression so an interactive cell
# runner can display it; it has no visible effect when run as a plain script.
word_counts.most_common(10)

# %%
	# %%
	from bs4 import BeautifulSoup
	from urllib import request

	import re
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	from nltk.tokenize import word_tokenize
	from collections import Counter

	# run once
	# import nltk
	# nltk.download("stopwords")

	# %% scrape
	# Download the raw page bytes. The response is used as a context manager so
	# the underlying connection is closed as soon as the body has been read.
	with request.urlopen(
		'http://avalon.law.yale.edu/19th_century/gettyb.asp'
	) as response:
		url = response.read()  # NOTE: despite the name, this holds the page bytes
	# Name the parser explicitly: with no argument bs4 picks whichever parser is
	# installed (and warns), so the parse tree could differ between machines.
	soup = BeautifulSoup(url, 'html.parser')

	# %% bag o'words
	# first child node of the first <p> tag in the document
	text = soup.p.contents[0]
	# drop punctuation (every character that is neither word char nor whitespace)
	text = re.sub(r'[^\w\s]', '', text)
	# make list by splitting on whitespace
	textlist = text.split()

	# %% drop stopwords
	# Build the stopword set once: evaluating stopwords.words('english') inside
	# the comprehension rebuilt the list for every token, and list membership is
	# a linear scan.
	stop_words = set(stopwords.words('english'))
	# Lowercase BEFORE the membership test. The NLTK English stopword list is
	# lowercase, so testing the original casing let capitalized stopwords
	# ("The", "We", "It", ...) slip through the filter.
	textlist = [w for w in (tok.lower() for tok in textlist) if w not in stop_words]

	# %% stem
	# PorterStemmer performs rule-based suffix stripping (stemming); it is not a
	# dictionary-based lemmatizer, so the output tokens may not be real words.
	ps = PorterStemmer()
	textlist = [ps.stem(w) for w in textlist]

	# %% count words
	word_counts = Counter(textlist)
	# Ten most frequent stems. Left as a bare expression so an interactive cell
	# runner can display it; it has no visible effect when run as a plain script.
	word_counts.most_common(10)

	# %%