Skip to content

Instantly share code, notes, and snippets.

@hdon
Last active April 30, 2017 04:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hdon/137585c2ac59cda986d9426f8ba1aebb to your computer and use it in GitHub Desktop.
Save hdon/137585c2ac59cda986d9426f8ba1aebb to your computer and use it in GitHub Desktop.
seth's thing
#import bleach # learn more: https://python.org/pypi/bleach
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss
import urllib as ul
import urllib.request
import re
#TODO: spider to crawl for urls
earl = 'http://www.premier-mountain-properties.net/houses-with-water-rights-almont-co/'
pearl = urllib.parse.urlparse(earl)
# Keywords come from the URL path segments (e.g. "houses-with-water-rights-almont-co"
# -> ["houses", "with", "water", ...]); raw string avoids the invalid-escape warning.
url_path_terms = re.findall(r'\w+', pearl.path)

with ul.request.urlopen(earl) as response:
    html = response.read()

# Parse only <article> elements to keep the soup small.
articles = ss('article')
soup = bs(html, "lxml", parse_only=articles)
article = soup.find_all('article')

# Extract visible text with BeautifulSoup instead of bleach: the `import bleach`
# line is commented out above, so the original bleach.clean() call raised NameError.
thing = ' '.join(a.get_text(separator=' ') for a in article)
thing = re.sub(r'\W+', ' ', thing)
# Break CamelCase runs apart ("WaterRights" -> " Water Rights") before counting.
thing = re.sub(r"([A-Z])", r" \1", thing)
text = thing
text = text.lower()
text = text.split()

# Stop words (plus stray tag names like "li"/"ul"/"p") excluded from the word count.
cleanlist = ["a", "an", "the", "for", "of", "it", "but", "nor", "so", "and", "but", "or", "yet", "is", "to", "at", "i", "if", "as", "in", "by", "on", "li", "ul", "p"]
# `text` entries are already lowercased above, so no per-word .lower() is needed.
cleantext = [word for word in text if word not in cleanlist]

# Count how often each URL keyword appears in the cleaned page text.
totalkts = 0
for term in url_path_terms:
    kts = cleantext.count(term)
    print(term + ": " + str(kts))
    totalkts = totalkts + kts

wordcount = len(cleantext)
print("Total words: " + str(wordcount))
# Guard the division: a page with no countable words would raise ZeroDivisionError.
dense = (totalkts / wordcount) * 100 if wordcount else 0.0
print("Keyword Density: " + str(round(dense, 2)) + "%")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment