Skip to content

Instantly share code, notes, and snippets.

@hdon
Last active April 30, 2017 04:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hdon/137585c2ac59cda986d9426f8ba1aebb to your computer and use it in GitHub Desktop.
Save hdon/137585c2ac59cda986d9426f8ba1aebb to your computer and use it in GitHub Desktop.
seth's thing
#import bleach # learn more: https://python.org/pypi/bleach
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss
import urllib as ul
import urllib.request
import re
#TODO: spider to crawl for urls
earl = 'http://www.premier-mountain-properties.net/houses-with-water-rights-almont-co/'
pearl = urllib.parse.urlparse(earl)
# Keywords come from the URL path segments (e.g. "houses-with-water-rights-almont-co"
# -> ["houses", "with", "water", ...]); raw string avoids the invalid-escape warning.
url_path_terms = re.findall(r'\w+', pearl.path)

with ul.request.urlopen(earl) as response:
    html = response.read()

# Parse only <article> elements to keep the soup small.
articles = ss('article')
soup = bs(html, "lxml", parse_only=articles)
article = soup.find_all('article')

# Extract visible text with BeautifulSoup instead of bleach: the `import bleach`
# line is commented out above, so the original bleach.clean() call raised NameError.
thing = ' '.join(a.get_text(separator=' ') for a in article)
thing = re.sub(r'\W+', ' ', thing)
# Break CamelCase runs apart ("WaterRights" -> " Water Rights") before counting.
thing = re.sub(r"([A-Z])", r" \1", thing)
text = thing
text = text.lower()
text = text.split()

# Stop words (plus stray tag names like "li"/"ul"/"p") excluded from the word count.
cleanlist = ["a", "an", "the", "for", "of", "it", "but", "nor", "so", "and", "but", "or", "yet", "is", "to", "at", "i", "if", "as", "in", "by", "on", "li", "ul", "p"]
# `text` entries are already lowercased above, so no per-word .lower() is needed.
cleantext = [word for word in text if word not in cleanlist]

# Count how often each URL keyword appears in the cleaned page text.
totalkts = 0
for term in url_path_terms:
    kts = cleantext.count(term)
    print(term + ": " + str(kts))
    totalkts = totalkts + kts

wordcount = len(cleantext)
print("Total words: " + str(wordcount))
# Guard the division: a page with no countable words would raise ZeroDivisionError.
dense = (totalkts / wordcount) * 100 if wordcount else 0.0
print("Keyword Density: " + str(round(dense, 2)) + "%")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment