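"""Scrape a WordPress blog and report post, word, and reference counts.

Quick-and-dirty script: it walks the blog's listing pages, opens every post,
and counts the number of posts, the total number of words, and the number of
researchblogging.org references. The HTML tags and class attributes it looks
for depend on the blogging platform and theme in use (this was written
against a WordPress blog), so expect to adjust them for other sites.
"""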
import re
import time

import mechanize
from BeautifulSoup import BeautifulSoup
# Home page of the blog to scrape; older listing pages live under /page/<n>.
BLOG_URL = "http://www.bloghomepage.com"

# Seconds to pause after each post. Raise it if you don't want to hit the
# server too fast; 0 disables the pause.
REQUEST_DELAY_SECONDS = 0

# First absolute http:// or https:// URL inside a chunk of rendered markup.
POST_URL_RE = re.compile(r'(?P<url>https?://[^>"\s]+)')
WORD_RE = re.compile(r'\w+')


def extract_post_url(markup):
    """Return the first absolute http(s) URL in *markup*, or None.

    *markup* is the string form of the anchor tags found in a post title.
    None is returned when the markup carries no ``href`` at all or when the
    href is not an absolute http/https URL (for example a relative link).
    """
    if "href" not in markup:
        return None
    match = POST_URL_RE.search(markup)
    if match is None:
        return None
    return match.group("url")


def count_words(text):
    r"""Return the number of ``\w+`` tokens in *text*.

    Non-ASCII characters are dropped before tokenising, so accented letters
    vanish from words rather than splitting them.
    """
    ascii_text = text.encode("ascii", "ignore").decode("ascii")
    return len(WORD_RE.findall(ascii_text.lower()))


def main():
    """Walk the blog's listing pages and print running totals per post.

    For every post this prints, in order: the post URL, its word count, the
    running word total, its reference count, the running reference total and
    the running post count. Each newly discovered listing page number is
    printed as well.

    Returns:
        A ``(postcount, totalwordcount, totalrefcount)`` tuple.
    """
    browser = mechanize.Browser()
    page = browser.open(BLOG_URL)
    pagenum = 1
    postcount = 0
    totalwordcount = 0
    totalrefcount = 0

    while True:
        listing = BeautifulSoup(page)
        # attributes probably need to be changed depending on the wordpress
        # theme used
        next_link = listing.find("span", {"class": "next"})
        has_older_page = "Older" in str(next_link)
        if has_older_page:
            pagenum += 1
            print(pagenum)

        for heading in listing.findAll("h1", {"class": "title"}):
            post_url = extract_post_url(str(heading.findAll("a")))
            if post_url is None:
                print("not a link")
                continue
            print(post_url)

            post = BeautifulSoup(browser.open(post_url).read())
            content = post.find("div", {"class": "content entry-content"})
            # A post without the expected content div contributes no words.
            if content is None:
                text = ""
            else:
                text = "".join(content.findAll(text=True))

            wordcount = count_words(text)
            print(wordcount)
            totalwordcount += wordcount
            print(totalwordcount)

            # Each reference is counted as one <span class="Z3988"> element
            # anywhere on the post page.
            refcount = len(post.findAll("span", {"class": "Z3988"}))
            print(refcount)
            totalrefcount += refcount
            print(totalrefcount)

            postcount += 1
            print(postcount)

            if REQUEST_DELAY_SECONDS:
                time.sleep(REQUEST_DELAY_SECONDS)

        # Stop once there is no "Older" link instead of re-fetching this page.
        if not has_older_page:
            break
        page = browser.open(BLOG_URL + "/page/" + str(pagenum))

    return postcount, totalwordcount, totalrefcount


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment