phpbb-external-statistics
# Crawl a list of phpBB subforum URLs, collect the date of every post in
# every topic, and print post counts per year as tab-separated values.
import sys
reload(sys)
# Force UTF-8 as the default string encoding (Python 2 workaround for
# non-ASCII forum content).
sys.setdefaultencoding('utf-8')
import urllib2
from retry import retry

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

if len(sys.argv) < 2:
    print 'Usage:', sys.argv[0], 'file_with_subforums_urls [file_to_output_raw_data_to]'
    sys.exit()

topics = []
posts = []
stats = {}

# Read the list of subforum URLs, one per line.
with open(sys.argv[1]) as f:
    forums = f.readlines()
forums = [x.strip() for x in forums]
print 'Got', len(forums), 'forums to process'


# Retry transient network errors with exponential backoff.
@retry(urllib2.URLError, tries=3, delay=2, backoff=2)
def urlopen_with_retry(url):
    return urllib2.urlopen(url)


# Pass 1: walk every subforum, following the "Next" pagination links,
# and collect the URLs of all topics found along the way.
for forum in forums:
    next_page = forum
    print "Processing", forum
    while next_page:
        try:
            response = urlopen_with_retry(next_page)
            page = response.read()
            parsed_page = BeautifulSoup(page, 'lxml')
            next_page = ''
            for link in parsed_page.body.find_all('a'):
                if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td' and link.parent.parent.get('class')[0] == 'gensmall':
                    next_page = forum.split('?')[0] + '?' + link['href'].split('?')[1]
            for link in parsed_page.body.find_all('a', class_="topictitle"):
                if link.has_attr('href') and ('viewtopic.php' in link['href']) and link.parent.name == 'td' and link.parent.get('class')[0] == 'row1':
                    topics.append(forum.split('?')[0].replace('viewforum', 'viewtopic') + '?' + link['href'].split('?')[1])
        except:
            print "Skipped due to error in", forum
            next_page = ''

print 'Got', len(topics), 'topics to process'

# Pass 2: walk every topic, again following pagination, and collect the
# "Posted: ..." timestamp of every post.
for topic in topics:
    next_page = topic
    print "Processing", topic
    while next_page:
        try:
            response = urlopen_with_retry(next_page)
            page = response.read()
            parsed_page = BeautifulSoup(page, 'lxml')
            next_page = ''
            for link in parsed_page.body.find_all('a'):
                if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td' and link.parent.parent.get('class')[0] == 'gensmall':
                    next_page = topic.split('?')[0] + '?' + link['href'].split('?')[1]
            for td in parsed_page.body.find_all('td', class_="gensmall"):
                for child in td.children:
                    if child.name == 'div' and child.has_attr('style') and child['style'] == 'float: right;':
                        posts.append(child.text.replace('Posted: ', ''))
        except:
            print "Skipped due to error in", topic
            next_page = ''

print 'Got', len(posts), 'posts'

# Optionally dump the raw post dates to a file for later processing.
if len(sys.argv) == 3:
    output = open(sys.argv[2], 'w')
    for post in posts:
        output.write("%s\n" % post)

# Group the post dates by year (the fourth field of a phpBB date such as
# "Tue Mar 14, 2017 9:26 pm") and print the counts as tab-separated values.
for post in posts:
    post = post.split(' ')
    if not stats.has_key(post[3]):
        stats[post[3]] = 0
    stats[post[3]] += 1

print "Results as tsv:"
for key in sorted(stats.iterkeys()):
    print "%s\t%s" % (key, stats[key])
Example file_with_subforums_urls:
http://www.example.com/viewforum.php?f=1
http://www.example.com/viewforum.php?f=2
http://www.example.com/viewforum.php?f=3
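Assuming the script is saved as phpbb-external-statistics.py and the URL list above as forums.txt (both names are just illustrative), a typical run under Python 2 looks like this; the second argument is optional and receives the raw post dates:

python phpbb-external-statistics.py forums.txt raw_dates.txt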
The raw dates written to the optional output file can also be parsed separately, for example to group the posts by month instead of by year, as sketched below.
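A minimal sketch of such a post-processing step, not part of the script above. It assumes the raw dates were saved to a file and follow the default phpBB date style ("Tue Mar 14, 2017 9:26 pm"), i.e. month name in field 1 and year in field 3, which is the same layout the script's per-year grouping already relies on:

import sys
from collections import defaultdict
from datetime import datetime

# Count posts per month from the raw dates file written by the main script.
# Keys are formatted as "YYYY-MM" so that alphabetical order is also
# chronological order.
stats = defaultdict(int)
with open(sys.argv[1]) as f:
    for line in f:
        fields = line.strip().split(' ')
        if len(fields) > 3:
            month = datetime.strptime(fields[1] + ' ' + fields[3], '%b %Y')
            stats[month.strftime('%Y-%m')] += 1

for key in sorted(stats):
    print("%s\t%s" % (key, stats[key]))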