
@deseven
Last active July 28, 2017 22:15
phpbb-external-statistics
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # force utf-8 so non-ASCII forum content doesn't crash printing

import urllib2
from retry import retry

# bs4 is effectively required here: find_all() and the 'lxml' parser below are bs4 API.
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup

if len(sys.argv) < 2:
    print 'Usage:', sys.argv[0], 'file_with_subforums_urls [file_to_output_raw_data_to]'
    sys.exit()

topics = []
posts = []
stats = {}

# Read the list of subforum URLs, one per line.
with open(sys.argv[1]) as f:
    forums = [x.strip() for x in f.readlines()]
print 'Got', len(forums), 'forums to process'

# Retry transient network errors up to 3 times with exponential backoff.
@retry(urllib2.URLError, tries=3, delay=2, backoff=2)
def urlopen_with_retry(url):
    return urllib2.urlopen(url)

# Pass 1: walk every page of every subforum and collect topic URLs.
for forum in forums:
    next_page = forum
    print "Processing", forum
    while next_page:
        try:
            response = urlopen_with_retry(next_page)
            page = response.read()
            parsed_page = BeautifulSoup(page, 'lxml')
            next_page = ''
            # Follow the "Next" pagination link, if any.
            for link in parsed_page.body.find_all('a'):
                if (not next_page and link.text == "Next"
                        and link.parent.name == 'b'
                        and link.parent.parent.name == 'td'
                        and link.parent.parent.get('class')[0] == 'gensmall'):
                    next_page = forum.split('?')[0] + '?' + link['href'].split('?')[1]
            # Collect topic links from the listing rows.
            for link in parsed_page.body.find_all('a', class_="topictitle"):
                if (link.has_attr('href') and 'viewtopic.php' in link['href']
                        and link.parent.name == 'td'
                        and link.parent.get('class')[0] == 'row1'):
                    topics.append(forum.split('?')[0].replace('viewforum', 'viewtopic')
                                  + '?' + link['href'].split('?')[1])
        except:
            print "Skipped due to error in", forum
            next_page = ''
print 'Got', len(topics), 'topics to process'

# Pass 2: walk every page of every topic and collect post dates.
for topic in topics:
    next_page = topic
    print "Processing", topic
    while next_page:
        try:
            response = urlopen_with_retry(next_page)
            page = response.read()
            parsed_page = BeautifulSoup(page, 'lxml')
            next_page = ''
            for link in parsed_page.body.find_all('a'):
                if (not next_page and link.text == "Next"
                        and link.parent.name == 'b'
                        and link.parent.parent.name == 'td'
                        and link.parent.parent.get('class')[0] == 'gensmall'):
                    next_page = topic.split('?')[0] + '?' + link['href'].split('?')[1]
            # Each post header holds a right-floated "Posted: <date>" div.
            for td in parsed_page.body.find_all('td', class_="gensmall"):
                for child in td.children:
                    if child.name == 'div' and child.has_attr('style') and child['style'] == 'float: right;':
                        posts.append(child.text.replace('Posted: ', ''))
        except:
            print "Skipped due to error in", topic
            next_page = ''
print 'Got', len(posts), 'posts'

# Optionally dump the raw dates, one per line, for further processing.
if len(sys.argv) == 3:
    with open(sys.argv[2], 'w') as output:
        for post in posts:
            output.write("%s\n" % post)

# Group posts by year (field 4 of e.g. "Fri Jul 28, 2017 10:15 pm").
for post in posts:
    post = post.split(' ')
    if post[3] not in stats:
        stats[post[3]] = 0
    stats[post[3]] += 1

print "Results as tsv:"
for key in sorted(stats.iterkeys()):
    print "%s\t%s" % (key, stats[key])
deseven commented Jul 28, 2017

The raw data written to the optional output file can also be parsed on its own, for example to group posts by month instead of by year.
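
A minimal sketch of such month grouping, assuming the raw file holds one date per line in the "Fri Jul 28, 2017 10:15 pm" form the script writes; raw.txt is only a placeholder name:

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
stats = {}
with open('raw.txt') as f:  # placeholder name for the raw data file
    for line in f:
        parts = line.split()  # e.g. ['Fri', 'Jul', '28,', '2017', '10:15', 'pm']
        if len(parts) < 4 or parts[1] not in months:
            continue  # skip lines that don't look like dates
        key = '%s-%02d' % (parts[3], months.index(parts[1]) + 1)  # e.g. '2017-07'
        stats[key] = stats.get(key, 0) + 1

for key in sorted(stats):
    print "%s\t%s" % (key, stats[key])

Zero-padding the month number keeps the sorted output in chronological order.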

deseven commented Jul 28, 2017

Example file_with_subforums_urls:

http://www.example.com/viewforum.php?f=1
http://www.example.com/viewforum.php?f=2
http://www.example.com/viewforum.php?f=3
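
With that list saved as, say, forums.txt, the script would be invoked as python phpbb-external-statistics.py forums.txt raw.txt, where the second argument is the optional raw-data output (both filenames and the .py script name here are just examples).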
