Skip to content

Instantly share code, notes, and snippets.

@marians
Created January 2, 2012 08:32
Show Gist options
  • Save marians/1549876 to your computer and use it in GitHub Desktop.
Save marians/1549876 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
"""
This script acquires statistics on usenet groups.
It first reads a list of groups from one or more usenet servers
and then gets monthly post statistics about these groups from
Google Groups.
"""
import sys
import os
import nntplib
import urllib2
import re
import time
from scrapemark import scrape
# Path/filename for the group list. Will be created on first run if not existent.
GROUP_LIST_FILE = 'grouplist.csv'
# Minimum number of times a group must appear across the scanned servers' group lists. Groups seen less often are skipped.
MIN_GROUP_COUNT = 10
# Path/filename for the per-group monthly post counts. Will be created on first run if not existent.
RESULTS_FILE = 'postcount.csv'
def get_servers():
    """
    Scrape the newzbot.com server listing for news server host names.

    Returns the value scrapemark captured under 'name' (presumably a list
    with one host name per listed server -- verify against scrapemark),
    or None when the scrape result has no 'name' key.
    """
    listing_url = 'http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on&show_created=on&notable=1'
    # Repeating pattern: each match contributes one [id] and one [name].
    pattern = """
{*
<b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
*}
"""
    scraped = scrape(pattern, url=listing_url)
    if 'name' not in scraped:
        return None
    return scraped['name']
def get_groups_from_server(servername):
    """
    Return the names of the newsgroups reported by one NNTP server.

    Connects to `servername`, sends NEWGROUPS with date '000101' and time
    '000000', and keeps the first whitespace-separated field of every
    response line as the group name.

    Returns a list of group names, or None if connecting to the server or
    the NEWGROUPS request fails.
    """
    # Best effort: any connection or protocol problem just means "skip this
    # server". Exception is caught instead of using a bare except so that
    # Ctrl-C (KeyboardInterrupt) and SystemExit still stop the script.
    try:
        conn = nntplib.NNTP(servername)
    except Exception:
        return None
    try:
        groups = conn.newgroups('000101', '000000')[1]
    except Exception:
        return None
    finally:
        # Always release the connection, including when NEWGROUPS failed.
        # A failing QUIT must not discard data we already received.
        try:
            conn.quit()
        except Exception:
            pass
    names = []
    for group in groups:
        parts = group.split()
        # Skip blank response lines instead of raising IndexError.
        if parts:
            names.append(parts[0])
    return names
def get_group_stats_from_google(group):
    """
    Download the Google Groups "about" page of `group` and extract its
    monthly post counts.

    Returns a list of (month, count) tuples of strings, taken from the
    page's ".../browse_frm/month/<month>?hl=en" links. The list is empty
    when the page contains no such links. Returns None if the page could
    not be downloaded.
    """
    url = "http://groups.google.com/group/%s/about?hl=en" % group
    request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
    # Best effort: a failed download yields None. Exception is caught instead
    # of using a bare except so that Ctrl-C (KeyboardInterrupt) and
    # SystemExit still stop the script.
    try:
        handler = urllib2.urlopen(request)
    except Exception:
        return None
    try:
        html = handler.read()
    except Exception:
        return None
    finally:
        # Release the HTTP connection whether or not the read succeeded.
        handler.close()
    # re.findall always returns a list (possibly empty), never None.
    return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)
if __name__ == '__main__':
    # Overall flow:
    #   1. Build GROUP_LIST_FILE (group name -> number of appearances in the
    #      scanned servers' group lists) unless it already exists.
    #   2. Build RESULTS_FILE (group, month, post count) unless it already
    #      exists.
    #   3. Sum the post counts per month over all groups and print them.
    # print() is called with a single pre-formatted string so the output is
    # the same as with the Python 2 print statement.
    groupdict = {}
    if not os.path.exists(GROUP_LIST_FILE):
        # get group lists
        print("Reading NNTP server list...")
        nntp_servers = get_servers()
        if not nntp_servers:
            # get_servers() returns None when nothing was scraped. Stop here
            # instead of crashing on len(None) or caching an empty group list
            # that later runs would treat as complete.
            sys.exit("Could not read any NNTP servers from the server list.")
        print("Got %d servers." % len(nntp_servers))
        for server in nntp_servers:
            print("Getting groups from %s ..." % server)
            groups = get_groups_from_server(server)
            if groups is None:
                continue
            print("... found %d groups." % len(groups))
            for group in groups:
                groupdict[group] = groupdict.get(group, 0) + 1
        print(groupdict)
        with open(GROUP_LIST_FILE, 'w') as f:
            for group in groupdict:
                f.write("%s\t%d\n" % (group, groupdict[group]))

    # read grouplist file
    with open(GROUP_LIST_FILE) as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            parts = line.split("\t")
            if len(parts) < 2:
                # Malformed line (no count column): skip it.
                continue
            groupdict[parts[0]] = int(parts[1])

    # get group stats from Google
    # NOTE: an interrupted run leaves a partial RESULTS_FILE that later runs
    # will reuse as-is; delete the file to fetch the statistics again.
    if not os.path.exists(RESULTS_FILE):
        with open(RESULTS_FILE, 'w') as f:
            for group in groupdict:
                if groupdict[group] < MIN_GROUP_COUNT:
                    continue
                print("Getting post count for group %s ..." % group)
                postcounts = get_group_stats_from_google(group)
                if postcounts is not None:
                    for entry in postcounts:
                        f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
                # Pause one second between requests.
                time.sleep(1)

    # Sum the post counts of all groups per month.
    datedict = {}
    with open(RESULTS_FILE, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 3:
                datedict[parts[1]] = datedict.get(parts[1], 0) + int(parts[2])

    # Sorted output instead of arbitrary dict order.
    for datestring in sorted(datedict):
        print("%s-01\t%d" % (datestring, datedict[datestring]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment