marians/usenet_group_stats.py

## usenet_group_stats.py
#!/usr/bin/env python
# encoding: utf-8
"""
	This script acquires statistics on usenet groups.

	It first reads a list of groups from one or more usenet servers
	and then gets monthly post statistics about these groups from
	Google Groups.
"""

import sys
import os
import nntplib
import urllib2
import re
import time
from scrapemark import scrape

# Path/filename for the group list. Will be created on first run if it does not exist.
GROUP_LIST_FILE = 'grouplist.csv'

# Minimum number of servers a group must be listed on. Groups seen on fewer servers are skipped.
MIN_GROUP_COUNT = 10

# Path/filename for the per-group monthly post counts. Will be created on first run if it does not exist.
RESULTS_FILE = 'postcount.csv'

def get_servers():
	"""
	Scrape the newzbot.com server list and return the listed hostnames.

	Returns the list captured under the 'name' key of the scrape
	pattern, or an empty list if nothing could be extracted, so that
	callers can always call len() on the result and iterate over it.
	Errors raised by scrape() itself are not caught here.
	"""
	url = 'http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on&show_created=on&notable=1'
	data = scrape("""
	{*
		<b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
	*}
	""", url=url)
	# Guard against a falsy result as well as a missing key. The previous
	# code fell through and returned None, which the caller cannot len().
	# NOTE(review): presumably scrape() can return None on no match -- confirm.
	if data and 'name' in data:
		return data['name']
	return []

def get_groups_from_server(servername):
	"""
	Ask an NNTP server for its groups and return their names.

	servername -- hostname passed to nntplib.NNTP().

	Returns a list with the first whitespace-separated field of every
	entry in the NEWGROUPS reply, or None if connecting to the server
	or issuing the command failed.
	"""
	try:
		server = nntplib.NNTP(servername)
	except Exception:
		# Best effort: an unreachable server is skipped. Unlike a bare
		# "except:", this still lets Ctrl-C / SystemExit through.
		return None
	try:
		(response, groups) = server.newgroups('000101', '000000')
	except Exception:
		return None
	finally:
		# Always release the connection, also when the command failed
		# (the connection used to leak on that path).
		try:
			server.quit()
		except Exception:
			# The reply, if any, has already been received; a failing
			# QUIT must not discard it.
			pass
	names = []
	for entry in groups:
		fields = entry.split()
		# Skip blank entries instead of raising IndexError.
		if fields:
			names.append(fields[0])
	return names

def get_group_stats_from_google(group):
	"""
	Fetch the Google Groups "about" page of a group and extract the
	monthly post counts linked from it.

	group -- group name, inserted verbatim into the page URL.

	Returns a list of (month, count) string tuples as captured by the
	regular expression (empty if the page has no matching links), or
	None if the page could not be fetched.
	"""
	url = "http://groups.google.com/group/%s/about?hl=en" % group
	request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
	try:
		handler = urllib2.urlopen(request)
	except Exception:
		# Best effort: a group whose page cannot be fetched is skipped.
		# Unlike a bare "except:", Ctrl-C / SystemExit still propagate.
		return None
	try:
		html = handler.read()
	except Exception:
		return None
	finally:
		# Release the HTTP response; it was never closed before.
		handler.close()
	# re.findall always returns a list (never None), so the result can be
	# returned directly.
	return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)

if __name__ == '__main__':
	groupdict = {}
	if not os.path.exists(GROUP_LIST_FILE):
		# get group lists
		print "Reading NNTP server list..."
		nntp_servers = get_servers()
		print "Got", len(nntp_servers), "servers."
		for server in nntp_servers:
			print "Getting groups from", server, "..."
			groups = get_groups_from_server(server)
			if groups is not None:
				print "... found", len(groups), "groups."
				for group in groups:
					if group not in groupdict:
						groupdict[group] = 0
					groupdict[group] += 1
		print groupdict
		f = open(GROUP_LIST_FILE, 'w+')
		for group in groupdict.keys():
			f.write(group + "\t" + str(groupdict[group]) + "\n")
		f.close()

	# read grouplist file
	lines = open(GROUP_LIST_FILE).read().split("\n")
	for line in lines:
		line = line.strip()
		if line != "":
			parts = line.split("\t")
			groupdict[parts[0]] = int(parts[1])

	# get group stats from Google
	if not os.path.exists(RESULTS_FILE):
		f = open(RESULTS_FILE, 'w+')
		for group in groupdict.keys():
			if groupdict[group] >= MIN_GROUP_COUNT:
				print "Getting post count for group", group, "..."
				postcounts = get_group_stats_from_google(group)
				if postcounts is not None:
					for entry in postcounts:
						f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
				time.sleep(1)
		f.close()

	datedict = {}
	lines = open(RESULTS_FILE, 'r').read().split("\n")
	for line in lines:
		line = line.strip()
		parts = line.split("\t")
		if len(parts) == 3:
			if parts[1] not in datedict:
				datedict[parts[1]] = 0
			datedict[parts[1]] += int(parts[2])
	for datestring in datedict.keys():
		print datestring + "-01\t" + str(datedict[datestring])
	#!/usr/bin/env python
	# encoding: utf-8
	"""
	This script acquires statistics on usenet groups.

	It first reads a list of groups from one or more usenet servers
	and then gets monthly post statistics about these groups from
	Google Groups.
	"""

	import sys
	import os
	import nntplib
	import urllib2
	import re
	import time
	from scrapemark import scrape

	# Path/filename for the group list. Will be created on first run if it does not exist.
	GROUP_LIST_FILE = 'grouplist.csv'

	# Minimum number of servers a group must be listed on. Groups seen on fewer servers are skipped.
	MIN_GROUP_COUNT = 10

	# Path/filename for the per-group monthly post counts. Will be created on first run if it does not exist.
	RESULTS_FILE = 'postcount.csv'

	def get_servers():
		"""
		Scrape the newzbot.com server list and return the listed hostnames.

		Returns the list captured under the 'name' key of the scrape
		pattern, or an empty list if nothing could be extracted, so that
		callers can always call len() on the result and iterate over it.
		Errors raised by scrape() itself are not caught here.
		"""
		url = 'http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on&show_created=on&notable=1'
		# The filter separator is a plain "|"; the "\|" found here before
		# was a markdown escaping artifact that altered the pattern.
		data = scrape("""
		{*
			<b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
		*}
		""", url=url)
		# Guard against a falsy result as well as a missing key, so the
		# caller never receives None.
		# NOTE(review): presumably scrape() can return None on no match -- confirm.
		if data and 'name' in data:
			return data['name']
		return []

	def get_groups_from_server(servername):
		"""
		Ask an NNTP server for its groups and return their names.

		servername -- hostname passed to nntplib.NNTP().

		Returns a list with the first whitespace-separated field of every
		entry in the NEWGROUPS reply, or None if connecting to the server
		or issuing the command failed.
		"""
		try:
			server = nntplib.NNTP(servername)
		except Exception:
			# Best effort: an unreachable server is skipped. Unlike a bare
			# "except:", this still lets Ctrl-C / SystemExit through.
			return None
		try:
			(response, groups) = server.newgroups('000101', '000000')
		except Exception:
			return None
		finally:
			# Always release the connection, also when the command failed.
			try:
				server.quit()
			except Exception:
				# The reply, if any, has already been received; a failing
				# QUIT must not discard it.
				pass
		names = []
		for entry in groups:
			fields = entry.split()
			# Skip blank entries instead of raising IndexError.
			if fields:
				names.append(fields[0])
		return names

	def get_group_stats_from_google(group):
		"""
		Fetch the Google Groups "about" page of a group and extract the
		monthly post counts linked from it.

		group -- group name, inserted verbatim into the page URL.

		Returns a list of (month, count) string tuples as captured by the
		regular expression (empty if the page has no matching links), or
		None if the page could not be fetched.
		"""
		url = "http://groups.google.com/group/%s/about?hl=en" % group
		request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
		try:
			handler = urllib2.urlopen(request)
		except Exception:
			# Best effort: a group whose page cannot be fetched is skipped.
			# Unlike a bare "except:", Ctrl-C / SystemExit still propagate.
			return None
		try:
			html = handler.read()
		except Exception:
			return None
		finally:
			# Release the HTTP response once the body has been read.
			handler.close()
		# re.findall always returns a list (never None), so the result can
		# be returned directly.
		return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)

	if __name__ == '__main__':
	groupdict = {}
	if not os.path.exists(GROUP_LIST_FILE):
	# get group lists
	print "Reading NNTP server list..."
	nntp_servers = get_servers()
	print "Got", len(nntp_servers), "servers."
	for server in nntp_servers:
	print "Getting groups from", server, "..."
	groups = get_groups_from_server(server)
	if groups is not None:
	print "... found", len(groups), "groups."
	for group in groups:
	if group not in groupdict:
	groupdict[group] = 0
	groupdict[group] += 1
	print groupdict
	f = open(GROUP_LIST_FILE, 'w+')
	for group in groupdict.keys():
	f.write(group + "\t" + str(groupdict[group]) + "\n")
	f.close()

	# read grouplist file
	lines = open(GROUP_LIST_FILE).read().split("\n")
	for line in lines:
	line = line.strip()
	if line != "":
	parts = line.split("\t")
	groupdict[parts[0]] = int(parts[1])

	# get group stats from Google
	if not os.path.exists(RESULTS_FILE):
	f = open(RESULTS_FILE, 'w+')
	for group in groupdict.keys():
	if groupdict[group] >= MIN_GROUP_COUNT:
	print "Getting post count for group", group, "..."
	postcounts = get_group_stats_from_google(group)
	if postcounts is not None:
	for entry in postcounts:
	f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
	time.sleep(1)
	f.close()

	datedict = {}
	lines = open(RESULTS_FILE, 'r').read().split("\n")
	for line in lines:
	line = line.strip()
	parts = line.split("\t")
	if len(parts) == 3:
	if parts[1] not in datedict:
	datedict[parts[1]] = 0
	datedict[parts[1]] += int(parts[2])
	for datestring in datedict.keys():
	print datestring + "-01\t" + str(datedict[datestring])