Skip to content

Instantly share code, notes, and snippets.

@ymotongpoo
Created August 25, 2011 16:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ymotongpoo/1171120 to your computer and use it in GitHub Desktop.
Save ymotongpoo/1171120 to your computer and use it in GitHub Desktop.
Extract Python package names and download counts from PyPI
# -*- coding: utf-8 -*-
from lxml import etree
from StringIO import StringIO
from multiprocessing import Pool
import urllib2
# Root URL of the PyPI site.
base_url = r"http://pypi.python.org"
# XHTML namespace URI.
xhtml_ns = r'http://www.w3.org/1999/xhtml'
# URL of the full package index page, built from base_url so the host is
# spelled only once.
pypi_list = base_url + r"/pypi?%3Aaction=index"
# Value sent in the User-Agent request header. It must be the bare browser
# string: the previous value accidentally embedded a JSON-like
# '"User-Agent","value":"..."' fragment inside the header value.
user_agent = ('Mozilla/5.0 (Windows NT 5.1) '
              'AppleWebKit/535.1 (KHTML, like Gecko) '
              'Chrome/14.0.835.15 Safari/535.1')
# '//table[@class="list"]/tbody/tr...' doesn't work
# (presumably the parsed tree has no <tbody> element -- TODO confirm), so
# both expressions reach the rows with the descendant axis ('//tr'), which
# matches whether or not a <tbody> sits between the table and its rows.
package_name_xpath = ('//table[@class="list"]/'
                      '/tr[@class="odd" or @class="even"]/td/a')
package_data_xpath = ('//table[@class="list"]/'
                      '/tr[@class="odd" or @class="even"]/td')
# Positions, within the flat list of cells matched by package_data_xpath,
# of the cells read as the "update" and "download" values.
update_col = 3
download_col = 6
def extract_packages(pypi_list):
    """Download the package index page and describe every package link on it.

    Args:
        pypi_list: URL of the index page to fetch.

    Returns:
        A list with one ``get_pkg_page`` result per anchor matched by
        ``package_name_xpath``, in document order.

    Side effects:
        Performs one HTTP GET and prints the XPath expression and the number
        of matched anchors to stdout.
    """
    req = urllib2.Request(pypi_list)
    req.add_header('User-Agent', user_agent)
    resp = urllib2.urlopen(req)
    try:
        data = resp.read()
    finally:
        # Release the connection even if reading fails.
        resp.close()
    html = etree.parse(StringIO(data), etree.HTMLParser())
    # No namespace argument: the expression uses no prefixes, and lxml's
    # option is ``namespaces=`` (a prefix->URI dict). The former
    # ``namespace=`` keyword was only taken as an unused XPath variable.
    pkg_tags = html.xpath(package_name_xpath)
    # Single-argument print(...) behaves the same as the print statement.
    print(package_name_xpath)
    print(len(pkg_tags))
    return [get_pkg_page(tag) for tag in pkg_tags]
def get_pkg_page(pkg_dom):
    """Split a package link element into name, version and link target.

    The link text is split on whitespace: the first token is the package
    name and the remaining tokens, re-joined with single spaces, are the
    version.

    Args:
        pkg_dom: an element exposing ``attrib['href']`` and ``text``.

    Returns:
        A dict with keys ``name``, ``ver`` and ``url``. When the text has
        fewer than two tokens, ``name`` is the raw text (``''`` if the
        element has no text), ``ver`` is ``None`` and a "package error"
        line is printed.

    Raises:
        KeyError: if the element has no ``href`` attribute.
    """
    href = pkg_dom.attrib['href']
    # ``text`` is None for an element without leading text; treat it as
    # empty instead of failing on ``None.split()``.
    pkg_title = pkg_dom.text or ''
    elements = pkg_title.split()
    # ``< 2`` (not ``== 1``) so that empty / whitespace-only text is reported
    # here rather than hitting ``elements[0]`` below with an empty list.
    if len(elements) < 2:
        print("package error %s" % pkg_title)
        return dict(name=pkg_title, ver=None, url=href)
    return dict(name=elements[0], ver=' '.join(elements[1:]), url=href)
def fetch_pkg_data(pkg_dict):
    """Fetch one package page and read its "update" and "download" cells.

    Args:
        pkg_dict: a package record with a ``url`` key, as built by
            ``get_pkg_page``.

    Returns:
        A ``(update, download)`` tuple holding the ``text`` of the cells at
        positions ``update_col`` and ``download_col`` among the cells matched
        by ``package_data_xpath``.

    Raises:
        KeyError: if ``pkg_dict`` has no ``url`` key.
        IndexError: if the page yields fewer cells than the column indices
            require.
    """
    from urlparse import urljoin

    # The original referenced an undefined ``pkg_link``. Build it from the
    # record's link; urljoin leaves an already-absolute URL untouched and
    # resolves a site-relative one against base_url.
    pkg_link = urljoin(base_url, pkg_dict['url'])
    req = urllib2.Request(pkg_link)
    req.add_header('User-Agent', user_agent)
    resp = urllib2.urlopen(req)
    try:
        data = resp.read()
    finally:
        # Release the connection even if reading fails.
        resp.close()
    html = etree.parse(StringIO(data), etree.HTMLParser())
    # No namespace argument: the expression uses no prefixes, and lxml's
    # option is ``namespaces=`` (a dict); ``namespace=`` was only taken as
    # an unused XPath variable.
    pkg_data = html.xpath(package_data_xpath)
    update = pkg_data[update_col].text
    download = pkg_data[download_col].text
    return (update, download)
def main(pypi_list):
    """Collect every package from the index and add its page details.

    Args:
        pypi_list: URL of the index page, passed to ``extract_packages``.

    Returns:
        The list from ``extract_packages``; each record is updated in place
        with ``update`` and ``download`` keys taken from ``fetch_pkg_data``.
    """
    packages = extract_packages(pypi_list)
    for record in packages:
        details = fetch_pkg_data(record)
        # Unpack the pair straight into the two keys of the record.
        record['update'], record['download'] = details
    return packages
if __name__ == '__main__':
    # Script entry point: scrape the index only and dump the resulting
    # list of package records to stdout.
    packages = extract_packages(pypi_list)
    print(packages)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment