Skip to content

Instantly share code, notes, and snippets.

@tnydwrds
Created August 20, 2011 02:57
Show Gist options
  • Save tnydwrds/1158556 to your computer and use it in GitHub Desktop.
Scrape one level of links on a site for OpenX 2.8 afr tags
#!/usr/bin/env python
import re
import urllib2
log_message = '"%s",%d,"%s"'
print('"url","num_of_afrs","zones"')
def get_zones(htmldoc):
return set(re.findall('<iframe.+?src=["\'].+?afr\.php.+?zoneid=(\d+).*?["\'].+?>',htmldoc))
def get_links(htmldoc, root_domain):
return set(re.findall('<a.+?href=["\'](http://'+root_domain+'.*?)["\'].+?>',htmldoc))
def parse_url(url, root_domain, recursive=False):
"""
Recursive function to go through links.
This is kind of deceptive because isn't intelligently recursive. Its
really only useful for one level at the moment.
"""
req = urllib2.Request(url)
res = urllib2.urlopen(req)
html = res.read()
zones = get_zones(html)
print log_message % (url, len(zones), ','.join(zones))
if recursive:
links.update(get_links(html, root_domain))
for link in links:
parse_url(link, root_domain)
if __name__ == '__main__':
links = set()
parse_url('http://example.com', 'example.com', True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment