Skip to content

Instantly share code, notes, and snippets.

@yrik
Created October 7, 2015 09:08
Show Gist options
  • Save yrik/a58774c01a0da94c11dc to your computer and use it in GitHub Desktop.
Save yrik/a58774c01a0da94c11dc to your computer and use it in GitHub Desktop.
get all django sites in the world
# coding: utf-8
import signal
import time
import socket
import requests
import lxml.html
from urlparse import urlparse
from multiprocessing import Pool
from db import Site, Session
# Give every socket created from here on a default timeout of 1 second.
socket.setdefaulttimeout(1)
# Outline of what the script does. This bare string follows the imports, so
# it is an ordinary expression statement rather than the module docstring.
'''
1. iterate over sites and
1. collect unique domains
2. check if it's django app
'''
class Timeout(object):
    """Context manager that interrupts its body with SIGALRM after ``sec`` seconds.

    Usage::

        with Timeout(1):
            slow_call()

    When the alarm fires, ``Timeout.Timeout`` is raised inside the ``with``
    body. On exit the alarm is cancelled and the SIGALRM handler that was
    installed before entering is put back, so using this class no longer
    permanently replaces a handler set up elsewhere.

    Relies on ``signal.alarm``/``SIGALRM``, so ``sec`` is passed to
    ``signal.alarm`` as given.
    """

    class Timeout(Exception):
        """Raised inside the ``with`` body when the alarm goes off."""

    def __init__(self, sec):
        self.sec = sec
        # Handler that was active before __enter__; restored in __exit__.
        self._previous_handler = None

    def __enter__(self):
        # signal.signal returns the handler it replaces; remember it.
        self._previous_handler = signal.signal(
            signal.SIGALRM, self.raise_timeout)
        signal.alarm(self.sec)
        return self

    def __exit__(self, *args):
        signal.alarm(0)  # disable alarm
        # A previous handler of None means it was not installed from Python
        # and cannot be re-installed through signal.signal; leave it alone.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
        self._previous_handler = None

    def raise_timeout(self, *args):
        """Signal handler: abort the guarded body."""
        raise Timeout.Timeout()
def get_response(url):
    """Fetch ``url`` and return the response body, or ``None`` on any failure.

    The request is bounded twice: by the ``timeout=1`` argument given to
    ``requests.get`` and by a one-second ``Timeout`` guard around the call.
    Every exception (including the guard's own) is swallowed on purpose so
    that a single bad site cannot stop the crawl.
    """
    try:
        with Timeout(1):
            body = requests.get(url, timeout=1).content
    except Exception:
        return None
    else:
        return body
def get_domain(url):
    """Return the network-location part (``netloc``) of ``url``."""
    return urlparse(url).netloc
def is_django_domain(domain):
    """Guess whether ``domain`` runs Django by looking at its ``/admin/`` page.

    Returns ``True`` when the page body contains ``Django site admin``, or
    contains both ``csrfmiddlewaretoken`` and ``/admin/``. Returns ``False``
    otherwise, including when the page could not be fetched.
    """
    page = get_response('http://%s/admin/' % domain)
    if not page:
        return False
    if 'Django site admin' in page:
        return True
    return 'csrfmiddlewaretoken' in page and '/admin/' in page
def get_next_domains(domain):
    """Return the set of domains linked from the front page of ``domain``.

    The page at ``http://<domain>`` is fetched and parsed, and the ``href``
    of every ``<a>`` containing ``http`` is reduced to its netloc.

    Returns an empty set when the page cannot be fetched or parsed (parse
    errors are printed). The result is always a ``set``, never a list.
    """
    content = get_response('http://%s' % domain)
    if not content:
        return set()
    try:
        dom = lxml.html.fromstring(content)
    except Exception as e:
        print(e)
        return set()
    hrefs = (href for href in dom.xpath('//a/@href') if 'http' in href)
    # A relative link that merely contains "http" (e.g. "/go?u=http://x")
    # has an empty netloc; skip it so no blank domain is queued as a site.
    return set(found for found in (get_domain(h) for h in hrefs) if found)
def get_new_sites(session):
    """Return a query over the ``Site`` rows whose ``is_processed`` is False.

    ``session.expire_all()`` is called first, presumably so that rows changed
    by other worker processes are re-read rather than served from the
    session's cached state.
    """
    session.expire_all()
    pending = session.query(Site).filter_by(is_processed=False)
    return pending
def add_new_sites(session, domains):
    """Store a new, unprocessed ``Site`` for every domain not yet known.

    Each site is committed on its own, so one failing insert is rolled back
    and printed without discarding the others.

    Returns the list of ``Site`` objects that were actually committed; a site
    whose commit failed is not included.
    """
    session.expire_all()
    added = []
    for domain in domains:
        if session.query(Site).filter_by(domain=domain).count():
            continue  # already stored
        site = Site(domain=domain, is_processed=False, is_django=False)
        try:
            session.add(site)
            session.commit()
        except Exception as e:
            session.rollback()
            print(e)
        else:
            # Record the site only once it is known to be persisted.
            added.append(site)
    return added
def mark_processed(session, site, is_django):
    """Flag ``site`` as processed, record ``is_django`` and persist it.

    The site is merged into ``session`` and committed. If that fails, the
    session is rolled back and the error is printed. ``site`` is returned in
    either case.
    """
    site.is_django = is_django
    site.is_processed = True
    try:
        session.merge(site)
        session.commit()
    except Exception as err:
        session.rollback()
        print(err)
    return site
def handle_site(site):
    """Process one site: detect Django, mark it processed, queue its links.

    A dedicated session is opened for the call and is always closed, even if
    one of the steps raises. Returns the site's domain.
    """
    domain = site.domain
    session = Session()
    try:
        is_django = is_django_domain(domain)
        try:
            print('%s %s' % (domain, is_django))
        except Exception as e:
            # Printing must never abort processing of the site.
            print(e)
        mark_processed(session, site, is_django)
        add_new_sites(session, get_next_domains(domain))
    finally:
        # Release the session even when a step above raised; the caller
        # swallows the exception, so nothing else would close it.
        session.close()
    return domain
def handle_site_wrapped(site):
    """Run ``handle_site`` without letting any exception escape.

    Failures are reported on stdout instead of being dropped silently, and
    the function still returns ``None`` in every case.
    """
    try:
        handle_site(site)
    except Exception as e:
        # Best-effort by design: report and carry on with the next site.
        # %r keeps the message printable whatever the exception contains.
        print('handle_site failed: %r' % (e,))
def process(batch_size=10000, workers=20):
    """Crawl unprocessed sites in batches until none are left.

    Each round loads up to ``batch_size`` unprocessed sites, hands them to a
    pool of ``workers`` processes running ``handle_site_wrapped``, prints the
    time taken, and then reloads with a fresh session.

    NOTE(review): the site table appears to need manual seeding before the
    first run, e.g. ``add_new_sites(session, ['www.djangoproject.com'])``.
    """
    session = Session()
    try:
        # Materialise the rows: the loop must stop when a batch is empty,
        # which testing the query object itself would not tell us.
        sites = list(get_new_sites(session).limit(batch_size))
        while sites:
            start_time = time.time()
            print('GOT SITES')
            pool = Pool(workers)
            pool.map(handle_site_wrapped, sites)
            pool.close()
            pool.join()  # wait for the workers to exit before the next round
            elapsed_min = round((time.time() - start_time) / 60.0, 2)
            print('PROCESSED %s SITES in %s min' % (len(sites), elapsed_min))
            session.close()
            session = Session()
            sites = list(get_new_sites(session).limit(batch_size))
    finally:
        session.close()
if __name__ == '__main__':
    # Start crawling only when the file is executed as a script.
    process()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment