Created
October 7, 2015 09:08
-
-
Save yrik/a58774c01a0da94c11dc to your computer and use it in GitHub Desktop.
Get all Django sites in the world
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import signal | |
import time | |
import socket | |
import requests | |
import lxml.html | |
from urlparse import urlparse | |
from multiprocessing import Pool | |
from db import Site, Session | |
# Default timeout, in seconds, for every socket created after this point.
_SOCKET_TIMEOUT_SECONDS = 1
socket.setdefaulttimeout(_SOCKET_TIMEOUT_SECONDS)
'''
1. iterate over sites and
    1. collect unique domains
    2. check if it's a Django app
'''
class Timeout(object):
    """Context manager that interrupts its body after ``sec`` seconds.

    Installs ``raise_timeout`` as the SIGALRM handler and schedules an alarm
    on entry; on exit the alarm is cancelled and the handler that was active
    before entry is put back, so using this class does not permanently
    replace the process-wide SIGALRM handler.

    Usage::

        with Timeout(1):
            slow_call()   # Timeout.Timeout is raised here if it overruns

    NOTE: relies on ``signal.alarm``, so ``sec`` must be a whole number of
    seconds and the manager must be used where ``signal.signal`` is allowed.
    """

    class Timeout(Exception):
        """Raised inside the ``with`` body when the alarm fires."""
        pass

    def __init__(self, sec):
        self.sec = sec
        # Handler that was active before __enter__; restored in __exit__.
        self._previous_handler = None

    def __enter__(self):
        self._previous_handler = signal.signal(signal.SIGALRM,
                                               self.raise_timeout)
        signal.alarm(self.sec)
        return self

    def __exit__(self, *args):
        signal.alarm(0)  # disable alarm
        previous = self._previous_handler
        if previous is None:
            # signal.signal() reports None for a handler that was not
            # installed from Python; fall back to the default action.
            previous = signal.SIG_DFL
        signal.signal(signal.SIGALRM, previous)
        self._previous_handler = None
        # Returning None lets any exception from the body propagate.

    def raise_timeout(self, *args):
        """Signal handler: abort the running body with ``Timeout.Timeout``."""
        raise Timeout.Timeout()
def get_response(url):
    """Fetch ``url`` and return the response body, or ``None`` on failure.

    The request is limited twice: ``requests`` gets ``timeout=1`` and the
    whole call runs under ``Timeout(1)``. Any exception, including the
    ``Timeout.Timeout`` raised by the alarm, is swallowed and reported as
    ``None`` so callers only have to check for a falsy result.
    """
    try:
        with Timeout(1):
            response = requests.get(url, timeout=1)
            body = response.content
    except Exception:
        return None
    return body
def get_domain(url):
    """Return the network-location part of ``url`` ('' when it has none)."""
    return urlparse(url).netloc
def is_django_domain(domain):
    """Guess whether ``domain`` runs Django by looking at its ``/admin/`` page.

    Returns True when the page body contains the text 'Django site admin',
    or contains both 'csrfmiddlewaretoken' and '/admin/'. Returns False
    otherwise, including when the page could not be fetched.
    """
    body = get_response('http://%s/admin/' % domain)
    if not body:
        return False
    if 'Django site admin' in body:
        return True
    return 'csrfmiddlewaretoken' in body and '/admin/' in body
def get_next_domains(domain):
    """Return the domains linked from the front page of ``domain``.

    Fetches ``http://<domain>``, parses it with lxml and collects the
    netloc of every ``<a href>`` whose value contains 'http'.

    Returns:
        A set of non-empty domain strings, or an empty list when the page
        could not be fetched or parsed (callers only iterate the result).
    """
    content = get_response('http://%s' % domain)
    if not content:
        return []
    try:
        dom = lxml.html.fromstring(content)
    except Exception as e:
        print(e)
        return []
    domains = set()
    for link in dom.xpath('//a/@href'):
        if 'http' not in link:
            continue
        try:
            found = get_domain(link)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6
            # literal); skip that link instead of losing the whole page.
            continue
        # Relative links such as '/go?u=http://x' pass the 'http' filter but
        # have no netloc; an empty domain must not be queued for crawling.
        if found:
            domains.add(found)
    return domains
def get_new_sites(session):
    """Return a query selecting every ``Site`` with ``is_processed=False``.

    The session is expired first, presumably so that rows changed by other
    processes are re-read rather than served from this session's cache.
    The result is an unevaluated query; callers may refine it further.
    """
    session.expire_all()
    pending = session.query(Site).filter_by(is_processed=False)
    return pending
def add_new_sites(session, domains):
    """Insert a ``Site`` row for each domain that is not stored yet.

    Args:
        session: database session used for the lookups and inserts.
        domains: iterable of domain strings.

    Returns:
        List of the ``Site`` objects that were actually committed. A site
        whose insert failed and was rolled back is not included.
    """
    session.expire_all()
    added = []
    for domain in domains:
        if session.query(Site).filter_by(domain=domain).count():
            continue
        site = Site(domain=domain, is_processed=False, is_django=False)
        # Commit one row at a time so a single failing insert (presumably a
        # duplicate written concurrently by another worker) only rolls back
        # that row.
        try:
            session.add(site)
            session.commit()
        except Exception as e:
            session.rollback()
            print(e)
        else:
            added.append(site)
    return added
def mark_processed(session, site, is_django):
    """Record the crawl result on ``site`` and persist it.

    Sets ``site.is_django`` to the given flag and ``site.is_processed`` to
    True, then merges the object into ``session`` and commits. A failure
    while merging or committing is rolled back and printed, not raised.

    Returns:
        The same ``site`` object that was passed in.
    """
    site.is_django = is_django
    site.is_processed = True
    try:
        session.merge(site)
        session.commit()
    except Exception as error:
        session.rollback()
        print(error)
    return site
def handle_site(site):
    """Process one site: detect Django, mark it processed, queue its links.

    Opens its own session, checks whether ``site.domain`` is a Django site,
    stores the result, then collects the domains linked from the site's
    front page and inserts the new ones. The session is always closed, even
    when one of the steps raises.

    Returns:
        The domain of the processed site.
    """
    domain = site.domain
    session = Session()
    try:
        is_django = is_django_domain(domain)
        try:
            print('%s %s' % (domain, is_django))
        except Exception as e:
            # Printing must not abort the work; presumably this guards
            # against output-encoding errors for unusual domain names.
            print(e)
        mark_processed(session, site, is_django)
        add_new_sites(session, get_next_domains(domain))
    finally:
        session.close()
    return domain
def handle_site_wrapped(site):
    """Run ``handle_site`` without ever raising.

    Used as the ``pool.map`` worker so that one failing site does not abort
    the whole batch. The failure is printed rather than silently dropped.
    Always returns None.
    """
    try:
        handle_site(site)
    except Exception as e:
        # repr() keeps the report printable regardless of message encoding.
        print('handle_site failed: %r' % (e,))
def process(batch_size=10000, workers=20):
    """Crawl unprocessed sites in batches until none are left.

    Each round loads up to ``batch_size`` unprocessed sites, hands them to a
    pool of ``workers`` processes running ``handle_site_wrapped``, and
    prints how many sites were handled and how long it took.

    Args:
        batch_size: maximum number of sites fetched per round.
        workers: number of worker processes per round.
    """
    #add_new_sites(['www.djangoproject.com'])
    while True:
        session = Session()
        try:
            # Materialise the batch: a query object is always truthy, so it
            # cannot be used to detect that no work is left.
            sites = get_new_sites(session).limit(batch_size).all()
            if not sites:
                break
            start_time = time.time()
            print('GOT SITES')
            pool = Pool(workers)
            try:
                pool.map(handle_site_wrapped, sites)
            finally:
                # close() stops new work; join() waits for the workers to
                # exit so processes do not pile up across rounds.
                pool.close()
                pool.join()
            end_time = time.time()
            elapsed_min = round((end_time - start_time) / 60.0, 2)
            print('PROCESSED %s SITES in %s min' % (len(sites), elapsed_min))
        finally:
            session.close()
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    process()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment