#!/usr/bin/env python
# -*- coding: utf-8 -*-
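# Multi-threaded crawler for Tor hidden services (.onion sites): each worker
# pulls a pending URL from the "urls" Solr core, fetches it through the HTTP
# proxy configured below, extracts links and images, and indexes the cleaned
# page text into the "pages" core while tracking per-domain counts in "doms".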
import sunburnt
import nltk
import urlparse
from StringIO import StringIO
import re
from color import *
import threading
import time
threading.stack_size(256*1024)  # small per-thread stack so many workers stay cheap
from threading import Thread
import traceback
import random
import sys
# Which HTTP proxy to route requests through (used by urllib2.ProxyHandler below).
h = 2
if h == 1:
    host = '127.0.0.1:8123'
elif h == 2:
    host = '192.168.1.11:8123'
elif h == 3:
    host = '192.168.1.12:8123'
hilos = 50  # number of worker threads
# Set the terminal title so this crawler instance is easy to identify.
sys.stdout.write('\x1b]2;CRAWLER AT '+host+' ('+str(hilos)+' threads)\x07')
print ''
linkregex = re.compile('<a.*href *= *[\'"](.*?)[\'"].*>')
imgregex = re.compile('<img.*src *= *[\'"](.*?)[\'"].*>')
global lock
lock = False
#urls.printing = False
# Solr cores: crawled page content, the URL queue/status, and per-domain counters.
pages = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/pages")
urls = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/urls")
doms = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/doms")
global fulls
fulls = {}  # domains already marked as full, so their links are skipped
import urllib2, urllib
def glock():
    # The leading "return 1" short-circuits this function, so the global lock
    # below is effectively disabled and indexing is not serialized.
    return 1
    global lock
    while True:
        if lock == False:
            lock = True
            #print '||'
            return 1
        time.sleep(0.05)

def llock():
    # Short-circuited like glock(); the release code below is dead.
    return 1
    global lock
    lock = False
    #print '|/'
def unlash(s):
    # Collapse newlines, tabs and runs of spaces into single spaces.
    return s.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
class fate():
    def __init__(self, id):
        self.id = id
        self.initc()
        self.dom = ''
        self.ers = 0

    def initc(self):
        ph = urllib2.ProxyHandler({'http': host})
        #ph = urllib2.ProxyHandler({'http':'127.0.0.1:8123'})
        self.opener = urllib2.build_opener(ph)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    def next_url(self):
        self.url = 'none'
        self.res = col(7)+'['+str(self.id)+']'
        try:
            if self.dom == '' or self.ers > 10:
                self.ers = 0
                time.sleep(5)
                raise Exception()
            self.crawling = urls.query(domain=self.dom, status=0).paginate(start=random.randint(0, 10), rows=1)[0]
        except:
            self.crawling = urls.query(status=0).exclude(domain=self.dom).paginate(start=random.randint(0, 100), rows=1)[0]
        del(self.crawling['_version_'])
        self.url = self.crawling['url']
        self.dom = self.crawling['domain']
        self.res += col(4)+' {'+self.dom+'}'
        self.set_status(-3)

    def set_status(self, status):
        self.crawling['status'] = status
        urls.add(self.crawling)
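    # Status values written to the "urls" core by this script:
    #    0 = pending (queued, will be picked up again)
    #   -3 = claimed by a worker (fetch in progress)
    #    1 = fetched and indexed
    #   -1 = generic fetch error
    #   -2 = response was not text/html
    #   -5 = indexing into the "pages" core failed
    #   HTTP error codes (e.g. 404, 503) and URLError errno values are stored as-is.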
    def extract_links(self):
        links = linkregex.findall(self.body)
        url = urlparse.urlparse(self.url)
        ok_links = 0
        for link in links:
            linkor = link
            # Skip links to binary/static file types.
            if link.split('.')[-1] in ('txt', 'csv', 'docx', 'doc', 'mobi', 'jar', 'exe', 'gz', 'png', 'rss', 'tar', 'zip', 'deb', 'jpeg', 'jpg', 'gif', 'bmp', 'pdf', 'epub', 'avi', 'wmv', 'mpeg', 'mpeg4', 'mp3', 'mp4'):
                continue
            if link.find("'''") != -1:
                continue
            #if len(link) > 200:
            #    continue
            # Resolve relative links against the current page's host and path.
            if link.startswith('/'):
                link = 'http://' + url[1] + link
            elif link.startswith('#'):
                continue #Just for now
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link
            # Only keep .onion links; rewrite tor2web gateway hosts back to .onion.
            if link.find('.onion') == -1:
                continue
            if link.find('.onion.to') != -1:
                link = link.replace('.onion.to', '.onion', 1)
            if link.find('.tor2web.fi') != -1:
                link = link.replace('.tor2web.fi', '.onion', 1)
            if link.find('.onionr2web.org') != -1:  # likely meant '.tor2web.org'
                link = link.replace('.onionr2web.org', '.onion', 1)
            if link[-6:] == '.onion':
                link = link + '/'
            if link.startswith('https'):
                link = link.replace('https', 'http', 1)
            if links.count(linkor) > 1:
                continue
            try:
                dom = self.url_split(link)['domain']
                if len(dom) != 16:  # v2 .onion addresses are 16 characters
                    continue
            except:
                continue
            if fulls.has_key(dom):
                continue
            # Queue the link only if it is not already known and the domain is not full.
            if len(urls.query(url=link)) == 0:
                full = None
                for i in range(100):
                    try:
                        full = doms.query(domain=dom)[0]['full']
                        break
                    except:
                        time.sleep(0.1)
                if full == None:
                    doms.add({'domain': dom, 'num': 0, 'full': 0})
                    full = 0
                if full == 1:
                    fulls[dom] = None
                    continue
                urls.add({'url': link, 'domain': dom, 'status': 0})
                self.increase(dom)
                ok_links += 1
        self.res += ' '+col(6) + str(ok_links)+'/'+str(len(links))+'+links'
        #paint(self.id, 6)
        #print ok_links,'/', len(links), 'links added!'
    def extract_images(self):
        images = imgregex.findall(self.body)
        self.images = []
        url = urlparse.urlparse(self.url)
        ok_img = 0
        for img in images:
            #if img.split('.')[-1] not in ('png','jpeg', 'jpg', 'gif', 'bmp'):
            #    continue
            if img.find("'''") != -1:
                continue
            # Resolve relative image URLs against the current page's host and path.
            if img.startswith('/'):
                img = 'http://' + url[1] + img
            elif img.startswith('#'):
                continue #Just for now
                img = 'http://' + url[1] + url[2] + img
            elif not img.startswith('http'):
                img = 'http://' + url[1] + '/' + img
            # Only keep .onion images; rewrite tor2web gateway hosts back to .onion.
            if img.find('.onion') == -1:
                continue
            if img.find('.onion.to') != -1:
                img = img.replace('.onion.to', '.onion', 1)
            if img.find('.tor2web.fi') != -1:
                img = img.replace('.tor2web.fi', '.onion', 1)
            if img.find('.onionr2web.org') != -1:  # likely meant '.tor2web.org'
                img = img.replace('.onionr2web.org', '.onion', 1)
            if img[-6:] == '.onion':
                img = img + '/'
            if img.startswith('https'):
                img = img.replace('https', 'http', 1)
            if img not in self.images:
                self.images.append(img)
                ok_img += 1
        self.res += ' '+col(2) + str(ok_img)+'/'+str(len(images))+'+images'
        #paint(self.id, 2)
        #print ok_img,'/', len(images), 'images added!'
    def get(self, trys=1):
        try:
            #paint(self.id, 4)
            #print 'Opening', self.url
            start = time.time()
            try:
                req = self.opener.open(self.url.encode('utf-8'), None, 120)
            except urllib2.HTTPError, e:
                self.set_status(e.code)
                print col(1)+' {'+self.dom+'} CODE(html):', e.code
                return 0
            except urllib2.URLError, e:
                print col(1)+' {'+self.dom+'} CODE(url):', e.args[0][0], e.args[0][1]
                if e.args[0][0] == 111:  # ECONNREFUSED: proxy unreachable, back off and requeue
                    time.sleep(100)
                    self.set_status(0)
                    return 0
                self.set_status(e.args[0][0])
                return 0
            t = time.time() - start
            self.res += col(3)+' %.3f' % t + 's'
            html = req.read()
            if req.headers["Content-Type"].find('text/html') == -1:
                self.set_status(-2)
                paint(self.id, 5)
                print 'Non-HTML', self.url
                self.ers += 1
                return 0
            self.body = unicode(html.decode('utf-8', 'ignore'))
            return 1
        except Exception, E:
            paint(self.id, 7)
            print 'Error[', E, '] opening', self.url
            print col(1)+' {'+self.dom+'}'
            self.ers += 1
            self.set_status(-1)
            return 0
    def index(self):
        # nltk.clean_html() strips markup; it was removed in NLTK 3.x, so this
        # assumes an NLTK 2.x installation.
        readable = nltk.clean_html(self.body.lower())
        page = {'url': self.url, 'content': unlash(readable), 'html': self.body, 'domain': self.dom}
        for i in range(len(self.images)):
            page['img_'+str(i)] = self.images[i]
        glock()
        try:
            pages.add(page)
            self.ers = 0
            #si_pages.commit()
        except Exception, E:
            try:
                pages.add(page)
                self.ers = 0
            except:
                self.ers += 1
                raise Exception('Could not index page!')
        finally:
            llock()
        #paint(self.id, 3)
        #print 'Indexed', self.url

    def increase(self, dom):
        # Bump the URL counter for this domain; mark it full past 5048 URLs.
        for i in range(2):
            try:
                d = doms.query(domain=dom)[0]
                del(d['_version_'])
                d['num'] += 1
                if d['num'] > 5048:
                    d['full'] = 1
                doms.add(d)
                break  # retry only on failure
            except:
                time.sleep(0.01)

    def url_split(self, url):
        # Split an http://<domain>.onion/<path> URL into its domain and path parts.
        if type(url) == dict:
            return url
        url = url.split('.onion')
        domain = url[0].replace('http://', '')
        path = '.onion'.join(url[1:])
        return {'domain': domain, 'path': path}
    def run(self):
        # Retry each step up to 20 times; on the final attempt the call is made
        # outside the inner try so any exception propagates to the outer handler.
        while True:
            try:
                for i in range(20):
                    try:
                        self.next_url()
                        break
                    except:
                        if i == 19:
                            self.next_url()
                if not self.get(2):
                    continue
                for i in range(20):
                    try:
                        self.extract_links()
                        break
                    except:
                        if i == 19:
                            self.extract_links()
                for i in range(20):
                    try:
                        self.extract_images()
                        break
                    except:
                        if i == 19:
                            self.extract_images()
                try:
                    for i in range(20):
                        try:
                            self.index()
                            self.set_status(1)
                            break
                        except:
                            if i == 19:
                                self.index()
                                self.set_status(1)
                except:
                    self.ers += 1
                    self.set_status(-5)
                print self.res
            except Exception, E:
                self.initc()
                paint(self.id, 1)
                try:
                    s = self.url
                    if s == 'none':
                        raise Exception()
                    self.set_status(-1)
                except:
                    print 'Err 124'
                print 'Crawler crashed because', E
                self.ers += 1

def start_thread(id):
    f = fate(id)
    f.run()
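# The blocks below are one-off maintenance tasks, enabled by flipping their
# "if 0:"/"if 1:" guards; each ends with exit(), so at most one runs and the
# crawler threads at the bottom only start when all of them are disabled.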
if 0:
    # Seed the queue with an initial URL and its domain record.
    #urls.delete_all()
    #doms.delete_all()
    #pages.delete_all()
    #time.sleep(0.1)
    #urls.add({'url':'http://torlinkbgs6aabns.onion/', 'domain':'torlinkbgs6aabns', 'status':0})
    #doms.add({'domain':'torlinkbgs6aabns', 'num':0, 'full':0})
    # urls.add({'url':'http://dppmfxaacucguzpc.onion/', 'domain':'dppmfxaacucguzpc', 'status':0})
    # doms.add({'domain':'dppmfxaacucguzpc', 'num':0, 'full':0})
    #urls.add({'url':'http://kpvz7ki2v5agwt35.onion/wiki/index.php/Main_Page', 'domain':'kpvz7ki2v5agwt35', 'status':0})
    #doms.add({'domain':'kpvz7ki2v5agwt35', 'num':0, 'full':0})
    urls.add({'url':'http://bdpuqvsqmphctrcs.onion/noscript.html', 'domain':'bdpuqvsqmphctrcs', 'status':0})
    doms.add({'domain':'bdpuqvsqmphctrcs', 'num':0, 'full':0})
    time.sleep(0.5)
    #f = fate(0)
    #f.run()
    exit()

if 1:
    # RESET ERRORS: requeue URLs that failed with 5xx HTTP status codes.
    for foo in range(10):
        for err in (500, 501, 502, 503, 504, 505):
            u = urls.query(status=err)
            print 'done!'
            for i in u:
                i['status'] = 0
                del(i['_version_'])
                urls.add(i)
                print err, i['url']
    urls.commit()
    exit()

if 0:
    # Rewrite any queued .onion.to gateway URLs back to plain .onion.
    for foo in range(1):
        u = urls.query()
        print 'done!'
        for i in u:
            if i['url'].find('.onion.to/') != -1:
                urls.delete(i)
                print i['url']
                i['url'] = i['url'].replace('.onion.to/', '.onion/', 1)
                del(i['_version_'])
                urls.add(i)
    urls.commit()
    exit()

if 0:
    # RESET FULL DOMS: clear the "full" flag on every domain.
    u = doms.query(full=1).paginate(start=0, rows=2000000)
    for i in range(len(u)):
        e = u[i]
        e['full'] = 0
        del(e['_version_'])
        doms.add(e)
        doms.commit()
        if i % 1 == 0:
            print i, u[i]['domain']
    print len(u)
    exit()

if 0:
    # Manually mark a single domain as full.
    u = doms.query(domain='7haz75ietrhjds3j').paginate(start=0, rows=2000000)
    for i in u:
        i['full'] = 1
        del(i['_version_'])
        doms.add(i)
        print i
    exit()

if 0:
    # Delete indexed pages whose domain is not a 16-character .onion address.
    time.sleep(500)
    u = pages.query()
    n = 0
    for e in u:
        n += 1
        if len(e['domain']) != 16:
            pages.delete(e)
            print e['domain']
        if n % 1000 == 0:
            print n
    print len(u)
    exit()

if 0:
    # Run extract_links() against a locally stored page body for testing.
    t = fate(0)
    t.res = ''
    import parser
    t.body = parser.s
    t.url = 'http://ahmia.fi/'
    t.dom = 'ahmia'
    t.extract_links()
    exit()
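# Main entry point: spawn `hilos` crawler workers, staggering their start by
# 0.1 s, and join the last one so the process stays alive while they run.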
for j in range(10, 10+hilos):
    t = Thread(target=start_thread, args=(j,))
    t.start()
    time.sleep(0.1)
t.join()