@gabrii
Created November 13, 2013 21:18
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sunburnt
import nltk
import urlparse
from StringIO import StringIO
import re
from color import *
import threading
import time
threading.stack_size(256*1024)
from threading import Thread
import traceback
import random
import sys
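# Select the HTTP proxy every request is routed through (port 8123 suggests a
# local Polipo/Tor2web-style gateway, but that is only an assumption here);
# hilos ("threads" in Spanish) is the number of crawler threads to start.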
h = 2
if h == 1:
    host = '127.0.0.1:8123'
elif h == 2:
    host = '192.168.1.11:8123'
elif h == 3:
    host = '192.168.1.12:8123'

hilos = 50
sys.stdout.write('\x1b]2;CRAWLER AT '+host+' ('+str(hilos)+' threads)\x07')
print ''
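# Very loose regexes for pulling href= / src= values out of raw HTML.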
linkregex = re.compile('<a.*href *= *[\'"](.*?)[\'"].*>')
imgregex = re.compile('<img.*src *= *[\'"](.*?)[\'"].*>')
global lock
lock = False
#urls.printing = False
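# Solr cores used by the crawler (sunburnt clients), assumed to be running locally:
#   pages - indexed page content, urls - crawl frontier/status, doms - per-domain counters.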
pages = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/pages")
urls = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/urls")
doms = sunburnt.SolrInterface("http://127.0.0.1:8983/solr/doms")
global fulls
fulls = {}
import urllib2, urllib
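# glock()/llock() were apparently meant to serialize access to the "pages" core
# across threads; the early "return 1" in both leaves the lock effectively disabled.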
def glock():
    return 1
    global lock
    while True:
        if lock == False:
            lock = True
            #print '||'
            return 1
        time.sleep(0.05)
def llock():
    return 1
    global lock
    lock = False
    #print '|/'
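# Collapse newlines, tabs and runs of spaces into single spaces before indexing.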
def unlash(s):
    return s.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').replace('    ', ' ').replace('   ', ' ').replace('  ', ' ')
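# One crawler worker. Each instance pulls a pending URL from the "urls" core,
# fetches it through the HTTP proxy in `host`, rewrites Tor2web-style hostnames
# back to plain .onion, extracts links and images, and indexes the page in Solr.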
class fate():
    def __init__(self, id):
        self.id = id
        self.initc()
        self.dom = ''
        self.ers = 0
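    # (Re)build the urllib2 opener that routes all requests through the proxy.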
    def initc(self):
        ph = urllib2.ProxyHandler({'http':host})
        #ph = urllib2.ProxyHandler({'http':'127.0.0.1:8123'})
        self.opener = urllib2.build_opener(ph)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]
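    # Pick the next URL to crawl: prefer another pending URL from the current
    # domain, otherwise fall back to a random pending URL from any other domain.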
    def next_url(self):
        self.url = 'none'
        self.res = col(7)+'['+str(self.id)+']'
        try:
            if self.dom == '' or self.ers > 10:
                self.ers = 0
                time.sleep(5)
                raise Exception()
            self.crawling = urls.query(domain=self.dom, status=0).paginate(start=random.randint(0,10), rows=1)[0]
        except:
            self.crawling = urls.query(status=0).exclude(domain=self.dom).paginate(start=random.randint(0,100), rows=1)[0]
        del(self.crawling['_version_'])
        self.url = self.crawling['url']
        self.dom = self.crawling['domain']
        self.res += col(4)+' {'+self.dom+'}'
        self.set_status(-3)
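    # Write the crawl status code for the current URL back to the "urls" core.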
    def set_status(self, status):
        self.crawling['status'] = status
        urls.add(self.crawling)
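    # Find <a href> targets in the fetched HTML, normalise them to plain
    # http://<16-char>.onion/ URLs, and queue unseen ones in the "urls" core.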
    def extract_links(self):
        links = linkregex.findall(self.body)
        url = urlparse.urlparse(self.url)
        ok_links = 0
        for link in links:
            linkor = link
            if link.split('.')[-1] in ('txt', 'csv', 'docx', 'doc', 'mobi', 'jar', 'exe', 'gz', 'png', 'rss', 'tar', 'zip', 'deb', 'jpeg', 'jpg', 'gif', 'bmp', 'pdf', 'epub', 'avi', 'wmv', 'mpeg', 'mpeg4', 'mp3', 'mp4'):
                continue
            if link.find("'''") != -1:
                continue
            #if len(link) > 200:
            #    continue
            if link.startswith('/'):
                link = 'http://' + url[1] + link
            elif link.startswith('#'):
                continue #Just for now
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link
            if link.find('.onion') == -1:
                continue
            if link.find('.onion.to') != -1:
                link = link.replace('.onion.to', '.onion', 1)
            if link.find('.tor2web.fi') != -1:
                link = link.replace('.tor2web.fi', '.onion', 1)
            if link.find('.onionr2web.org') != -1:
                link = link.replace('.onionr2web.org', '.onion', 1)
            if link[-6:] == '.onion':
                link = link+'/'
            if link.startswith('https'):
                link = link.replace('https', 'http', 1)
            if links.count(linkor) > 1:
                continue
            try:
                dom = self.url_split(link)['domain']
                if len(dom) != 16:
                    continue
            except:
                continue
            if fulls.has_key(dom):
                continue
            if len(urls.query(url=link)) == 0:
                full = None
                for i in range(100):
                    try:
                        full = doms.query(domain=dom)[0]['full']
                        break
                    except:
                        time.sleep(0.1)
                if full == None:
                    doms.add({'domain':dom, 'num':0, 'full':0})
                    full = 0
                if full == 1:
                    fulls[dom] = None
                    continue
                urls.add({'url':link, 'domain':dom, 'status':0})
                self.increase(dom)
                ok_links += 1
        self.res += ' '+col(6) + str(ok_links)+'/'+str(len(links))+'+links'
        #paint(self.id, 6)
        #print ok_links,'/', len(links), 'links added!'
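    # Same normalisation as extract_links(), but for <img src> URLs; the results
    # are kept on the instance and attached to the page document at index time.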
    def extract_images(self):
        images = imgregex.findall(self.body)
        self.images = []
        url = urlparse.urlparse(self.url)
        ok_img = 0
        for img in images:
            #if img.split('.')[-1] not in ('png','jpeg', 'jpg', 'gif', 'bmp'):
            #    continue
            if img.find("'''") != -1:
                continue
            if img.startswith('/'):
                img = 'http://' + url[1] + img
            elif img.startswith('#'):
                continue #Just for now
                img = 'http://' + url[1] + url[2] + img
            elif not img.startswith('http'):
                img = 'http://' + url[1] + '/' + img
            if img.find('.onion') == -1:
                continue
            if img.find('.onion.to') != -1:
                img = img.replace('.onion.to', '.onion', 1)
            if img.find('.tor2web.fi') != -1:
                img = img.replace('.tor2web.fi', '.onion', 1)
            if img.find('.onionr2web.org') != -1:
                img = img.replace('.onionr2web.org', '.onion', 1)
            if img[-6:] == '.onion':
                img = img+'/'
            if img.startswith('https'):
                img = img.replace('https', 'http', 1)
            if img not in self.images:
                self.images.append(img)
                ok_img += 1
        self.res += ' '+col(2) + str(ok_img)+'/'+str(len(images))+'+images'
        #paint(self.id, 2)
        #print ok_img,'/', len(images), 'images added!'
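    # Fetch the current URL through the proxy. Returns 1 on success (HTML body
    # stored in self.body), 0 on any HTTP/URL error or non-HTML content type.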
    def get(self, trys=1):
        try:
            #paint(self.id, 4)
            #print 'Opening', self.url
            start = time.time()
            try:
                req = self.opener.open(self.url.encode('utf-8'), None, 120)
            except urllib2.HTTPError, e:
                self.set_status(e.code)
                print col(1)+' {'+self.dom+'} CODE(html):', e.code
                return 0
            except urllib2.URLError, e:
                print col(1)+' {'+self.dom+'} CODE(url):', e.args[0][0], e.args[0][1]
                if e.args[0][0] == 111:
                    time.sleep(100)
                    self.set_status(0)
                    return 0
                self.set_status(e.args[0][0])
                return 0
            t = time.time()-start
            self.res += col(3)+' %.3f' % t + 's'
            html = req.read()
            if req.headers["Content-Type"].find('text/html') == -1:
                self.set_status(-2)
                paint(self.id, 5)
                print 'Non-HTML', self.url
                self.ers += 1
                return 0
            self.body = unicode(html.decode('utf-8', 'ignore'))
            return 1
        except Exception, E:
            paint(self.id, 7)
            print 'Error[', E, '] opening', self.url
            print col(1)+' {'+self.dom+'}'
            self.ers += 1
            self.set_status(-1)
            return 0
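    # Strip the HTML down to text with nltk.clean_html() and add the page
    # (text, raw HTML, domain and image URLs) to the "pages" core.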
    def index(self):
        readable = nltk.clean_html(self.body.lower())
        page = {'url':self.url, 'content':unlash(readable), 'html':self.body, 'domain':self.dom}
        for i in range(len(self.images)):
            page['img_'+str(i)] = self.images[i]
        glock()
        try:
            pages.add(page)
            self.ers = 0
            #si_pages.commit()
        except Exception, E:
            try:
                pages.add(page)
                self.ers = 0
            except:
                self.ers += 1
                raise Exception('Could not index page!')
        finally:
            llock()
        #paint(self.id, 3)
        #print 'Indexed', self.url
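    # Bump the per-domain URL counter; domains with more than 5048 queued URLs
    # are flagged full=1 so they stop being expanded further.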
    def increase(self, dom):
        for i in range(2):
            try:
                d = doms.query(domain=dom)[0]
                del(d['_version_'])
                d['num'] += 1
                if d['num'] > 5048:
                    d['full'] = 1
                doms.add(d)
            except:
                time.sleep(0.01)
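    # Split an http://<domain>.onion/<path> URL into its domain and path parts.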
    def url_split(self, url):
        if type(url) == dict:
            return url
        url = url.split('.onion')
        domain = url[0].replace('http://', '')
        path = '.onion'.join(url[1:])
        return {'domain':domain, 'path':path}
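    # Main crawl loop: fetch, extract, index, and record a status for each URL,
    # retrying each step up to 20 times before letting the exception surface.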
    def run(self):
        while True:
            try:
                for i in range(20):
                    try:
                        self.next_url()
                        break
                    except:
                        if i == 19:
                            self.next_url()
                if not self.get(2):
                    continue
                for i in range(20):
                    try:
                        self.extract_links()
                        break
                    except:
                        if i == 19:
                            self.extract_links()
                for i in range(20):
                    try:
                        self.extract_images()
                        break
                    except:
                        if i == 19:
                            self.extract_images()
                try:
                    for i in range(20):
                        try:
                            self.index()
                            self.set_status(1)
                            break
                        except:
                            if i == 19:
                                self.index()
                                self.set_status(1)
                except:
                    self.ers += 1
                    self.set_status(-5)
                print self.res
            except Exception, E:
                self.initc()
                paint(self.id, 1)
                try:
                    s = self.url
                    if s == 'none':
                        raise Exception()
                    self.set_status(-1)
                except:
                    print 'Err 124'
                print 'Crawler crashed cuz', E
                self.ers += 1
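# Thread entry point: build one crawler worker and run it forever.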
def start_thread(id):
    f = fate(id)
    f.run()
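# One-off maintenance blocks below; the if 0 / if 1 guards are toggled by hand.
# This first block seeds the "urls"/"doms" cores with a starting .onion page.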
if 0:
    #urls.delete_all()
    #doms.delete_all()
    #pages.delete_all()
    #time.sleep(0.1)
    #urls.add({'url':'http://torlinkbgs6aabns.onion/', 'domain':'torlinkbgs6aabns', 'status':0})
    #doms.add({'domain':'torlinkbgs6aabns', 'num':0, 'full':0})
    #urls.add({'url':'http://dppmfxaacucguzpc.onion/', 'domain':'dppmfxaacucguzpc', 'status':0})
    #doms.add({'domain':'dppmfxaacucguzpc', 'num':0, 'full':0})
    #urls.add({'url':'http://kpvz7ki2v5agwt35.onion/wiki/index.php/Main_Page', 'domain':'kpvz7ki2v5agwt35', 'status':0})
    #doms.add({'domain':'kpvz7ki2v5agwt35', 'num':0, 'full':0})
    urls.add({'url':'http://bdpuqvsqmphctrcs.onion/noscript.html', 'domain':'bdpuqvsqmphctrcs', 'status':0})
    doms.add({'domain':'bdpuqvsqmphctrcs', 'num':0, 'full':0})
    time.sleep(0.5)
    #f = fate(0)
    #f.run()
    exit()
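# NOTE: with this guard left at "if 1:", the script only resets 5xx statuses back
# to 0 and exits; flip it to 0 to fall through to the crawl threads at the bottom.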
if 1:
    #RESET ERRORS!
    for foo in range(10):
        for err in (500, 501, 502, 503, 504, 505):
            u = urls.query(status=err)
            print 'done!'
            for i in u:
                i['status'] = 0
                del(i['_version_'])
                urls.add(i)
                print err, i['url']
        urls.commit()
    exit()
if 0:
    #REWRITE .onion.to URLS BACK TO .onion!
    for foo in range(1):
        u = urls.query()
        print 'done!'
        for i in u:
            if i['url'].find('.onion.to/') != -1:
                urls.delete(i)
                print i['url']
                i['url'] = i['url'].replace('.onion.to/', '.onion/', 1)
                del(i['_version_'])
                urls.add(i)
        urls.commit()
    exit()
if 0:
    #RESET FULL DOMS!
    u = doms.query(full=1).paginate(start=0, rows=2000000)
    for i in range(len(u)):
        e = u[i]
        e['full'] = 0
        del(e['_version_'])
        doms.add(e)
        doms.commit()
        if i % 1 == 0:
            print i, u[i]['domain']
    print len(u)
    exit()
if 0:
    u = doms.query(domain='7haz75ietrhjds3j').paginate(start=0, rows=2000000)
    for i in u:
        i['full'] = 1
        del(i['_version_'])
        doms.add(i)
        print i
    exit()
if 0:
    time.sleep(500)
    u = pages.query()
    n = 0
    for e in u:
        n += 1
        if len(e['domain']) != 16:
            pages.delete(e)
            print e['domain']
        if n % 1000 == 0:
            print n
    print len(u)
    exit()
if 0:
    t = fate(0)
    t.res = ''
    import parser
    t.body = parser.s
    t.url = 'http://ahmia.fi/'
    t.dom = 'ahmia'
    t.extract_links()
    exit()
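# Spawn the crawl threads (ids 10 .. 10+hilos-1) and block on the last one so
# the main thread stays alive.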
for j in range(10, 10+hilos):
    t = Thread(target=start_thread, args=(j,))
    t.start()
    time.sleep(0.1)
t.join()