from threading import active_count, Thread
from time import sleep

try:
    from elasticsearch import Elasticsearch
except ImportError:
    print('Requires elasticsearch: pip install elasticsearch')
    exit(1)
try:
    from queue import Empty, Queue
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
try:
    from requests import Session
except ImportError:
    print('Requires requests: pip install requests')
    exit(1)
try:
    from twitter import Api
except ImportError:
    print('Requires python-twitter: pip install python-twitter')
    exit(1)
try:
    from urllib.parse import urlparse
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
# Number of threads making /robots.txt requests in parallel
THREAD_COUNT = 100

# Instantiate the Twitter API
# Fill in your own values from https://developer.twitter.com/apps
api = Api(
    consumer_key='',
    consumer_secret='',
    access_token_key='',
    access_token_secret='',
)
# Instantiate the Elasticsearch client (assumes a node on localhost:9200)
es = Elasticsearch()

# Spoof browser headers (these are Tor Browser's defaults)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
# SOCKS proxy so requests go through the Tor daemon
# For the Tor Browser proxy use port 9150
# For the Brave Tor mode proxy use port 9350
# Note: socks5h:// requires SOCKS support in requests (pip install requests[socks])
# Warning: this is NOT used for the Twitter API, only the robots.txt requests
PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}
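

# Optional sanity check, not part of the original pipeline: verify that the
# SOCKS proxy actually routes through Tor before starting the crawl.
# Assumes a Tor daemon is already running on 127.0.0.1:9050.
def check_tor():
    s = Session()
    s.proxies.update(PROXIES)
    r = s.get('https://check.torproject.org', headers=HEADERS, timeout=30)
    return 'Congratulations' in r.text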


def main():
    # Input queue gets filled with URL origins
    qin = Queue(maxsize=100)
    # Output queue gets filled with (origin, headers, disallow) tuples
    qout = Queue(maxsize=100)
    # Start the source thread
    start_thread(source, qin)
    # Start the worker threads
    for i in range(THREAD_COUNT):
        start_thread(worker, qin, qout)
    while True:
        try:
            origin, headers, disallow = qout.get(timeout=10)
            es.index(
                index='robots',
                body={
                    'origin': origin,
                    'headers': headers,
                    'disallow': disallow,
                },
            )
            print(origin)
        except Empty:
            # No output before the timeout.
            # Check the thread count and bail if too few threads remain.
            if active_count() <= 2:
                break
        except KeyboardInterrupt:
            break
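
# The documents indexed above have this rough shape (illustrative values only):
#
#   {
#       'origin': 'https://example.com',
#       'headers': {'Content-Type': 'text/plain', ...},
#       'disallow': ['/admin', '/private'],
#   }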


# Read origins from twitter_stream and write them to qin for the workers
def source(qin):
    # XXX this deduplication set will continue to grow without bound
    done = set()
    for url in twitter_stream():
        if url not in done:
            qin.put(url)
            done.add(url)


# Get origins from qin, fetch and parse their robots.txt, put results in qout
def worker(qin, qout):
    while True:
        s = Session()
        s.headers.update(HEADERS)
        s.proxies.update(PROXIES)
        try:
            origin = qin.get(timeout=60)
            with s.get(origin + '/robots.txt', stream=True) as r:
                headers = dict(r.headers)
                disallow = parse_paths(r.iter_lines(decode_unicode=True))
                if disallow:
                    disallow = sorted(disallow)
                    qout.put((origin, headers, disallow))
        except Empty:
            # No more data in the queue, something must be wrong, bail
            return
        except Exception:
            # ¯\_(ツ)_/¯
            continue


# Parse robots.txt lines and return the set of disallowed paths
def parse_paths(lines):
    paths = set()
    for i, line in enumerate(lines):
        if i > 100 and len(paths) == 0:
            # No paths after 100 lines... Just bail...
            break
        if len(paths) > 100:
            # More than 100 paths!? Bail...
            break
        if ':' not in line[:16]:
            continue
        key, value = line.split(':', 1)
        key = key.strip().lower()
        if key != 'disallow':
            # Ignore everything except Disallow rules
            continue
        value = value.strip()
        if len(value) > 256:
            # Paths longer than 256 chars in a robots.txt file? Nah...
            continue
        if value in ('', '/'):
            # Boring, ignore these
            continue
        paths.add(value)
    return paths
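
# For example (illustrative input, not from the original gist), given a
# robots.txt containing:
#
#   User-agent: *
#   Disallow: /admin
#   Disallow: /
#   Allow: /images
#
# parse_paths() returns {'/admin'}: the Allow rule and the bare '/' are ignored.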


# Infinitely stream URL origins from links in tweets in real time
def twitter_stream():
    # Backoff delay to avoid being put in timeout by Twitter when errors occur
    backoff = 1
    while True:
        try:
            for tweet in api.GetStreamSample():
                # Reset the backoff since the request succeeded
                backoff = 1
                if 'entities' not in tweet:
                    continue
                entities = tweet['entities']
                if 'urls' not in entities:
                    continue
                for url in entities['urls']:
                    if 'unwound' in url:
                        u = url['unwound']['url']
                    else:
                        u = url['expanded_url']
                    p = urlparse(u)
                    yield '{x.scheme}://{x.netloc}'.format(x=p).lower()
        except Exception:
            # Sometimes the GetStreamSample connection fails
            sleep(backoff)
            # Exponential backoff for repeated errors
            backoff *= 2
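
# Note: only the lowercased scheme and host are yielded (e.g.
# 'https://example.com'), so each worker requests exactly one URL per origin:
# origin + '/robots.txt'.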


# Start fn(*args) in a daemonic thread
def start_thread(fn, *args):
    th = Thread(target=fn, args=args)
    th.daemon = True
    th.start()


# Start at main if executed as a program
if __name__ == '__main__':
    main()
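
# --- Usage sketch (assumptions, not part of the original gist) ---
# Assuming a Tor daemon on 127.0.0.1:9050 and an Elasticsearch node on
# localhost:9200, a rough setup is:
#
#   pip install elasticsearch requests[socks] python-twitter
#   python robots.py          # hypothetical filename for this script
#
# Collected documents can then be queried from the "robots" index, e.g.:
#
#   curl 'localhost:9200/robots/_search?q=disallow:admin'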