Created
June 12, 2019 16:04
-
-
Save wybiral/640ac594e7b08b262f6abe203c379684 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from threading import active_count, Thread
from time import sleep, time

# Third-party imports are guarded so a missing dependency produces a
# short install hint instead of a raw traceback.  The excepts are
# narrowed to ImportError: a bare `except:` would also swallow
# SystemExit and KeyboardInterrupt raised during import.
try:
    from elasticsearch import Elasticsearch
except ImportError:
    print('Requires elasticsearch: pip install elasticsearch')
    exit(1)
try:
    from queue import Empty, Queue
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
try:
    from requests import Session
except ImportError:
    print('Requires requests: pip install requests')
    exit(1)
try:
    from twitter import Api
except ImportError:
    print('Requires python-twitter: pip install python-twitter')
    exit(1)
try:
    from urllib.parse import urlparse
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
# Number of threads to open /robots.txt requests in parallel
THREAD_COUNT = 100

# Instantiate Twitter API client.
# Fill in your own values from https://developer.twitter.com/apps
# NOTE(review): Twitter API traffic goes over the normal network, NOT
# through the Tor proxy configured below (see PROXIES warning).
api = Api(
    consumer_key='',
    consumer_secret='',
    access_token_key='',
    access_token_secret='',
)

# Instantiate ElasticSearch client (default connection settings).
es = Elasticsearch()

# Spoof a browser header (this is Tor Browser)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# SOCKS proxy for requests to go through Tor daemon
# For Tor Browser proxy use port 9150
# For Brave Tor mode proxy use port 9350
# Warning: this is NOT used for the Twitter API, only robots.txt requests
PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050'
}
def main():
    """Wire the Twitter source thread to the worker pool and index results.

    Pulls (origin, headers, disallow) tuples off the output queue and
    writes each one to the "robots" Elasticsearch index.  Exits when the
    workers have all timed out and died, or on Ctrl-C.
    """
    # Bounded queues apply back-pressure between the pipeline stages.
    urls = Queue(maxsize=100)      # filled by the source thread
    results = Queue(maxsize=100)   # filled by the worker threads

    start_thread(source, urls)
    for _ in range(THREAD_COUNT):
        start_thread(worker, urls, results)

    while True:
        try:
            origin, headers, disallow = results.get(timeout=10)
            doc = {
                'origin': origin,
                'headers': headers,
                'disallow': disallow,
            }
            es.index(index="robots", body=doc)
            print(origin)
        except Empty:
            # Nothing produced within the timeout.  If only the main
            # thread and the source thread remain, every worker has
            # exited, so there is nothing left to wait for.
            if active_count() <= 2:
                break
        except KeyboardInterrupt:
            break
# Read URL origins from a stream and write unique ones to qin for workers
def source(qin, stream=None):
    """Feed unique URL origins into the worker input queue.

    Parameters:
        qin: queue.Queue consumed by the worker threads; put() may block
            when the queue is full (back-pressure on the stream).
        stream: iterable of origin strings.  Defaults to the live
            twitter_stream() generator, preserving the original behavior.

    The dedupe set is bounded (fixing the original "XXX THIS WILL
    CONTINUE TO GROW" leak): once it reaches 1M entries it is cleared,
    trading occasional re-crawls of old origins for bounded memory.
    """
    if stream is None:
        stream = twitter_stream()
    done = set()
    for url in stream:
        if url not in done:
            if len(done) >= 1_000_000:
                # Cap memory use in a long-running process.
                done.clear()
            qin.put(url)
            done.add(url)
# Get data from qin, parse robots.txt from urls, put results in qout
def worker(qin, qout):
    """Fetch and parse /robots.txt for each origin pulled from qin.

    Puts (origin, headers_dict, sorted_disallow_paths) onto qout for
    origins whose robots.txt yields at least one interesting Disallow
    path.  Returns (ending the thread) when qin has been empty for 60
    seconds.  All fetch/parse errors are skipped best-effort.
    """
    # One session per worker thread, reused across requests -- the
    # original rebuilt the Session (and its connection pool) on every
    # loop iteration for no benefit.
    s = Session()
    s.headers.update(HEADERS)
    s.proxies.update(PROXIES)
    while True:
        try:
            origin = qin.get(timeout=60)
            # stream=True lets parse_paths bail early without
            # downloading an oversized robots.txt in full.
            with s.get(origin + '/robots.txt', stream=True) as r:
                headers = dict(r.headers)
                disallow = parse_paths(r.iter_lines(decode_unicode=True))
                if disallow:
                    qout.put((origin, headers, sorted(disallow)))
        except Empty:
            # No more data in the queue, something must be wrong, bail
            return
        except Exception:
            # Best-effort crawl: any network/parse error skips the URL.
            continue
# Parse robots.txt and return set of disallowed paths
def parse_paths(lines):
    """Extract the set of Disallow path values from robots.txt lines.

    Bails out early on files that look useless (no rules within the
    first ~100 lines) or abusive (over 100 rules collected), and skips
    empty, root-only ('/'), or absurdly long (>256 char) values.
    """
    found = set()
    for lineno, raw in enumerate(lines):
        if lineno > 100 and not found:
            # No paths after 100 lines... Just bail...
            break
        if len(found) > 100:
            # More than 100 paths!? Bail...
            break
        # A real rule has its "key:" prefix near the start of the line.
        if ':' not in raw[:16]:
            continue
        field, _, rest = raw.partition(':')
        if field.strip().lower() != 'disallow':
            # Ignore everything except disallow rules
            continue
        path = rest.strip()
        if len(path) > 256:
            # Paths larger than 256 chars in robots.txt file? Nah...
            continue
        if path == '' or path == '/':
            # Boring, ignore these
            continue
        found.add(path)
    return found
# Infinitely stream URL origins from Twitter links in real-time
def twitter_stream():
    """Yield lowercase URL origins ('scheme://netloc') from the live
    Twitter sample stream, forever.

    Reconnects on any streaming error using exponential backoff, now
    capped at 64 seconds -- the original doubled forever, so a long
    outage could leave the process sleeping for hours after recovery.
    """
    # backoff to avoid being put in timeout by Twitter if errors occur
    backoff = 1
    while True:
        try:
            for tweet in api.GetStreamSample():
                # A successful read means the connection is healthy.
                backoff = 1
                if 'entities' not in tweet:
                    continue
                entities = tweet['entities']
                if 'urls' not in entities:
                    continue
                for url in entities['urls']:
                    # Prefer the fully-unwound URL when Twitter provides it.
                    if 'unwound' in url:
                        u = url['unwound']['url']
                    else:
                        u = url['expanded_url']
                    p = urlparse(u)
                    yield '{x.scheme}://{x.netloc}'.format(x=p).lower()
        except Exception:
            # Sometimes GetStreamSample connection fails
            sleep(backoff)
            # exponential backoff for repeated errors, capped at 64s
            backoff = min(backoff * 2, 64)
# Start fn(*args) in daemonic thread
def start_thread(fn, *args):
    """Run fn(*args) on a daemon thread so it dies with the main thread."""
    Thread(target=fn, args=args, daemon=True).start()
# Start at main if executed as a program (not when imported as a module)
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment