Last active
June 11, 2019 14:14
-
-
Save wybiral/20c20ccf00b6c93506b8acdc6ccb0c8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from threading import active_count, Thread
from time import sleep, time
# Each guarded import below catches only ImportError (narrowed from a bare
# except) so that real failures like KeyboardInterrupt or SystemExit are
# never swallowed during startup.
try:
    from queue import Empty, Queue
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
try:
    from requests import Session
except ImportError:
    print('Requires requests: pip install requests')
    exit(1)
try:
    from twitter import Api
except ImportError:
    print('Requires python-twitter: pip install python-twitter')
    exit(1)
try:
    from urllib.parse import urlparse
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
# Number of threads to open /robots.txt requests in parallel
THREAD_COUNT = 100
# Instantiate Twitter API client (used by twitter_stream below)
# Fill in your own values from https://developer.twitter.com/apps
api = Api(
    consumer_key='... YOUR CONSUMER API KEY ...',
    consumer_secret='... YOUR CONSUMER API SECRET KEY ...',
    access_token_key='... YOUR ACCESS TOKEN ...',
    access_token_secret='... YOUR ACCESS TOKEN SECRET ...',
)
# Spoof a browser header (this is Tor Browser); sent on every
# robots.txt request so the scraper blends in with normal traffic
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
# SOCKS proxy for requests to go through Tor daemon
# For Tor Browser proxy use port 9150
# For Brave Tor mode proxy use port 9350
# Warning: this is NOT used for the Twitter API, only robots.txt requests
PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050'
}
def main():
    # urls to fetch flow through qin; (url, paths) results through qout
    qin = Queue(maxsize=100)
    qout = Queue(maxsize=100)
    # Single producer thread feeding urls from the Twitter stream
    start_thread(source, qin)
    # Pool of worker threads fetching robots.txt files in parallel
    for _ in range(THREAD_COUNT):
        start_thread(worker, qin, qout)
    # Drain results forever, printing each origin with its disallowed paths
    while True:
        try:
            origin, disallowed = qout.get(timeout=10)
            print(origin)
            for p in sorted(disallowed):
                print(' '+ p)
        except KeyboardInterrupt:
            break
        except Empty:
            # Nothing arrived within the timeout; if only this thread and
            # the source thread remain, the workers have all bailed
            if active_count() <= 2:
                break
# Read from twitter_stream and write strings to qin for workers
def source(qin):
    # Every origin ever emitted is remembered so duplicates go out once.
    # XXX THIS WILL CONTINUE TO GROW
    seen = set()
    for origin in twitter_stream():
        if origin in seen:
            continue
        qin.put(origin)
        seen.add(origin)
# Get data from qin, parse robots.txt from urls, put results in qout
def worker(qin, qout):
    # Build the session once: rebuilding it on every loop iteration (as
    # before) discarded connection pooling and re-applied the same
    # headers/proxies for no benefit.
    s = Session()
    s.headers.update(HEADERS)
    s.proxies.update(PROXIES)
    while True:
        try:
            url = qin.get(timeout=60)
        except Empty:
            # No more data in the queue, something must be wrong, bail
            return
        try:
            with s.get(url + '/robots.txt', stream=True) as r:
                paths = parse_paths(r.iter_lines(decode_unicode=True))
                if paths:
                    qout.put((url, paths))
        except Exception:
            # Per-host network/parse failures are expected over Tor; skip
            # the url and keep the worker alive. Narrowed from a bare
            # except so KeyboardInterrupt/SystemExit still propagate.
            continue
# Parse robots.txt and return set of disallowed paths.
# `lines` is any iterable of text lines; returns a set of path strings.
# Heuristic limits keep pathological files from wasting worker time.
def parse_paths(lines):
    paths = set()
    for i, line in enumerate(lines):
        if i > 100 and len(paths) == 0:
            # No paths after 100 lines... Just bail...
            break
        if len(paths) > 100:
            # More than 100 paths!? Bail...
            break
        if ':' not in line[:16]:
            # Field names are short, so a real rule has a colon early on
            continue
        key, value = line.split(':', 1)
        key = key.strip().lower()
        if key != 'disallow':
            # Ignore everything except disallow rules
            continue
        # robots.txt allows trailing '#' comments (RFC 9309); strip them
        # so 'Disallow: /x # note' yields '/x' rather than '/x # note'
        value = value.split('#', 1)[0].strip()
        if len(value) > 256:
            # Paths larger than 256 chars in robots.txt file? Nah...
            continue
        if value in ('', '/'):
            # Boring, ignore these
            continue
        paths.add(value)
    return paths
# Infinitely stream URL origins from Twitter links in real-time.
# Yields lowercase 'scheme://netloc' strings, reconnecting on errors.
def twitter_stream():
    # backoff to avoid being put in timeout by Twitter if errors occur
    backoff = 1
    while True:
        try:
            for tweet in api.GetStreamSample():
                # Reset backoff since request succeeded
                backoff = 1
                if 'entities' not in tweet:
                    continue
                entities = tweet['entities']
                if 'urls' not in entities:
                    continue
                for url in entities['urls']:
                    # Prefer the fully-unwound url when Twitter provides it
                    if 'unwound' in url:
                        u = url['unwound']['url']
                    else:
                        u = url['expanded_url']
                    p = urlparse(u)
                    yield '{x.scheme}://{x.netloc}'.format(x=p).lower()
        except Exception:
            # Sometimes GetStreamSample connection fails
            sleep(backoff)
            # Exponential backoff for repeated errors, capped at 5 minutes
            # (previously unbounded, so long outages grew the delay forever)
            backoff = min(backoff * 2, 300)
# Start fn(*args) in daemonic thread
def start_thread(fn, *args):
    # Daemonic so the process can exit without joining these threads
    background = Thread(target=fn, args=args, daemon=True)
    background.start()
# Start at main if executed at a program | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment