Last active
June 11, 2019 14:14
-
-
Save wybiral/20c20ccf00b6c93506b8acdc6ccb0c8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from threading import active_count, Thread
from time import sleep, time
# Each guarded import below catches only ImportError (narrowed from a bare
# except) so that real failures like KeyboardInterrupt or SystemExit are
# never swallowed during startup.
try:
    from queue import Empty, Queue
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
try:
    from requests import Session
except ImportError:
    print('Requires requests: pip install requests')
    exit(1)
try:
    from twitter import Api
except ImportError:
    print('Requires python-twitter: pip install python-twitter')
    exit(1)
try:
    from urllib.parse import urlparse
except ImportError:
    print('Requires python3 (stop using python2)')
    exit(1)
# Number of threads to open /robots.txt requests in parallel
THREAD_COUNT = 100
# Instantiate Twitter API client (used by twitter_stream below)
# Fill in your own values from https://developer.twitter.com/apps
api = Api(
    consumer_key='... YOUR CONSUMER API KEY ...',
    consumer_secret='... YOUR CONSUMER API SECRET KEY ...',
    access_token_key='... YOUR ACCESS TOKEN ...',
    access_token_secret='... YOUR ACCESS TOKEN SECRET ...',
)
# Spoof a browser header (this is Tor Browser); sent on every
# robots.txt request so the scraper blends in with normal traffic
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
# SOCKS proxy for requests to go through Tor daemon
# For Tor Browser proxy use port 9150
# For Brave Tor mode proxy use port 9350
# Warning: this is NOT used for the Twitter API, only robots.txt requests
PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050'
}
def main():
    # urls to fetch flow through qin; (url, paths) results through qout
    qin = Queue(maxsize=100)
    qout = Queue(maxsize=100)
    # Single producer thread feeding urls from the Twitter stream
    start_thread(source, qin)
    # Pool of worker threads fetching robots.txt files in parallel
    for _ in range(THREAD_COUNT):
        start_thread(worker, qin, qout)
    # Drain results forever, printing each origin with its disallowed paths
    while True:
        try:
            origin, disallowed = qout.get(timeout=10)
            print(origin)
            for p in sorted(disallowed):
                print(' '+ p)
        except KeyboardInterrupt:
            break
        except Empty:
            # Nothing arrived within the timeout; if only this thread and
            # the source thread remain, the workers have all bailed
            if active_count() <= 2:
                break
# Read from twitter_stream and write strings to qin for workers
def source(qin):
    # Every origin ever emitted is remembered so duplicates go out once.
    # XXX THIS WILL CONTINUE TO GROW
    seen = set()
    for origin in twitter_stream():
        if origin in seen:
            continue
        qin.put(origin)
        seen.add(origin)
# Get data from qin, parse robots.txt from urls, put results in qout
def worker(qin, qout):
    # Build the session once: rebuilding it on every loop iteration (as
    # before) discarded connection pooling and re-applied the same
    # headers/proxies for no benefit.
    s = Session()
    s.headers.update(HEADERS)
    s.proxies.update(PROXIES)
    while True:
        try:
            url = qin.get(timeout=60)
        except Empty:
            # No more data in the queue, something must be wrong, bail
            return
        try:
            with s.get(url + '/robots.txt', stream=True) as r:
                paths = parse_paths(r.iter_lines(decode_unicode=True))
                if paths:
                    qout.put((url, paths))
        except Exception:
            # Per-host network/parse failures are expected over Tor; skip
            # the url and keep the worker alive. Narrowed from a bare
            # except so KeyboardInterrupt/SystemExit still propagate.
            continue
# Parse robots.txt and return set of disallowed paths.
# `lines` is any iterable of text lines; returns a set of path strings.
# Heuristic limits keep pathological files from wasting worker time.
def parse_paths(lines):
    paths = set()
    for i, line in enumerate(lines):
        if i > 100 and len(paths) == 0:
            # No paths after 100 lines... Just bail...
            break
        if len(paths) > 100:
            # More than 100 paths!? Bail...
            break
        if ':' not in line[:16]:
            # Field names are short, so a real rule has a colon early on
            continue
        key, value = line.split(':', 1)
        key = key.strip().lower()
        if key != 'disallow':
            # Ignore everything except disallow rules
            continue
        # robots.txt allows trailing '#' comments (RFC 9309); strip them
        # so 'Disallow: /x # note' yields '/x' rather than '/x # note'
        value = value.split('#', 1)[0].strip()
        if len(value) > 256:
            # Paths larger than 256 chars in robots.txt file? Nah...
            continue
        if value in ('', '/'):
            # Boring, ignore these
            continue
        paths.add(value)
    return paths
# Infinitely stream URL origins from Twitter links in real-time.
# Yields lowercase 'scheme://netloc' strings, reconnecting on errors.
def twitter_stream():
    # backoff to avoid being put in timeout by Twitter if errors occur
    backoff = 1
    while True:
        try:
            for tweet in api.GetStreamSample():
                # Reset backoff since request succeeded
                backoff = 1
                if 'entities' not in tweet:
                    continue
                entities = tweet['entities']
                if 'urls' not in entities:
                    continue
                for url in entities['urls']:
                    # Prefer the fully-unwound url when Twitter provides it
                    if 'unwound' in url:
                        u = url['unwound']['url']
                    else:
                        u = url['expanded_url']
                    p = urlparse(u)
                    yield '{x.scheme}://{x.netloc}'.format(x=p).lower()
        except Exception:
            # Sometimes GetStreamSample connection fails
            sleep(backoff)
            # Exponential backoff for repeated errors, capped at 5 minutes
            # (previously unbounded, so long outages grew the delay forever)
            backoff = min(backoff * 2, 300)
# Start fn(*args) in daemonic thread
def start_thread(fn, *args):
    # Daemonic so the process can exit without joining these threads
    background = Thread(target=fn, args=args, daemon=True)
    background.start()
# Start at main if executed at a program | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment