Skip to content

Instantly share code, notes, and snippets.

@darkarnium
Created December 5, 2016 04:11
Show Gist options
  • Save darkarnium/3103dfb0b90f545d3e54a374d5752394 to your computer and use it in GitHub Desktop.
Save darkarnium/3103dfb0b90f545d3e54a374d5752394 to your computer and use it in GitHub Desktop.
Populate Sesshu with Alexa Top 1,000,000 sites.
#!/usr/bin/env python
import tqdm
import json
import click
import boto3
import multiprocessing
def request(target, plugin='http_robots'):
    ''' Serialises a single work request to a JSON string.

    The target is prefixed with 'http://' and stored under the 'target' key;
    the plugin name is stored unchanged under the 'plugin' key.
    '''
    payload = {}
    payload['target'] = 'http://{0}'.format(target)
    payload['plugin'] = plugin
    encoded = json.dumps(payload)
    return encoded
def dispatch(work, destination, results, id, block):
    ''' Publishes one worker's share of the domains to an SNS topic.

    The worker handles the entries of `work` from index `block * id` up to,
    but not including, `block * (id + 1)`.

    Args:
        work: Sequence of domains shared by all workers.
        destination: ARN of the SNS topic to publish to.
        results: Queue that receives one True per published message; used by
            the caller for counting / status only.
        id: Zero-based worker number, selecting which window of `work` this
            worker handles. (Shadows the builtin of the same name; the name
            is kept so existing callers keep working.)
        block: Number of entries each worker is responsible for.

    Raises:
        Whatever the SNS client raises. Failures are deliberately not caught,
        so that an error terminates this worker.
    '''
    c = boto3.client('sns')

    # Iterate over the domains in this worker's window and submit each to
    # SNS. A slice is used rather than xrange(), which exists on Python 2
    # only; it covers exactly the same indices.
    start = block * id
    for domain in work[start:start + block]:
        c.publish(
            TopicArn=destination,
            Message=request(domain)
        )
        # Reached only once the publish succeeded.
        results.put(True)
# Command-line entry point: reads one domain per line from --source, splits
# the list between --workers processes which each publish their share to the
# SNS topic given by --arn, and shows a progress bar until every publish has
# been reported back.
#
# The docstring of main() is kept short on purpose: click prints it as the
# --help text.
@click.command()
@click.option('--arn', help='Destination ARN (Amazon SNS)')
@click.option('--source', help='File containing list of domains')
@click.option('--workers', default=4, help='Workers to spawn (default: 4)')
def main(arn, source, workers):
    ''' Populate SQS with input domains. '''
    # The queue 'Empty' exception lives in a differently named module on
    # Python 2 and Python 3; import it locally so this works on both.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    if workers < 1:
        raise click.BadParameter(
            'at least one worker is required', param_hint='--workers')

    results = multiprocessing.Queue()
    with open(source) as in_file:
        domains = in_file.read().splitlines()
    total = len(domains)

    # Spawn workers, and submit the domains into the topic. Each worker is
    # handed its own chunk (as worker 0 with a block equal to the chunk
    # length), so that domains left over when the list does not divide evenly
    # are spread over the first workers instead of being dropped.
    block, extra = divmod(total, workers)
    processes = []
    start = 0
    for i in range(workers):
        size = block + (1 if i < extra else 0)
        if size == 0:
            # Fewer domains than workers: nothing left to hand out.
            break
        chunk = domains[start:start + size]
        start += size
        p = multiprocessing.Process(
            target=dispatch, args=(chunk, arn, results, 0, size))
        p.start()
        processes.append(p)

    # Monitor the results queue until all work has been complete.
    domain_counter = 0
    with tqdm.tqdm(total=total, unit='R', unit_scale=False) as pbar:
        while domain_counter < total:
            # Sample liveness *before* waiting: if no worker was running and
            # nothing arrives within the timeout, nothing ever will.
            running = any(p.is_alive() for p in processes)
            try:
                results.get(timeout=1)
            except Empty:
                if not running:
                    break
                continue
            domain_counter += 1
            pbar.update(1)

    for p in processes:
        p.join()

    # A worker that hit an error terminates without reporting its remaining
    # domains; surface that instead of exiting silently.
    if domain_counter != total:
        raise click.ClickException(
            'only {} of {} domains were published; a worker exited early'
            .format(domain_counter, total))
# Invoke the click command only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment