Skip to content

Instantly share code, notes, and snippets.

@johnstcn
Created November 12, 2016 04:20
Show Gist options
  • Save johnstcn/da428b8bfcbfc75e66c61b61c9035fcc to your computer and use it in GitHub Desktop.
Save johnstcn/da428b8bfcbfc75e66c61b61c9035fcc to your computer and use it in GitHub Desktop.
Fetch a list of URLs in parallel, run a regex on each response body, and output the matches.
#!/usr/bin/env python
import argparse
import re
import grequests
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Args:
        l: A sliceable sequence with a length (list, tuple, str, ...).
        n: Maximum size of each chunk; must be at least 1.

    Yields:
        Consecutive slices of ``l`` holding ``n`` items each; the final
        slice is shorter when ``len(l)`` is not a multiple of ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised on first iteration,
            because this is a generator).
    """
    # A negative step would make range() empty and silently drop every
    # item, so reject it explicitly instead of yielding nothing.
    if n < 1:
        raise ValueError("chunk size must be a positive integer, got %r" % (n,))
    for start in range(0, len(l), n):
        yield l[start:start + n]
def perform(args):
    """Fetch each URL from ``args.input`` and write regex matches to ``args.output``.

    URLs are requested in batches of ``args.concurrency``. For every response
    whose body matches ``args.regex`` one line is written to ``args.output``:
    the request URL, a tab, and the comma-separated capture groups.

    Args:
        args: Namespace with ``input`` (readable text stream, one URL per
            line), ``output`` (writable text stream), ``regex`` (pattern
            string) and ``concurrency`` (batch size).
    """
    # Compile once; the same pattern is applied to every response.
    pattern = re.compile(args.regex)

    # Strip whitespace and drop blank lines so no request is issued for an
    # empty URL (e.g. a trailing newline at the end of the input file).
    urls = [url for url in (line.strip() for line in args.input.readlines()) if url]

    for batch in chunks(urls, args.concurrency):
        responses = grequests.map(grequests.get(url) for url in batch)
        for resp in responses:
            # grequests.map() puts None in the result list for requests that
            # failed (connection error, timeout, ...); skip those.
            if resp is None:
                continue
            # Use the decoded text: matching a str pattern against the raw
            # bytes in resp.content raises TypeError on Python 3.
            match = pattern.search(resp.text)
            if match:
                # Optional groups that did not participate are None, which
                # str.join() rejects; emit them as empty fields instead.
                fields = ','.join(
                    group if group is not None else '' for group in match.groups()
                )
                args.output.write("%s\t%s\n" % (resp.request.url, fields))
def _positive_int(value):
    """argparse ``type`` callback: parse *value* as an integer that is >= 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(
            "must be a positive integer, got %r" % (value,)
        )
    return number


def main():
    """Parse the command line and run the fetch-and-match job.

    Positional arguments are the input file (one URL per line), the output
    file, and the regular expression; ``--concurrency`` sets how many URLs
    are requested per batch (default 10).
    """
    # Local import: only needed here to recognise the standard streams.
    import sys

    parser = argparse.ArgumentParser(
        description="Get a list of URLs, run a regex on the content and "
                    "output the match. In parallel."
    )
    parser.add_argument("input", type=argparse.FileType('r'),
                        help="file with one URL per line ('-' for stdin)")
    parser.add_argument("output", type=argparse.FileType('w'),
                        help="file to write matches to ('-' for stdout)")
    parser.add_argument("regex", type=str,
                        help="regular expression searched for in each response body")
    parser.add_argument("--concurrency", type=_positive_int, default=10,
                        help="number of URLs requested per batch (default: 10)")
    args = parser.parse_args()
    try:
        perform(args)
    finally:
        # Close the files argparse opened so buffered output is flushed even
        # when perform() raises; leave the process-wide std streams alone.
        for stream in (args.input, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment