Last active
November 8, 2016 13:51
-
-
Save IuryAlves/a1f65f59259a71c74e464e63c396163b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8
### REQUIREMENTS ###
# * python 3.5+
# * aiohttp (pip install aiohttp)
import argparse | |
import os | |
import asyncio | |
from xml.etree import ElementTree | |
import aiohttp | |
def xml_image_iter(root):
    """Yield the text of every ``*loc`` element exactly three levels below *root*.

    Only great-grandchildren of *root* are inspected (root -> child ->
    grandchild -> great-grandchild); elements at other depths are ignored.
    An element matches when its (possibly namespace-qualified) tag ends
    with ``'loc'``.

    Args:
        root: An ``xml.etree.ElementTree.Element`` (for example the root of
            a parsed sitemap file).

    Yields:
        str: The element text with surrounding whitespace removed. Elements
        whose text is missing or blank are skipped so that callers never
        receive ``None`` or an empty string in place of a URL.
    """
    for element in root:
        for child in element:
            for grandchild in child:
                if not grandchild.tag.endswith('loc'):
                    continue
                # ``.text`` is None for an empty element such as <loc/>, and
                # pretty-printed XML often pads the value with newlines.
                text = (grandchild.text or '').strip()
                if text:
                    yield text
async def main(urls):
    """Request each URL in turn and print the ones answered with HTTP 403.

    All requests share one ``aiohttp.ClientSession`` and are issued
    sequentially, so matching URLs are printed in the order they appear in
    *urls*.

    Args:
        urls: Iterable of URL strings to request with GET.

    A request that fails at the client level (connection error, invalid
    URL, timeout, ...) is reported on stderr and skipped, so one bad URL
    does not abort the scan of the remaining ones.
    """
    # Local import: the module's import block does not import ``sys``.
    import sys

    async with aiohttp.ClientSession() as session:
        for url in urls:
            try:
                async with session.get(url) as response:
                    if response.status == 403:
                        print(url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # Keep diagnostics off stdout so the list of 403 URLs stays
                # clean for piping into other tools.
                print('error fetching {}: {!r}'.format(url, exc), file=sys.stderr)
def find_sitemap_files(directory):
    """Return the sorted paths of the ``.xml`` entries directly in *directory*.

    ``os.listdir`` returns bare names, so each name is joined with
    *directory*; the resulting paths can be opened regardless of the
    current working directory.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        list[str]: Paths of entries whose name ends with ``.xml``, sorted
        so the processing order is deterministic.
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.xml')
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Find all urls in image sitemaps that returns 403.')
    parser.add_argument('--dir', default='.')
    args = parser.parse_args()

    # Collect the URLs from every .xml file found in the chosen directory.
    urls = []
    for path in find_sitemap_files(args.dir):
        root = ElementTree.parse(path).getroot()
        urls.extend(xml_image_iter(root))

    # Create the loop explicitly: calling asyncio.get_event_loop() without a
    # running loop is deprecated on newer Pythons, while asyncio.run() is not
    # available on the 3.5/3.6 versions this script still supports.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(urls))
    finally:
        loop.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment