#!/usr/bin/env python
# coding: utf-8
### REQUIREMENTS ###
# * python 3.5+
# * aiohttp (pip install aiohttp)
import argparse
import asyncio
import os
import sys
from xml.etree import ElementTree

import aiohttp
def xml_image_iter(root):
    """Yield the text of each ``loc`` element three levels below *root*.

    Only elements at exactly that depth (root -> child -> grandchild ->
    element) are inspected. A tag matches when it ends with ``'loc'``, so
    namespace-qualified tags such as ``{namespace}loc`` match as well.
    NOTE(review): presumably this depth corresponds to
    urlset/url/image:image/image:loc in an image sitemap -- confirm against
    the sitemap files actually being scanned.

    Args:
        root: Root ``Element`` of a parsed XML document.

    Yields:
        The element text with surrounding whitespace removed. Elements with
        no text (``<loc/>``) or whitespace-only text are skipped, so callers
        never receive ``None`` or an empty string as a URL.
    """
    for element in root:
        for children in element:
            for inner_children in children:
                if not inner_children.tag.endswith('loc'):
                    continue
                # `.text` is None for an empty element; pretty-printed XML
                # may also wrap the URL in newlines/indentation.
                text = (inner_children.text or '').strip()
                if text:
                    yield text
async def main(urls, concurrency=20):
    """Request every URL and print the ones that answer with HTTP 403.

    Matching URLs are printed to stdout, one per line, in the order their
    responses complete. A request that fails before a status is received
    (``aiohttp.ClientError`` or a timeout) is reported on stderr and does
    not stop the remaining checks.

    Args:
        urls: Iterable of URL strings to check.
        concurrency: Maximum number of requests in flight at the same time.

    Raises:
        ValueError: If *concurrency* is smaller than 1.
    """
    if concurrency < 1:
        raise ValueError('concurrency must be at least 1')

    urls = list(urls)
    if not urls:
        # Nothing to check: do not open a client session at all.
        return

    # Bounds the number of simultaneous requests so a large sitemap does not
    # open one connection per URL at once.
    limiter = asyncio.Semaphore(concurrency)

    async def check(session, url):
        """Fetch one URL and print it when the response status is 403."""
        async with limiter:
            try:
                async with session.get(url) as response:
                    if response.status == 403:
                        print(url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                # Keep stdout reserved for the 403 list; failures go to stderr.
                print('{}: {!r}'.format(url, error), file=sys.stderr)

    async with aiohttp.ClientSession() as session:
        # Schedule all checks together so the requests overlap instead of
        # being awaited one after another.
        await asyncio.gather(*(check(session, url) for url in urls))
if __name__ == '__main__':
    # Command-line entry point: collect URLs from every .xml file in --dir
    # and report the ones that respond with HTTP 403.
    parser = argparse.ArgumentParser(description='Find all urls in image sitemaps that returns 403.')
    parser.add_argument('--dir', default='.',
                        help='directory containing the sitemap .xml files')
    args = parser.parse_args()
    dir_ = args.dir
    # find all files in `dir_` that end with .xml
    # os.listdir() returns bare entry names, so each one is joined back onto
    # `dir_`; otherwise parsing would look in the current working directory.
    files = sorted(
        os.path.join(dir_, name)
        for name in os.listdir(dir_)
        if name.endswith('.xml')
    )
    urls = []
    for file in files:
        tree = ElementTree.parse(file)
        root = tree.getroot()
        urls.extend(xml_image_iter(root))
    if hasattr(asyncio, 'run'):
        # Python 3.7+: asyncio.run() creates and closes the event loop itself.
        asyncio.run(main(urls))
    else:
        # Python 3.5/3.6 fallback.
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(urls))
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.