Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env python
# coding: utf-8
### REQUIREMENTS ###
# * python 3.5+
# * aiohttp (pip install aiohttp)
import argparse
import os
import asyncio
from xml.etree import ElementTree
import aiohttp
def xml_image_iter(root):
    """Yield the URL text of every ``loc`` element exactly three levels below *root*.

    Only great-grandchildren of *root* are inspected.  For an image sitemap
    this presumably corresponds to ``urlset -> url -> image:image ->
    image:loc`` (verify against the sitemaps being fed in); ``loc`` elements
    sitting directly under a child of *root*'s children are not visited.

    Args:
        root: An ``xml.etree.ElementTree.Element`` (typically the document root).

    Yields:
        The stripped, non-empty text of each matching ``loc`` element, in
        document order.
    """
    for element in root:
        for children in element:
            for inner_children in children:
                tag = inner_children.tag
                # Comment / processing-instruction nodes carry a non-string tag.
                if not isinstance(tag, str):
                    continue
                # ElementTree spells namespaced tags as "{uri}local"; compare the
                # local name exactly so that tags merely ending in "loc" are
                # not mistaken for URLs.
                if tag.rpartition('}')[2] != 'loc':
                    continue
                # Pretty-printed sitemaps often wrap the URL in whitespace, and an
                # empty <loc/> has text None; neither is a usable URL as-is.
                text = (inner_children.text or '').strip()
                if text:
                    yield text
async def main(urls):
    """Request each URL in turn and print the ones answered with HTTP 403.

    All requests share one ``aiohttp.ClientSession`` and are issued
    sequentially with GET.  A URL that cannot be fetched (connection error,
    invalid URL, timeout) is reported on stderr and skipped, so one bad entry
    does not abort the rest of the scan.

    Args:
        urls: Iterable of URL strings to check.
    """
    import sys  # local import: only needed for the error report below

    async with aiohttp.ClientSession() as session:
        for url in urls:
            try:
                # The response body is never read; leaving the context
                # manager releases the connection.
                async with session.get(url) as response:
                    if response.status == 403:
                        print(url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # Keep failures off stdout so the list of 403 URLs stays clean.
                print('request failed: {}: {!r}'.format(url, exc), file=sys.stderr)
if __name__ == '__main__':
    # Script entry point: collect image URLs from every sitemap in --dir and
    # print those that answer with HTTP 403.
    parser = argparse.ArgumentParser(description='Find all urls in image sitemaps that returns 403.')
    parser.add_argument('--dir', default='.',
                        help='directory containing the sitemap .xml files (default: current directory)')
    args = parser.parse_args()
    dir_ = args.dir

    # Find all entries in `dir_` whose names end with .xml.
    files = [name for name in os.listdir(dir_) if name.endswith('.xml')]

    urls = []
    for file in files:
        # os.listdir() returns bare names; join with `dir_` so that a --dir
        # other than the current directory resolves to the right file.
        tree = ElementTree.parse(os.path.join(dir_, file))
        urls.extend(xml_image_iter(tree.getroot()))

    # Create and install an explicit loop rather than relying on
    # asyncio.get_event_loop() to create one implicitly (deprecated outside a
    # running loop), and close it when done.  Works from Python 3.5 onwards.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(urls))
    finally:
        loop.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment