@n0samu
Created July 9, 2022 18:33
#!/usr/bin/env python3
from urllib.request import urlopen
from urllib.parse import quote
import xml.etree.ElementTree as ET
import sys

def read_xml(f):
    it = ET.iterparse(f)
    for _, el in it:
        el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
    root = it.root
    return root

key = None
page = 0
num_files = 0
if len(sys.argv) == 3:
    key = sys.argv[2]
if len(sys.argv) in (2, 3):
    base = sys.argv[1]
else:
    print('Usage: python scrapeS3.py BaseURL [StartKey]')
    sys.exit()

out_file = 'urls.txt'
with open(out_file, 'a', encoding='utf-8') as f:
    try:
        while True:
            files = list()
            page += 1
            # Request the next page of the listing, resuming from the last key seen
            url = f"{base}?marker={quote(key, safe='')}" if key else base
            sys.stdout.write(f'Enumerating page {page}: {url}\n')
            sys.stdout.flush()
            tree = read_xml(urlopen(url))
            key = None
            for entry in tree.findall('Contents'):
                key = entry.find('Key').text
                files.append(base + quote(key))
            num_files += len(files)
            if files:
                f.write('\n'.join(files) + '\n')
            if key is None:  # no Contents entries on this page: listing is exhausted
                break
    except KeyboardInterrupt:
        pass
    finally:
        sys.stdout.write('\n')
        sys.stdout.flush()
        print(f'Wrote {num_files} URLs to {out_file}!')
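
The script assumes the target bucket permits anonymous listing; it pages through the ListObjects XML by passing the last Key it saw as the marker query parameter. Below is a minimal sketch (not part of the original gist) that checks this assumption up front by testing whether the bucket root returns a ListBucketResult document; the is_listable helper and the example bucket URL are hypothetical.

#!/usr/bin/env python3
# Minimal sketch: probe whether a bucket allows anonymous listing
# before running the full scrape. Not part of the original gist.
from urllib.request import urlopen
from urllib.error import HTTPError
import xml.etree.ElementTree as ET

def is_listable(base_url):
    """Return True if the bucket root serves an anonymous ListBucketResult."""
    try:
        with urlopen(base_url) as resp:
            root = ET.parse(resp).getroot()
    except (HTTPError, ET.ParseError):
        return False  # e.g. 403 AccessDenied: anonymous listing is disabled
    return root.tag.endswith('ListBucketResult')

if __name__ == '__main__':
    # Hypothetical bucket URL, for illustration only
    print(is_listable('https://example-bucket.s3.amazonaws.com/'))

If the bucket root instead returns 403 AccessDenied, anonymous listing is disabled server-side and there is nothing for the scraper to enumerate, which is what the comment below asks about.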
xhendev commented Jul 31, 2023

Is there a way to scrape buckets that show "AccessDenied"?

n0samu commented Aug 1, 2023

No.
