Skip to content

Instantly share code, notes, and snippets.

@nickrsan
Created September 2, 2022 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nickrsan/12a022b7793b08f8871f8669e4c38ed6 to your computer and use it in GitHub Desktop.
Save nickrsan/12a022b7793b08f8871f8669e4c38ed6 to your computer and use it in GitHub Desktop.
Download Items from Public Google Cloud Storage Bucket
import requests
import re
from pathlib import Path
def get_public_export_urls(bucket_name, prefix=""):
    """
    Lists the URLs of items in a *public* Google Storage bucket without using a GCloud login. Filters only to
    files with the specified prefix.

    The bucket's XML listing is fetched page by page, so buckets holding more items than fit in a single
    listing response are returned in full.

    :param bucket_name: name of the public bucket to list
    :param prefix: A prefix to use to filter items in the bucket - only URLs where the path matches this prefix will be returned - defaults to all files
    :return: list of urls
    :raises requests.HTTPError: if the listing request fails (e.g. the bucket does not exist or is not public)
    """
    import html  # local import - only needed here, to decode XML entities in object names

    base_url = "https://storage.googleapis.com/"
    request_url = f"{base_url}{bucket_name}/"

    # comes back as an XML listing - don't need to parse the XML, just need the values of a few elements
    key_pattern = re.compile(r"<Key>(.*?)</Key>")
    truncated_pattern = re.compile(r"<IsTruncated>\s*true\s*</IsTruncated>", re.IGNORECASE)
    next_marker_pattern = re.compile(r"<NextMarker>(.*?)</NextMarker>")

    # ask the server to filter by prefix too, so pages aren't filled with items we'd discard anyway
    params = {"prefix": prefix} if prefix else {}

    keys = []
    while True:
        # get the content of the bucket (it needs to be public)
        response = requests.get(request_url, params=params, timeout=60)
        # without this, an error document (no Key elements) would silently look like an empty bucket
        response.raise_for_status()
        listing = response.text

        # object names are XML-escaped in the listing (e.g. "&amp;"), so decode them back
        page_keys = [html.unescape(key) for key in key_pattern.findall(listing)]
        keys.extend(page_keys)

        if not truncated_pattern.search(listing):
            break  # that was the last page

        # continue after the server-provided marker, falling back to the last key we saw on this page
        marker_match = next_marker_pattern.search(listing)
        if marker_match:
            marker = html.unescape(marker_match.group(1))
        elif page_keys:
            marker = page_keys[-1]
        else:
            break
        if marker == params.get("marker"):
            break  # the marker did not advance - stop rather than loop forever
        params["marker"] = marker

    # make them into full URLs with the bucket URL at the front and check that the files have the prefix specified
    return [f"{request_url}{key}" for key in keys if key.startswith(prefix)]
def download_public_export(bucket_name, output_folder, prefix=""):
    """
    Downloads items from a *public* Google Storage bucket into a local folder without using a GCloud login.

    Each item is saved under the last path segment of its URL, so items in different "subfolders" of the
    bucket that share a file name will overwrite one another in the output folder.

    :param bucket_name: name of the public bucket to download from
    :param output_folder: folder to write the downloaded files into - created if it doesn't exist
    :param prefix: only items whose path in the bucket starts with this prefix are downloaded - defaults to all files
    :return: None
    :raises requests.HTTPError: if any individual download request fails
    """
    # get the urls of items in the bucket with the specified prefix
    urls = get_public_export_urls(bucket_name, prefix)

    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    for url in urls:
        filename = url.split("/")[-1]  # get the filename
        if not filename:
            # the item's path ends in "/" (a folder placeholder) - there is no file name to write to
            continue
        output_path = output_dir / filename  # construct the output path

        # stream the body to disk in chunks so items larger than available RAM can still be downloaded
        with requests.get(url, stream=True, timeout=60) as response:
            # don't save an error page as if it were the requested data
            response.raise_for_status()
            with output_path.open("wb") as output_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    output_file.write(chunk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment