Skip to content

Instantly share code, notes, and snippets.

@mymindwentblvnk
Last active May 6, 2023 08:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mymindwentblvnk/8d82f2663342d08f473a205fe8b3b8bf to your computer and use it in GitHub Desktop.
Save mymindwentblvnk/8d82f2663342d08f473a205fe8b3b8bf to your computer and use it in GitHub Desktop.
Call this script with `parallel -a dates.txt -j 8 python rename_gcs_objects.py -date` to easily parallelize it.
import pickle
import sys
from google.cloud import storage
from google.cloud.storage import Client
# Module-level GCS client authenticated from a local service-account key file.
# NOTE(review): constructing the client at import time means importing this
# module requires the key file to be present — confirm that is intended.
CLIENT: Client = storage.Client.from_service_account_json("me-playground-gcs-admin-91c328fca8f6.json")
# Bucket holding both the legacy `YYYY/MM/DD/...` objects and the new
# `year=/month=/day=` layout.
BUCKET_NAME = 'vgn-departures-archive'
BUCKET = CLIENT.get_bucket(BUCKET_NAME)
def get_storage_objects(prefix):
    """Build (source, target) rename pairs for every blob under *prefix*.

    Source blobs are named ``YYYY/MM/DD/<file>``; the target name uses the
    Hive-style partition layout ``year=YYYY/month=M/day=D/<file>``.  The
    result is cached in a local pickle file (one per prefix) so that
    re-running the script for the same date skips the expensive listing.

    Returns a list of ``(old_name, new_name)`` tuples.
    """
    pickle_file_name = f"{prefix.replace('/', '-')}.p"
    try:
        # Fast path: reuse the mapping cached by a previous run.
        # `with` guarantees the file handle is closed (the original
        # `pickle.load(open(...))` leaked it).
        with open(pickle_file_name, "rb") as fh:
            target_filenames = pickle.load(fh)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # No usable cache: list the bucket and build the mapping.  Catching
        # specific exceptions (instead of a bare `except:`) avoids hiding
        # unrelated bugs such as a KeyboardInterrupt or NameError.
        target_filenames = []
        for blob in CLIENT.list_blobs(BUCKET_NAME, prefix=prefix):
            year, month, day, file_name = blob.name.split('/')
            # Sanity-check the legacy layout before renaming anything.
            assert int(year) in (2022, 2023)
            assert int(month) in range(1, 12 + 1)
            assert int(day) in range(1, 31 + 1)
            target_filename = f"year={int(year)}/month={int(month)}/day={int(day)}/{file_name}"
            target_filenames.append((blob.name, target_filename))
        # Persist only freshly built results; rebuilding on a cache hit
        # would append duplicate pairs to the cached list.
        with open(pickle_file_name, "wb") as fh:
            pickle.dump(target_filenames, fh)
    return target_filenames
def blob_exists(filename):
    """Return True if *filename* exists in the archive bucket."""
    # Reuse the module-level BUCKET handle; calling get_bucket() here issued
    # a redundant metadata API request on every invocation.
    return BUCKET.blob(filename).exists()
def list_blobs(prefix):
    """Return the names of all blobs in the bucket starting with *prefix*."""
    blobs = CLIENT.list_blobs(BUCKET_NAME, prefix=prefix)
    return [blob.name for blob in blobs]
def count_blobs(prefix=''):
    """Count the blobs under *prefix* (the whole bucket by default)."""
    names = list_blobs(prefix)
    return len(names)
if __name__ == '__main__':
    # Progress overview: blobs still in the legacy `20xx/MM/DD` layout vs.
    # the new Hive-style `year=/month=/day=` layout.
    print(f"old: {count_blobs('20')}")
    print(f"new: {count_blobs('year=')}")

    # Optional `-date YYYY-MM-DD` argument restricts the run to one day so
    # the script can be parallelized (see gist description for the GNU
    # parallel invocation).  `YYYY-MM-DD` maps to the `YYYY/MM/DD` prefix.
    args = sys.argv[1:]
    if args:
        # Validate explicitly rather than with `assert`, which is stripped
        # when Python runs with -O.
        if len(args) != 2 or args[0] != '-date':
            raise SystemExit("usage: rename_gcs_objects.py [-date YYYY-MM-DD]")
        prefix = args[1].replace('-', '/')
    else:
        prefix = '20'  # matches every legacy 20xx/... object

    print(f"Processing {prefix}")
    file_names = get_storage_objects(prefix)
    if file_names:
        # A set gives O(1) membership tests; filtering against the original
        # list was O(len(file_names) * len(existing_blobs)).
        existing_blobs = set(list_blobs(prefix='year='))
        filtered_file_names = [f for f in file_names if f[1] not in existing_blobs]
        print(f"Copying {len(filtered_file_names)} to destination bucket.")
        for old_file_name, new_file_name in filtered_file_names:
            # In-place rename: copy within the same bucket under the new name.
            source_blob = BUCKET.blob(old_file_name)
            print(f"Copying {BUCKET.name}/{source_blob.name} to {BUCKET.name}/{new_file_name}.")
            BUCKET.copy_blob(source_blob, BUCKET, new_file_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment