Skip to content

Instantly share code, notes, and snippets.

@davidbrochart
Last active April 25, 2019 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidbrochart/9a6f7d36dfdb81c7bdc8d2214919efe6 to your computer and use it in GitHub Desktop.
Save davidbrochart/9a6f7d36dfdb81c7bdc8d2214919efe6 to your computer and use it in GitHub Desktop.
import gcsfs
import pandas as pd
# Download TRMM 3B42RT files batch by batch, write/append them to a local Zarr
# store, copy that store to its destination (Google Cloud Storage, or a local
# directory standing in for it), and finally replace the destination's `time`
# coordinate with one covering the whole uploaded date range.
#
# NOTE(review): this script uses names that are not defined in the visible
# part of the file (datetime, timedelta, xr, np, subprocess, shutil and the
# helpers download_files, create_dataset, create_zarr, empty_zarr,
# append_zarr, get_encoding) -- presumably provided earlier; confirm.

# --- Configuration ----------------------------------------------------------
dt0 = dt = datetime(2000, 3, 1, 12)  # from this date (included)
dt1 = datetime(2000, 3, 11, 12)  # to that date (excluded)
# Example date range for a follow-up run with resume = True:
# dt0 = dt = datetime(2000, 3, 11, 12)  # from this date (included)
# dt1 = datetime(2000, 3, 21, 12)  # to that date (excluded)
resume = False  # if True, resume a previous upload
                # and dt0 and dt1 must be later than the previous date range
fake_gcs = True  # if True, won't upload to Google Cloud Storage
                 # but fake it in local trmm_bucket directory

# The loop below advances `dt` by 3 hours per downloaded file; the same step
# is used to build the final time coordinate. A timedelta is used instead of
# the '3H' string alias because the uppercase 'H' alias is deprecated in
# recent pandas releases, while a timedelta is accepted by all versions.
time_step = timedelta(hours=3)

# --- Locate the destination store --------------------------------------------
if fake_gcs:
    store = 'trmm_bucket'
else:
    store = gcsfs.GCSMap('pangeo-data/trmm_3b42rt')

# When resuming, remember the time values already present in the store so
# that they can be prepended to the new ones at the end.
if resume:
    time_prev = xr.open_zarr(store).time.values

# --- Download, convert and upload, one batch at a time -----------------------
time_nb = 40  # number of files handled per batch
while dt < dt1:
    print(f'Downloading {time_nb} files from {dt}...')
    filenames, datetimes = download_files(dt, time_nb)
    ds = create_dataset(filenames, datetimes)
    if not resume and dt == dt0:
        # Very first batch of a fresh upload: create the store and keep the
        # encoding it returns for the following batches.
        encoding = create_zarr(ds, 'trmm_3b42rt')
        empty_zarr('trmm_3b42rt', 'time')
    else:
        if resume:
            # No encoding was produced by this run; fetch it via the helper.
            encoding = get_encoding('trmm_3b42rt_new')
        create_zarr(ds, 'trmm_3b42rt_new', encoding)
        empty_zarr('trmm_3b42rt')
        append_zarr('trmm_3b42rt_new', 'trmm_3b42rt')
    print('Uploading...')
    if fake_gcs:
        # shell=True is needed for the globs; the second `cp` picks up the
        # dot-files that `*` does not match.
        subprocess.check_call('mkdir -p trmm_bucket; cp -r trmm_3b42rt/* '
                              'trmm_bucket/; cp -r trmm_3b42rt/.[^.]* '
                              'trmm_bucket/', shell=True)
    else:
        subprocess.check_call(
            'gsutil -m cp -r trmm_3b42rt/ gs://pangeo-data/'.split())
    dt += time_nb * time_step

# --- Rewrite the `time` coordinate for the whole date range ------------------
# dt1 is excluded, so the last time stamp is one step before it.
time_new = pd.date_range(dt0, dt1 - time_step, freq=time_step)
if resume:
    time_var = np.hstack((time_prev, time_new))
else:
    time_var = time_new
# A dummy variable is only there to carry the `time` coordinate into a Zarr
# store, from which the `time` directory is then copied to the destination.
time_ds = xr.DataArray(
    np.zeros(len(time_var)),
    coords=[time_var],
    dims=['time'],
).to_dataset(name='trmm_time')
shutil.rmtree('trmm_time', ignore_errors=True)
time_ds.to_zarr('trmm_time')
if fake_gcs:
    subprocess.check_call('rm -rf trmm_bucket/time'.split())
    subprocess.check_call('cp -r trmm_time/time trmm_bucket/'.split())
else:
    subprocess.check_call('gsutil -m rm -rf '
                          'gs://pangeo-data/trmm_3b42rt/time'.split())
    subprocess.check_call('gsutil -m cp -r trmm_time/time/ '
                          'gs://pangeo-data/trmm_3b42rt/'.split())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment