Skip to content

Instantly share code, notes, and snippets.

@grisaitis
Created September 23, 2022 01:00
Show Gist options
  • Save grisaitis/58caf7ab7df3f3a1ce74aeebe96ac070 to your computer and use it in GitHub Desktop.
Save grisaitis/58caf7ab7df3f3a1ce74aeebe96ac070 to your computer and use it in GitHub Desktop.
'''
Fetch the GCS URIs of all diagnostic whole-slide images in TCGA-SKCM and write
them to a text file, so the slides can be copied into your own Google Cloud
Storage bucket.
'''
from io import StringIO
import json
import pandas as pd
import requests
# GDC "files" endpoint that the query below is POSTed to.
files_endpt = "https://api.gdc.cancer.gov/files"

# Columns requested in the response, sent as one comma-separated string.
fields = ",".join(
    (
        "file_name",
        "cases.samples.sample_type",
        "cases.disease_type",
        "cases.project.project_id",
    )
)

# Both "in" clauses must match (combined with "and"): files from the
# TCGA-SKCM project whose experimental strategy is "Diagnostic Slide".
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": [wanted]}}
        for field_name, wanted in (
            ("cases.project.project_id", "TCGA-SKCM"),
            ("files.experimental_strategy", "Diagnostic Slide"),
        )
    ],
}

# Request body: results come back as CSV, capped at 500 rows.
params = dict(
    filters=filters,
    fields=fields,
    format="CSV",
    size="500",
)
# The query is sent as a JSON body (the `json` argument) rather than as URL
# `params`, because the filters are a nested structure.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=60,  # do not hang forever if the API is unreachable
)
# Fail loudly on an HTTP error instead of parsing an error payload as CSV.
response.raise_for_status()
df = pd.read_csv(StringIO(response.content.decode("utf-8")))

# NOTE: the request caps the result at params["size"] rows; if the frame has
# exactly that many rows the listing may be truncated -- raise the size and
# re-run in that case.

# Write one GCS URI per line, built from each row's "id" and "file_name"
# columns. Iterating the two columns directly (rather than
# df.apply(..., axis=1)) also behaves correctly for a zero-row result, where
# apply would hand back the frame itself and the loop would emit column names.
with open("./gcs_uris.txt", "w", encoding="utf-8") as f:
    for file_id, file_name in zip(df["id"], df["file_name"]):
        gcs_uri = f"gs://gdc-tcga-phs000178-open/{file_id}/{file_name}"
        print(gcs_uri)
        f.write(gcs_uri)
        f.write("\n")
# Alternative, one file at a time: gsutil cp <gcs_uri> gs://<your_bucket>/<path>/
# in a terminal, do:
# cat gcs_uris.txt | gsutil -m cp -I gs://your_bucket_name/some/path/
# result looks like:
# $ gsutil ls gs://your_bucket_name/some/path/
# gs://your_bucket_name/some/path/TCGA-3N-A9WB-01Z-00-DX1.A9950ED4-9480-455C-AE0D-8E076D4DA432.svs
# gs://your_bucket_name/some/path/TCGA-3N-A9WC-01Z-00-DX1.C833FCAB-6329-4F90-88E5-CFDA0948047B.svs
# ...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment