@dataplayer12
Last active April 10, 2023 13:12
Download the SA-1B dataset from the Segment Anything paper by FAIR
import wget
import argparse
import os
import shutil


def parse_arguments():
    p = argparse.ArgumentParser()
    p.add_argument('--links-file', type=str, required=True,
                   help='Path of text file downloaded from https://ai.facebook.com/datasets/segment-anything-downloads/')
    p.add_argument('--start-chunk', type=int, required=False, default=0,
                   help='Index of the starting chunk to download. Note the dataset is divided into ~1000 chunks of tar files')
    p.add_argument('--end-chunk', type=int, required=False, default=-1,
                   help='Index of the end chunk. The default of -1 downloads through the last chunk')
    p.add_argument('--output-dir', type=str, required=False, default='./',
                   help='Output directory to store data in')
    args = p.parse_args()
    return args


def download_chunk(name, link, chunk_num, n_chunks, outdir):
    print(f'Downloading chunk number {chunk_num} of {n_chunks} with name {name}')
    cwd = os.getcwd()
    os.chdir(outdir)
    wget.download(link)
    # wget saves the file under the long name embedded in the URL
    # (everything between the last '/' and '.tar', dropping the query
    # string); rename it to the short name listed in the links file
    long_name = link[link.rfind('/') + 1:link.rfind('.tar')] + '.tar'
    assert long_name in os.listdir(), f'File {long_name} not found'
    shutil.move(long_name, name)
    os.chdir(cwd)
    print(f'Finished downloading {name}')


def main():
    args = parse_arguments()
    os.makedirs(args.output_dir, exist_ok=True)
    with open(args.links_file, 'r') as f:
        # drop the header row and the trailing empty line
        content = f.read().split('\n')[1:-1]
    names_links = [line.split('\t') for line in content]  # each row is '<name>\t<link>'
    # with the default end chunk of -1, download everything from start-chunk
    # onwards, so the 'of {n_chunks}' count in the progress message stays correct
    if args.end_chunk == -1:
        n_chunks = len(names_links) - args.start_chunk
    else:
        n_chunks = args.end_chunk - args.start_chunk
    for idx, (name, link) in enumerate(names_links[args.start_chunk: args.start_chunk + n_chunks]):
        chunk_index = idx + 1
        download_chunk(name, link, chunk_index, n_chunks, args.output_dir)


if __name__ == '__main__':
    main()
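
For reference, the parsing above (skip the header row, then split each line on a tab into a name and a link) assumes the links file is a tab-separated table along these lines. The column names, file names, and URLs below are made-up placeholders; note that each link must contain '.tar' for the rename logic in download_chunk to work:

    file_name	cdn_link
    sa_000000.tar	https://example-cdn.net/path/sa_000000.tar?query=...
    sa_000001.tar	https://example-cdn.net/path/sa_000001.tar?query=...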
How to use

  • Read the dataset's terms of use and download the links text file from https://ai.facebook.com/datasets/segment-anything-downloads/
  • The dataset is divided into ~1000 tarballs and is roughly 12 TB in size(!)
  • Install the Python wget utility with pip install wget
  • Pass the path of the downloaded text file, and optionally the starting and ending chunk indices, to the Python script above; omit the chunk arguments to download the full dataset (see the example invocation after this list)
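
For example, a run that fetches only the first ten chunks might look like this (download_sa1b.py and links.txt are placeholders for whatever names you saved the gist and the links file under):

    pip install wget
    python download_sa1b.py --links-file links.txt --start-chunk 0 --end-chunk 10 --output-dir ./sa1b

Since the script computes n_chunks as end_chunk minus start_chunk, the end index is exclusive: the command above downloads chunks 0 through 9.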
