Skip to content

Instantly share code, notes, and snippets.

@moskomule
Last active March 25, 2019 06:11
Show Gist options
  • Save moskomule/46c926f8474ef5d0fcd2c7a188c4078f to your computer and use it in GitHub Desktop.
Save moskomule/46c926f8474ef5d0fcd2c7a188c4078f to your computer and use it in GitHub Desktop.
webvision

Utility for downloading the WebVision 2018 dataset.

Requires GNU Parallel and aria2c (the download script also calls wget). Both can be installed via Homebrew.

#!/usr/bin/env bash
# Download every WebVision 2018 archive and file list into the current
# directory. Needs GNU parallel, aria2c and wget on PATH.
function download() {
    # Local so the URL does not leak into the rest of the script's environment.
    local BASE_URL="https://data.vision.ee.ethz.ch/aeirikur/webvision2018"

    # `seq -w 0 32` yields the zero-padded shard ids 00..32; parallel runs up
    # to four aria2c jobs at once, substituting each id for {}.
    seq -w 0 32 | parallel -j 4 aria2c -x5 "${BASE_URL}/webvision_train_{}.tar"

    # Validation split: image archive plus its file list.
    aria2c -x5 "${BASE_URL}/val_images_resized.tar"
    wget "${BASE_URL}/val_filelist.txt"

    # Metadata archive.
    wget "${BASE_URL}/info.tar"

    # Test split: image archive plus its file list.
    aria2c -x5 "${BASE_URL}/test_images_resized.tar"
    wget "${BASE_URL}/test_filelist.txt"
}
# Unpack every downloaded archive into the current directory.
function extract() {
    local archive

    # Training shards 00..32, up to four tar processes at a time.
    seq -w 0 32 | parallel -j 4 tar -xvf webvision_train_{}.tar

    # Remaining archives, unpacked one after another in this order.
    for archive in val_images_resized info test_images_resized; do
        tar -xvf "${archive}.tar"
    done
}
# Create the output layout, fetch and unpack everything, then rename the
# validation and test splits into place.
# -p keeps a re-run from failing on directories that already exist.
mkdir -p train meta test test/meta
download
extract

# Was `nv`, which is not a standard command; `mv` matches the renames below
# and is what produces the val/ directory.
mv val_images_resized val
mv val_filelist.txt meta/val.txt
mv test_images_resized test/val

# Append " 0" to every line of the test file list (presumably a placeholder
# label so the test list has the same "<path> <label>" shape as the others).
awk '$0=$0" 0"' test_filelist.txt > test/meta/val.txt

# NOTE(review): nothing above creates meta/synsets.txt; it appears to be
# written by the companion Python step, so this copy likely needs to run after
# it -- confirm the intended order.
cp meta/synsets.txt test/meta
from pathlib import Path
import shutil
from tqdm import tqdm
# Working directories, all relative to the current working directory.
train, val, info, meta = (
    Path(name) for name in ("train", "val", "info", "meta")
)

# synsets: class identifiers, indexed by label; filled from info/synsets.txt.
# filelist: "<path> <label>\n" lines collected by move() for meta/train.txt.
synsets, filelist = [], []
def move(rows: list,
         base: Path,
         append: bool = True):
    """Move each listed file into ``base / <synset> / <file name>``.

    Every non-blank row must consist of two whitespace-separated fields:
    a source path and an integer label. The label indexes the module-level
    ``synsets`` list to pick the destination sub-directory; the file keeps
    the last ``/``-separated component of its source path as its name.

    Args:
        rows: Lines of a file list.
        base: Destination root directory (converted with ``Path``).
        append: If true, the source path is used as given and, after a
            successful move, ``"<target> <label>"`` plus a newline is
            appended to the module-level ``filelist``. If false, the source
            path is resolved under the module-level ``val`` directory and
            nothing is recorded.

    A source (or destination directory) that does not exist is reported on
    stdout and the row is skipped.
    """
    base = Path(base)
    for row in tqdm(rows):
        fields = row.split()
        if not fields:
            # Blank line: nothing to move (previously crashed the unpacking).
            continue
        source, label = fields
        name = source.split("/")[-1]
        target = base / synsets[int(label)] / name
        if not append:
            source = val / source
        # Only the move itself is guarded, so unrelated errors are not masked.
        try:
            shutil.move(source, target)
        except FileNotFoundError:
            print(f"No file {source}")
        else:
            if append:
                filelist.append(str(target) + " " + label + "\n")
# Read the class identifiers (first token of every line of info/synsets.txt),
# mirror them one per line into meta/synsets.txt, and keep them in `synsets`
# so that a label index maps to its identifier.
with (info / "synsets.txt").open() as f1, \
        (meta / "synsets.txt").open('w') as f2:
    for row in f1:
        k = row.split()[0]
        f2.write(k + "\n")
        synsets.append(k)

# One destination directory per class and split. `parents=True` is required
# for the training splits because nothing else creates train/google or
# train/flickr. `val` is expected to exist already, so it is left strict and
# fails loudly if the validation images are missing.
for s in synsets:
    (train / "google" / s).mkdir(parents=True, exist_ok=True)
    (train / "flickr" / s).mkdir(parents=True, exist_ok=True)
    (val / s).mkdir(exist_ok=True)

with (info / "train_filelist_google.txt").open() as fg, \
        (info / "train_filelist_flickr.txt").open() as ff:
    rows_google = fg.readlines()
    rows_flickr = ff.readlines()

# Training images: moved under train/<source>/<synset>/ and recorded in
# `filelist` by move().
move(rows_google, train / "google")
move(rows_flickr, train / "flickr")

# Validation images: regrouped under val/<synset>/ without being recorded.
with (meta / "val.txt").open() as f:
    rows = f.readlines()
move(rows, val, False)

# Entries already end with a newline, so they are written verbatim.
with (meta / "train.txt").open('w') as f:
    f.writelines(filelist)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment