@dandelin, forked from csarron/convert_pyarrow.py (created April 6, 2022)
# pip install pyarrow fire tqdm
"""
crawl images:
pip install img2dataset==1.11.0

img2dataset --url_list cc3m.tsv \
    --output_folder cc3m-img --input_format "tsv" \
    --url_col "url" --caption_col "caption" \
    --output_format files --resize_mode=no \
    --processes_count 10 --thread_count 64 --number_sample_per_shard 2000 \
    --enable_wandb True --save_metadata False

img2dataset --url_list cc3m-val.tsv --input_format "tsv" \
    --url_col "url" --caption_col "caption" \
    --output_format files --output_folder cc3m-val-img \
    --image_size 512 --resize_only_if_bigger yes --resize_mode center_crop \
    --processes_count `nproc` --thread_count 16 --number_sample_per_shard 1000 \
    --enable_wandb True --wandb_project img2dataset --retries 5
# pick one of the dataset configs below, then run the img2dataset command that follows

dataset=redcaps    # redcaps and cc12m share the same settings
dataset=cc12m
shard=4000
processes=32
threads=64

dataset=cc3m
shard=2000
processes=16
threads=64

dataset=sbu1m
shard=2000
processes=16
threads=32

dataset=wit-v1-en-filtered
shard=1000
processes=8
threads=8

dataset=wit-v1-en-noisy
shard=1000
processes=8
threads=8
# keep both processes and threads at 8 for the wit datasets to avoid Wikipedia's rate limiting
img2dataset --url_list ${dataset}.tsv.gz \
    --output_folder ${dataset}-img --input_format "tsv.gz" \
    --resize_only_if_bigger True --image_size 512 \
    --url_col "url" --caption_col "caption" \
    --output_format files --resize_mode=keep_ratio \
    --processes_count ${processes} --thread_count ${threads} --number_sample_per_shard ${shard} \
    --enable_wandb True --save_metadata False --timeout 30
preprocess wit-v1:
for i in `seq 1 9`; do
    echo $i
    gzip -dc wit_v1.train.all-0000$i-of-00010.tsv.gz | awk '{ if ($1 == "en") { print } }' > wit-v1-train-en-$i.tsv
done
usage:
python convert_pyarrow.py --dataset_dir cc3m-img
python convert_pyarrow.py --dataset_dir wit-v1-en-filtered-img
python convert_pyarrow.py --dataset_dir cc12m-img
python convert_pyarrow.py --dataset_dir sbu1m
"""
import fire
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa


def iter_file(path):
    for folder in path.glob('*'):
        for file in folder.glob("*.jpg"):
            yield file


def read_file(file, mod='rb'):
    with open(file, mod) as f:
        d = f.read()
    return d


def batch_gen(ds_path, batch_size=4):
    ids, texts, imgs = [], [], []
    ds_name = ds_path.stem
    for img_file in iter_file(ds_path):
        img_data = read_file(img_file)
        pair_name = img_file.stem
        pair_id = '-'.join([ds_name, pair_name])
        txt_file = img_file.with_suffix('.txt')
        if not txt_file.exists():
            continue
        text = read_file(txt_file, 'r')
        ids.append(pair_id)
        texts.append(text)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process(dataset_dir, save_file=None, batch_size=1000):
    dataset_path = Path(dataset_dir)
    if save_file is None:
        save_path = dataset_path.with_suffix('.arrow')
    else:
        save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(dataset_path, batch_size))
    schema = pa.schema([('id', pa.string()), ('text', pa.string()), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.string())
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
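# The converter above expects img2dataset's "files" output layout, i.e. shard
# subfolders such as cc3m-img/00000/000000123.jpg with a matching .txt caption
# next to each image. A small sanity-check sketch; the function name and the
# cc3m-img default path are examples, not part of the original script:
from pathlib import Path

def count_pairs(dataset_dir="cc3m-img"):
    root = Path(dataset_dir)
    n_imgs = n_pairs = 0
    for img in root.glob("*/*.jpg"):
        n_imgs += 1
        if img.with_suffix(".txt").exists():
            n_pairs += 1
    print(f"{n_pairs}/{n_imgs} images have a caption file")
    return n_pairs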
import json
import fire
import csv
import gzip
from tqdm.auto import tqdm
from pathlib import Path


def process(redcaps_dir, out_file):
    redcaps_path = Path(redcaps_dir)
    num_files = 0
    num_pairs = 0
    with open(out_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        for anno_file in tqdm(redcaps_path.glob('*.json')):
            d = json.load(open(anno_file))
            num_files += 1
            for item in tqdm(d['annotations'], leave=False):
                url = item['url']
                caption = item['caption']
                num_pairs += 1
                tsv_writer.writerow([caption, url])
    print(f'processed {num_files} files, {num_pairs} pairs in total')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
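# For reference, each RedCaps annotation file read above is assumed to look
# roughly like this (only the fields the scripts actually use are shown; the
# values are hypothetical):
example_redcaps_annotations = {
    "annotations": [
        {
            "image_id": "abc123",
            "subreddit": "itookapicture",  # used later to locate images/<subreddit>/<image_id>.jpg
            "url": "https://i.redd.it/abc123.jpg",
            "caption": "a caption written by the reddit poster",
        },
    ]
}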
# parse wit en
# python parse.py -d wit-v1-train-en.tsv.gz -o wit-v1-en-noisy.tsv
import argparse
import csv
import gzip
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data_file", type=str, required=True)
    parser.add_argument("-o", "--output_file", type=str, required=True)
    args = parser.parse_args()
    data_file = args.data_file
    if data_file.endswith('.gz'):
        df = gzip.open(data_file, mode='rt', newline='')
    else:
        df = open(args.data_file)
    count = 0
    skipped = 0
    with open(args.output_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        tsv_reader = csv.reader(df, delimiter="\t")
        p_bar = tqdm(tsv_reader)
        for row in p_bar:
            n_col = len(row)
            if n_col < 7:
                skipped += 1
                continue
            caption = row[6]
            url = row[2]
            if url:
                if caption:  # wit-v1-en-filtered
                    # print(f't={len(row)}, cap={row[6]}, url={row[2]}')
                    tsv_writer.writerow([caption, url])
                    count += 1
                else:
                    # use this to include more noisy data, i.e. wit-v1-en-noisy
                    caption = row[7] if n_col > 7 else ''
                    caption += row[8] if n_col > 8 else ''
                    if caption:
                        tsv_writer.writerow([caption, url])
                        count += 1
                    else:
                        skipped += 1
            else:
                skipped += 1
            if count % 1000 == 0:
                p_bar.set_description(f"Processed {count}, skip {skipped}")
    print(f'all done, got {count}, skip {skipped}, total={count+skipped}')
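# For reference, the WIT v1 TSV columns indexed above are assumed to be laid
# out roughly as follows (hedged; check the official WIT release for the
# authoritative header):
WIT_COLUMNS = {
    0: "language",                         # filtered to "en" by the gzip/awk step earlier
    2: "image_url",                        # row[2]
    6: "caption_reference_description",    # row[6], used for wit-v1-en-filtered
    7: "caption_attribution_description",  # row[7], extra caption for wit-v1-en-noisy
    8: "caption_alt_text_description",     # row[8], extra caption for wit-v1-en-noisy
}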
# coco datasets https://cs.stanford.edu/people/karpathy/deepimagesent/
import json
import fire
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa
from collections import defaultdict


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    split = image_name.split('_')[1]
    if type(split) != str:
        raise ValueError(f'{image_name}, {split}, {type(split)}')
    image_file = Path(image_dir, split, image_name)
    if not image_file.exists():
        raise ValueError(f'{image_file} not exist!')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    coco_dataset = json.load(open(dataset_path / 'dataset_coco.json'))
    image_items = coco_dataset["images"]
    img2captions = defaultdict(list)
    img2split = dict()
    for item in tqdm(image_items):
        filename = item["filename"]
        img2split[filename] = item["split"]
        for i in item["sentences"]:
            img2captions[filename].append(i["raw"])
    train_split, val_split, test_split = {}, {}, {}
    for k, v in img2split.items():
        if v == 'train' or v == 'restval':
            train_split[k] = img2captions[k]
        elif v == 'val':
            val_split[k] = img2captions[k]
        else:
            assert v == 'test'
            test_split[k] = img2captions[k]
    return train_split, val_split, test_split


def batch_gen(data_split, image_dir='coco', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(img_file)
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split, val_split, test_split = get_data(dataset_dir)
    print('processing train split...')
    process_split(train_split, dataset_dir,
                  save_file='coco-karpathy-train.arrow', batch_size=1000)
    print('processing val split...')
    process_split(val_split, dataset_dir,
                  save_file='coco-karpathy-val.arrow', batch_size=1000)
    print('processing test split...')
    process_split(test_split, dataset_dir,
                  save_file='coco-karpathy-test.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
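# For reference, dataset_coco.json (the Karpathy-split file loaded by get_data
# above) is assumed to look roughly like this; only the fields the script
# touches are shown, and the values are illustrative:
example_dataset_coco = {
    "images": [
        {
            "filename": "COCO_val2014_000000391895.jpg",  # split folder is parsed from this name
            "split": "train",  # one of train / restval / val / test
            "sentences": [
                {"raw": "A man riding a motorcycle on a dirt road."},
                # ... typically five captions per image
            ],
        },
    ]
}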
# download datasets and images from https://visualgenome.org/api/v0/api_home.html
import json
import pyarrow as pa
from collections import defaultdict
from tqdm import tqdm
import fire
from pathlib import Path


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    img_f1 = Path(image_dir, 'VG_100K', f'{image_name}.jpg')
    img_f2 = Path(image_dir, 'VG_100K_2', f'{image_name}.jpg')
    if img_f1.exists():
        image_file = img_f1
    elif img_f2.exists():
        image_file = img_f2
    else:
        raise ValueError(f'neither VG_100K nor VG_100K_2 has {image_name}.jpg !')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    vg_dataset = json.load(open(dataset_path / 'region_descriptions.json'))
    img2captions = defaultdict(list)
    for cap in tqdm(vg_dataset):
        cap = cap["regions"]
        for c in cap:
            img2captions[c["image_id"]].append(c['phrase'])
    return img2captions


def batch_gen(data_split, image_dir='vg', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(f'{image_dir}-{img_file}')
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split = get_data(dataset_dir)
    print('processing vg train...')
    process_split(train_split, dataset_dir, save_file='vg-train.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# python read_arrow.py --arrow_file cc3m-img.arrow --iterate
import fire
import io
import time
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import pyarrow as pa


def main(arrow_file, iterate=False):
    start = time.perf_counter()
    with pa.memory_map(arrow_file, 'rb') as source:
        pa_reader = pa.ipc.open_file(source)
        loaded_array = pa_reader.read_all()
        init = time.perf_counter()
        print(f"loaded {len(loaded_array)} in {init-start:.3f} s, used mem: {pa.total_allocated_bytes() >> 20} MB")
        if iterate:
            start = time.perf_counter()
            count = 0
            for batch in tqdm(loaded_array.to_batches()):
                d = batch.to_pydict()
                for c1, c2, c3 in zip(d['id'], d['text'], d['image']):
                    count += 1
            print(f'{count} iters in {time.perf_counter()-start:.3f} s')


# main('cc3msmall.arrow', True)
# +
# from IPython.display import display  # to display images
# for i, t, img in zip(loaded_array[0], loaded_array[1], loaded_array[2]):
#     print(i, t, )
#     image = Image.open(io.BytesIO(img.as_py()))
#     display(image)
# +
# loaded_array[2][1].as_py()
# -

if __name__ == "__main__":
    fire.Fire(main)
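# A minimal random-access sketch over an .arrow file produced by the scripts
# above (the file name and row index here are just examples):
import io
import pyarrow as pa
from PIL import Image

table = pa.ipc.open_file(pa.memory_map("cc3m-img.arrow", "rb")).read_all()
idx = 0  # any row index
pair_id = table["id"][idx].as_py()
caption = table["text"][idx].as_py()  # a str here; a list of str for the coco/vg arrow files
image = Image.open(io.BytesIO(table["image"][idx].as_py()))
print(pair_id, caption, image.size)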
import json
import fire
import csv
import gzip
from tqdm.auto import tqdm
from pathlib import Path
import pyarrow as pa

schema = pa.schema([("id", pa.string()), ("text", pa.string()), ("image", pa.binary())])


def read_file(file, mod="rb"):
    with open(file, mod) as f:
        d = f.read()
    return d


def batch_gen(annotation_path, images_path, batch_size=4):
    ids, texts, imgs = [], [], []
    for anno_file in tqdm(annotation_path.glob("*.json")):
        d = json.load(open(anno_file))
        for item in tqdm(d["annotations"], leave=False):
            image_id = item["image_id"]
            subreddit = item["subreddit"]
            img_file = images_path / subreddit / f"{image_id}.jpg"
            if not img_file.exists():
                continue
            img_data = read_file(img_file)
            caption = item["caption"]
            pair_id = "-".join(["redcaps", subreddit, image_id])
            ids.append(pair_id)
            texts.append(caption)
            imgs.append(img_data)
            if len(ids) == batch_size:
                yield ids, texts, imgs
                ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process(redcaps_dir, save_file=None, images_dir=None, batch_size=1000):
    redcaps_path = Path(redcaps_dir)
    annotation_path = redcaps_path / "annotations"
    images_path = redcaps_path / "images" if images_dir is None else Path(images_dir)
    if save_file is None:
        save_path = redcaps_path.with_suffix(".arrow")
    else:
        save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(annotation_path, images_path, batch_size))
    with pa.OSFile(str(save_path), "wb") as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.string())
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f"processed {count}")
    print(f"processed {count} in total")


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# follow https://github.com/redcaps-dataset/redcaps-downloader
for ann_file in annotations/[a-c]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
for ann_file in annotations/[d-n]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
for ann_file in annotations/[o-z]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
stats:
vg            108,077      0.10M
coco          113,287      0.11M
wit-filtered  1,511,216    1.51M
sbu1m         867,217      0.87M
cc3m          2,907,484    2.91M
cc12m         10,914,482   10.91M
redcaps       11,843,894   11.84M
sbu1m loaded 867217 in 6.650 s
python convert_pyarrow.py --dataset_dir cc3m-img
processed 2907000: : 2908it [9:19:07, 11.54s/it]
python convert_pyarrow.py --dataset_dir img-datasets/wit-v1-en-filtered-img
processed 1511000: : 1512it [1:55:07, 4.57s/it]
processed 1511216 in total
python convert_pyarrow.py --dataset_dir cc12m-img
processed 10914000: : 10915it [14:38:39, 4.83s/it]
processed 10914484 in total
# 16 cores, model name: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
python read_arrow.py --arrow_file cc3m-img.arrow --iterate
loaded 2907484 in 3.985 s, used mem: 0 MB
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2908/2908 [12:18<00:00, 3.94it/s]
2907484 iters in 738.930 s
tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
tar --sort=name --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
find vg-train -type f -print0 | tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vgt.tar --null -T -
find cc3m -type f -name '*.jpg' -print0 | while read -d $'\0' image
do
    new=${image//\//-}
    echo "${new%.jpg}.txt file:${image%.jpg}.txt"
    echo "$new file:$image"
done |
sort |
tarp create -o - - |
tarp split -c 200000 -o 'cc3m-tars/cc3m-%06d.tar' -
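# A hedged sketch of consuming the shards created above with the webdataset
# package (pip install webdataset). The brace range below is an example; it
# depends on how many shards tarp split actually produced:
import webdataset as wds

dataset = (
    wds.WebDataset("cc3m-tars/cc3m-{000000..000014}.tar")
    .decode("pil")           # decode the .jpg entries to PIL images
    .to_tuple("jpg", "txt")  # yield (image, caption) pairs
)
for image, caption in dataset:
    print(image.size, caption[:50])
    break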