@dandelin, forked from csarron/convert_pyarrow.py (created April 6, 2022)
# pip install pyarrow fire tqdm
"""
crawl images:
pip install img2dataset==1.11.0

img2dataset --url_list cc3m.tsv \
    --output_folder cc3m-img --input_format "tsv" \
    --url_col "url" --caption_col "caption" \
    --output_format files --resize_mode=no \
    --processes_count 10 --thread_count 64 --number_sample_per_shard 2000 \
    --enable_wandb True --save_metadata False

img2dataset --url_list cc3m-val.tsv --input_format "tsv" \
    --url_col "url" --caption_col "caption" \
    --output_format files --output_folder cc3m-val-img \
    --image_size 512 --resize_only_if_bigger yes --resize_mode center_crop \
    --processes_count `nproc` --thread_count 16 --number_sample_per_shard 1000 \
    --enable_wandb True --wandb_project img2dataset --retries 5
# pick one of the dataset configs below, then run the img2dataset command that follows

dataset=redcaps    # redcaps and cc12m share the same settings
dataset=cc12m
shard=4000
processes=32
threads=64

dataset=cc3m
shard=2000
processes=16
threads=64

dataset=sbu1m
shard=2000
processes=16
threads=32

dataset=wit-v1-en-filtered
shard=1000
processes=8
threads=8

dataset=wit-v1-en-noisy
shard=1000
processes=8
threads=8
# keep both processes and threads at 8 for the wit datasets to avoid Wikipedia's rate limiting
img2dataset --url_list ${dataset}.tsv.gz \
    --output_folder ${dataset}-img --input_format "tsv.gz" \
    --resize_only_if_bigger True --image_size 512 \
    --url_col "url" --caption_col "caption" \
    --output_format files --resize_mode=keep_ratio \
    --processes_count ${processes} --thread_count ${threads} --number_sample_per_shard ${shard} \
    --enable_wandb True --save_metadata False --timeout 30
preprocess wit-v1:
for i in `seq 1 9`; do
    echo $i
    gzip -dc wit_v1.train.all-0000$i-of-00010.tsv.gz | awk '{ if ($1 == "en") { print } }' > wit-v1-train-en-$i.tsv
done
usage:
python convert_pyarrow.py --dataset_dir cc3m-img
python convert_pyarrow.py --dataset_dir wit-v1-en-filtered-img
python convert_pyarrow.py --dataset_dir cc12m-img
python convert_pyarrow.py --dataset_dir sbu1m
"""
import fire
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa


def iter_file(path):
    for folder in path.glob('*'):
        for file in folder.glob("*.jpg"):
            yield file


def read_file(file, mod='rb'):
    with open(file, mod) as f:
        d = f.read()
    return d


def batch_gen(ds_path, batch_size=4):
    ids, texts, imgs = [], [], []
    ds_name = ds_path.stem
    for img_file in iter_file(ds_path):
        img_data = read_file(img_file)
        pair_name = img_file.stem
        pair_id = '-'.join([ds_name, pair_name])
        txt_file = img_file.with_suffix('.txt')
        if not txt_file.exists():
            continue
        text = read_file(txt_file, 'r')
        ids.append(pair_id)
        texts.append(text)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process(dataset_dir, save_file=None, batch_size=1000):
    dataset_path = Path(dataset_dir)
    if save_file is None:
        save_path = dataset_path.with_suffix('.arrow')
    else:
        save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(dataset_path, batch_size))
    schema = pa.schema([('id', pa.string()), ('text', pa.string()), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.string())
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
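# The converter above expects img2dataset's "files" output layout, i.e. shard
# subfolders such as cc3m-img/00000/000000123.jpg with a matching .txt caption
# next to each image. A small sanity-check sketch; the function name and the
# cc3m-img default path are examples, not part of the original script:
from pathlib import Path

def count_pairs(dataset_dir="cc3m-img"):
    root = Path(dataset_dir)
    n_imgs = n_pairs = 0
    for img in root.glob("*/*.jpg"):
        n_imgs += 1
        if img.with_suffix(".txt").exists():
            n_pairs += 1
    print(f"{n_pairs}/{n_imgs} images have a caption file")
    return n_pairs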
import json
import fire
import csv
import gzip
from tqdm.auto import tqdm
from pathlib import Path


def process(redcaps_dir, out_file):
    redcaps_path = Path(redcaps_dir)
    num_files = 0
    num_pairs = 0
    with open(out_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        for anno_file in tqdm(redcaps_path.glob('*.json')):
            d = json.load(open(anno_file))
            num_files += 1
            for item in tqdm(d['annotations'], leave=False):
                url = item['url']
                caption = item['caption']
                num_pairs += 1
                tsv_writer.writerow([caption, url])
    print(f'processed {num_files} files, {num_pairs} pairs in total')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
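# For reference, each RedCaps annotation file read above is assumed to look
# roughly like this (only the fields the scripts actually use are shown; the
# values are hypothetical):
example_redcaps_annotations = {
    "annotations": [
        {
            "image_id": "abc123",
            "subreddit": "itookapicture",  # used later to locate images/<subreddit>/<image_id>.jpg
            "url": "https://i.redd.it/abc123.jpg",
            "caption": "a caption written by the reddit poster",
        },
    ]
}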
# parse wit en
# python parse.py -d wit-v1-train-en.tsv.gz -o wit-v1-en-noisy.tsv
import argparse
import csv
import gzip
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data_file", type=str, required=True)
    parser.add_argument("-o", "--output_file", type=str, required=True)
    args = parser.parse_args()
    data_file = args.data_file
    if data_file.endswith('.gz'):
        df = gzip.open(data_file, mode='rt', newline='')
    else:
        df = open(args.data_file)
    count = 0
    skipped = 0
    with open(args.output_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        tsv_reader = csv.reader(df, delimiter="\t")
        p_bar = tqdm(tsv_reader)
        for row in p_bar:
            n_col = len(row)
            if n_col < 7:
                skipped += 1
                continue
            caption = row[6]
            url = row[2]
            if url:
                if caption:  # wit-v1-en-filtered
                    # print(f't={len(row)}, cap={row[6]}, url={row[2]}')
                    tsv_writer.writerow([caption, url])
                    count += 1
                else:
                    # use this to include more noisy data, i.e. wit-v1-en-noisy
                    caption = row[7] if n_col > 7 else ''
                    caption += row[8] if n_col > 8 else ''
                    if caption:
                        tsv_writer.writerow([caption, url])
                        count += 1
                    else:
                        skipped += 1
            else:
                skipped += 1
            if count % 1000 == 0:
                p_bar.set_description(f"Processed {count}, skip {skipped}")
    print(f'all done, got {count}, skip {skipped}, total={count+skipped}')
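# For reference, the WIT v1 TSV columns indexed above are assumed to be laid
# out roughly as follows (hedged; check the official WIT release for the
# authoritative header):
WIT_COLUMNS = {
    0: "language",                         # filtered to "en" by the gzip/awk step earlier
    2: "image_url",                        # row[2]
    6: "caption_reference_description",    # row[6], used for wit-v1-en-filtered
    7: "caption_attribution_description",  # row[7], extra caption for wit-v1-en-noisy
    8: "caption_alt_text_description",     # row[8], extra caption for wit-v1-en-noisy
}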
# coco datasets https://cs.stanford.edu/people/karpathy/deepimagesent/
import json
import fire
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa
from collections import defaultdict


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    split = image_name.split('_')[1]
    if type(split) != str:
        raise ValueError(f'{image_name}, {split}, {type(split)}')
    image_file = Path(image_dir, split, image_name)
    if not image_file.exists():
        raise ValueError(f'{image_file} not exist!')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    coco_dataset = json.load(open(dataset_path / 'dataset_coco.json'))
    image_items = coco_dataset["images"]
    img2captions = defaultdict(list)
    img2split = dict()
    for item in tqdm(image_items):
        filename = item["filename"]
        img2split[filename] = item["split"]
        for i in item["sentences"]:
            img2captions[filename].append(i["raw"])
    train_split, val_split, test_split = {}, {}, {}
    for k, v in img2split.items():
        if v == 'train' or v == 'restval':
            train_split[k] = img2captions[k]
        elif v == 'val':
            val_split[k] = img2captions[k]
        else:
            assert v == 'test'
            test_split[k] = img2captions[k]
    return train_split, val_split, test_split


def batch_gen(data_split, image_dir='coco', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(img_file)
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split, val_split, test_split = get_data(dataset_dir)
    print('processing train split...')
    process_split(train_split, dataset_dir,
                  save_file='coco-karpathy-train.arrow', batch_size=1000)
    print('processing val split...')
    process_split(val_split, dataset_dir,
                  save_file='coco-karpathy-val.arrow', batch_size=1000)
    print('processing test split...')
    process_split(test_split, dataset_dir,
                  save_file='coco-karpathy-test.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
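# For reference, dataset_coco.json (the Karpathy-split file loaded by get_data
# above) is assumed to look roughly like this; only the fields the script
# touches are shown, and the values are illustrative:
example_dataset_coco = {
    "images": [
        {
            "filename": "COCO_val2014_000000391895.jpg",  # split folder is parsed from this name
            "split": "train",  # one of train / restval / val / test
            "sentences": [
                {"raw": "A man riding a motorcycle on a dirt road."},
                # ... typically five captions per image
            ],
        },
    ]
}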
# download datasets and images from https://visualgenome.org/api/v0/api_home.html
import json
import pyarrow as pa
from collections import defaultdict
from tqdm import tqdm
import fire
from pathlib import Path


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    img_f1 = Path(image_dir, 'VG_100K', f'{image_name}.jpg')
    img_f2 = Path(image_dir, 'VG_100K_2', f'{image_name}.jpg')
    if img_f1.exists():
        image_file = img_f1
    elif img_f2.exists():
        image_file = img_f2
    else:
        raise ValueError(f'neither VG_100K nor VG_100K_2 has {image_name}.jpg !')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    vg_dataset = json.load(open(dataset_path / 'region_descriptions.json'))
    img2captions = defaultdict(list)
    for cap in tqdm(vg_dataset):
        cap = cap["regions"]
        for c in cap:
            img2captions[c["image_id"]].append(c['phrase'])
    return img2captions


def batch_gen(data_split, image_dir='vg', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(f'{image_dir}-{img_file}')
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split = get_data(dataset_dir)
    print('processing vg train...')
    process_split(train_split, dataset_dir, save_file='vg-train.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# python read_arrow.py --arrow_file cc3m-img.arrow --iterate
import fire
import io
import time
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import pyarrow as pa


def main(arrow_file, iterate=False):
    start = time.perf_counter()
    with pa.memory_map(arrow_file, 'rb') as source:
        pa_reader = pa.ipc.open_file(source)
        loaded_array = pa_reader.read_all()
        init = time.perf_counter()
        print(f"loaded {len(loaded_array)} in {init-start:.3f} s, used mem: {pa.total_allocated_bytes() >> 20} MB")
        if iterate:
            start = time.perf_counter()
            count = 0
            for batch in tqdm(loaded_array.to_batches()):
                d = batch.to_pydict()
                for c1, c2, c3 in zip(d['id'], d['text'], d['image']):
                    count += 1
            print(f'{count} iters in {time.perf_counter()-start:.3f} s')


# main('cc3msmall.arrow', True)
# +
# from IPython.display import display  # to display images
# for i, t, img in zip(loaded_array[0], loaded_array[1], loaded_array[2]):
#     print(i, t, )
#     image = Image.open(io.BytesIO(img.as_py()))
#     display(image)
# +
# loaded_array[2][1].as_py()
# -

if __name__ == "__main__":
    fire.Fire(main)
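# A minimal random-access sketch over an .arrow file produced by the scripts
# above (the file name and row index here are just examples):
import io
import pyarrow as pa
from PIL import Image

table = pa.ipc.open_file(pa.memory_map("cc3m-img.arrow", "rb")).read_all()
idx = 0  # any row index
pair_id = table["id"][idx].as_py()
caption = table["text"][idx].as_py()  # a str here; a list of str for the coco/vg arrow files
image = Image.open(io.BytesIO(table["image"][idx].as_py()))
print(pair_id, caption, image.size)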
import json
import fire
import csv
import gzip
from tqdm.auto import tqdm
from pathlib import Path
import pyarrow as pa

schema = pa.schema([("id", pa.string()), ("text", pa.string()), ("image", pa.binary())])


def read_file(file, mod="rb"):
    with open(file, mod) as f:
        d = f.read()
    return d


def batch_gen(annotation_path, images_path, batch_size=4):
    ids, texts, imgs = [], [], []
    for anno_file in tqdm(annotation_path.glob("*.json")):
        d = json.load(open(anno_file))
        for item in tqdm(d["annotations"], leave=False):
            image_id = item["image_id"]
            subreddit = item["subreddit"]
            img_file = images_path / subreddit / f"{image_id}.jpg"
            if not img_file.exists():
                continue
            img_data = read_file(img_file)
            caption = item["caption"]
            pair_id = "-".join(["redcaps", subreddit, image_id])
            ids.append(pair_id)
            texts.append(caption)
            imgs.append(img_data)
            if len(ids) == batch_size:
                yield ids, texts, imgs
                ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process(redcaps_dir, save_file=None, images_dir=None, batch_size=1000):
    redcaps_path = Path(redcaps_dir)
    annotation_path = redcaps_path / "annotations"
    images_path = redcaps_path / "images" if images_dir is None else Path(images_dir)
    if save_file is None:
        save_path = redcaps_path.with_suffix(".arrow")
    else:
        save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(annotation_path, images_path, batch_size))
    with pa.OSFile(str(save_path), "wb") as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                # print(ids, texts, len(imgs))
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.string())
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f"processed {count}")
    print(f"processed {count} in total")


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# follow https://github.com/redcaps-dataset/redcaps-downloader
for ann_file in annotations/[a-c]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
for ann_file in annotations/[d-n]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
for ann_file in annotations/[o-z]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
    # Set --resize -1 to turn off resizing shorter edge (saves disk space).
done
stats:
vg            108,077      0.10M
coco          113,287      0.11M
wit-filtered  1,511,216    1.51M
sbu1m         867,217      0.87M
cc3m          2,907,484    2.91M
cc12m         10,914,482   10.91M
redcaps       11,843,894   11.84M
sbu1m loaded 867217 in 6.650 s
python convert_pyarrow.py --dataset_dir cc3m-img
processed 2907000: : 2908it [9:19:07, 11.54s/it]
python convert_pyarrow.py --dataset_dir img-datasets/wit-v1-en-filtered-img
processed 1511000: : 1512it [1:55:07, 4.57s/it]
processed 1511216 in total
python convert_pyarrow.py --dataset_dir cc12m-img
processed 10914000: : 10915it [14:38:39, 4.83s/it]
processed 10914484 in total
# 16 cores, model name: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
python read_arrow.py --arrow_file cc3m-img.arrow --iterate
loaded 2907484 in 3.985 s, used mem: 0 MB
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2908/2908 [12:18<00:00, 3.94it/s]
2907484 iters in 738.930 s
tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
tar --sort=name --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
find vg-train -type f -print0 | tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vgt.tar --null -T -
find cc3m -type f -name '*.jpg' -print0 | while read -d $'\0' image
do
    new=${image//\//-}
    echo "${new%.jpg}.txt file:${image%.jpg}.txt"
    echo "$new file:$image"
done |
sort |
tarp create -o - - |
tarp split -c 200000 -o 'cc3m-tars/cc3m-%06d.tar' -
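# A hedged sketch of consuming the shards created above with the webdataset
# package (pip install webdataset). The brace range below is an example; it
# depends on how many shards tarp split actually produced:
import webdataset as wds

dataset = (
    wds.WebDataset("cc3m-tars/cc3m-{000000..000014}.tar")
    .decode("pil")           # decode the .jpg entries to PIL images
    .to_tuple("jpg", "txt")  # yield (image, caption) pairs
)
for image, caption in dataset:
    print(image.size, caption[:50])
    break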