@albanie
Created May 13, 2021 09:12
"""A minimal profiler to compare speed differences between reading a video with opencv
and DALI.
For both dataloaders, the goal is to read an mp4 video and then load batches of frames
onto the GPU.
"""
import os
import logging
import time
import subprocess
import argparse
import platform
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import cv2
import numpy as np
import torch
import nvidia.dali
import nvidia.dali.fn as fn
from nvidia.dali import pipeline_def
from nvidia.dali.plugin.pytorch import DALIGenericIterator


def fetch_and_prep_movie(
    url: str,
    dest_path: Path,
    refresh: bool,
    height: int,
    width: int,
    trim: str,
):
    """Fetch a video from the web and prepare it for profiling.

    Args:
        url: the location of the video
        dest_path: location where the prepared video will be stored
        refresh: whether to overwrite an existing video
        height: the height (in pixels) that the video will be resized to
        width: the width (in pixels) that the video will be resized to
        trim: the video will be cropped to this duration, specified as an ffmpeg-style
            formatted string (i.e. the `HH:MM:SS.MMM` format)
    """
    if dest_path.exists() and not refresh:
        print(f"Found existing sintel movie at {dest_path}")
        return
    print(f"Fetching original sintel movie from {url}")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    raw_video = dest_path.parent / f"{dest_path.stem}-raw{dest_path.suffix}"
    os.system(f"wget {url} -O {raw_video}")
    # Resize and trim the raw download with ffmpeg (the scale filter expects WxH)
    prep = f'ffmpeg -y -i {raw_video} -vf "scale={width}x{height}" -to {trim} {dest_path}'
    os.system(prep)
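
# With the defaults in main() below (height=width=256, trim="00:03:00"), the prep command
# above expands to something like:
#   ffmpeg -y -i data/dali_mwp/sintel-raw.mp4 -vf "scale=256x256" -to 00:03:00 data/dali_mwp/sintel.mp4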


def mini_cv2_dataloader(
    video_path: Path,
    batch_size: int,
    sequence_length: int,
    height: int,
    width: int,
):
    """A minimalist opencv2 dataloader.

    Args:
        video_path: location of the video to be read
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        height: the height (in pixels) of the video
        width: the width (in pixels) of the video

    Yields:
        A batch of video frames, stored as a pytorch tensor on the GPU in
        (batch, sequence, height, width, channel) layout.

    NOTE: This is not meant to be in any way optimised - it's the simplest way
    to load frames into a tensor (that I could think of).
    """
    video_capture = cv2.VideoCapture(str(video_path))
    while video_capture.isOpened():
        next_batch = torch.zeros((batch_size, sequence_length, height, width, 3),
                                 device="cuda")
        for batch_idx in range(batch_size):
            for seq_idx in range(sequence_length):
                _, im = video_capture.read()
                if im is None:
                    return
                # the assignment below copies the frame onto the GPU (to match DALI outputs)
                gpu_im = torch.from_numpy(im)
                next_batch[batch_idx, seq_idx] = gpu_im
        yield [{"data": next_batch}]  # mimic DALI pipeline output


def profile(
    mode: str,
    video_path: Path,
    test_vid_width: int,
    test_vid_height: int,
    batch_size: int,
    sequence_length: int,
    num_threads: int,
    initial_prefetch_size: int,
    warmup: int,
):
    """Profile video readers.

    Args:
        mode: whether to use cv2 or dali as the data loader
        video_path: location of the video to be read
        test_vid_width: the width (in pixels) of the video
        test_vid_height: the height (in pixels) of the video
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        num_threads: the number of threads to use for reading (DALI only)
        initial_prefetch_size: DALI dataloader parameter
        warmup: how many iterations to run before starting to profile speed

    Returns:
        The average speed (in frames per second) at which frames were read.
    """
    if mode == "cv2":
        dataloader = mini_cv2_dataloader(
            video_path=video_path,
            batch_size=batch_size,
            sequence_length=sequence_length,
            width=test_vid_width,
            height=test_vid_height,
        )
    elif mode == "dali":
        @pipeline_def
        def video_pipe():
            video = fn.readers.video(
                name="video",
                device="gpu",
                filenames=[str(video_path)],
                sequence_length=sequence_length,
                shard_id=0,
                num_shards=1,
                random_shuffle=False,
                pad_last_batch=True,
                initial_fill=initial_prefetch_size,
                stride=1,
                step=sequence_length,  # non-overlapping clips of `sequence_length` frames
            )
            return video

        pipe = video_pipe(  # pylint: disable=unexpected-keyword-arg
            batch_size=batch_size,
            prefetch_queue_depth=1,
            num_threads=num_threads,
            device_id=0,
            seed=0,
        )
        pipe.build()
        dataloader = DALIGenericIterator(pipe, ["data"], reader_name="video")

    # Compare video reading speed
    for ii, batch in enumerate(dataloader):
        if ii == warmup:
            start = time.time()
            total_frames = 0
        if ii > warmup:
            total_time = time.time() - start
            shape = batch[0]["data"].shape
            total_frames += shape[0] * shape[1]
            avg_hz = total_frames / max(total_time, 1E-5)
            if ii % 10 == 0:
                print(f"Speed {avg_hz:.1f} hz, {shape}")
    return avg_hz
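

# A minimal standalone check of the DALI branch above (a sketch, assuming the video has
# already been prepared by fetch_and_prep_movie and sits at data/dali_mwp/sintel.mp4):
#
#   @pipeline_def(batch_size=1, num_threads=1, device_id=0)
#   def check_pipe():
#       return fn.readers.video(name="video", device="gpu",
#                               filenames=["data/dali_mwp/sintel.mp4"], sequence_length=8)
#
#   pipe = check_pipe()
#   pipe.build()
#   loader = DALIGenericIterator(pipe, ["data"], reader_name="video")
#   print(next(loader)[0]["data"].shape)  # e.g. torch.Size([1, 8, 256, 256, 3])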


def main():
    # pylint: disable=line-too-long
    # flake8: noqa: E501
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_url", default="http://peach.themazzone.com/durian/movies/sintel-1024-surround.mp4")
    parser.add_argument("--video_path", type=Path, default="data/dali_mwp/sintel.mp4")
    parser.add_argument("--batch_sizes", type=int, nargs="+", default=[1, 10])
    parser.add_argument("--log_dir", type=Path, default="data/dali_mwp/logs")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--modes", type=str, nargs="+", default=["cv2", "dali"])
    parser.add_argument("--test_vid_height", type=int, default=256)
    parser.add_argument("--test_vid_width", type=int, default=256)
    parser.add_argument("--test_vid_duration", type=str, default="00:03:00")
    parser.add_argument("--num_threads", type=int, nargs="+", default=[1, 4, 8, 16])
    parser.add_argument("--sequence_lengths", nargs="+", type=int, default=[8, 16, 32, 64])
    parser.add_argument("--num_runs", type=int, default=3)
    parser.add_argument("--initial_prefetch_size", type=int, default=0)
    args = parser.parse_args()

    # fetch the sintel movie for profiling
    fetch_and_prep_movie(
        url=args.src_url,
        dest_path=args.video_path,
        refresh=args.refresh,
        width=args.test_vid_width,
        height=args.test_vid_height,
        trim=args.test_vid_duration,
    )

    # Set up logging
    log_name = f"timing-log-{datetime.now().strftime('%y-%m-%d_%H:%M:%S')}.txt"
    args.log_dir.mkdir(exist_ok=True, parents=True)
    handlers = [logging.FileHandler(str(args.log_dir / log_name)), logging.StreamHandler()]
    logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=handlers)
    logging.info(f"Launched profiler with args:\n{args}")

    # Compute some stats about how many runs will be profiled
    cv2_runs = len(args.batch_sizes) * len(args.sequence_lengths) * args.num_runs
    dali_runs = cv2_runs * len(args.num_threads)
    total_runs = cv2_runs + dali_runs
    logging.info(f"Running the profiler on {total_runs} configurations")

    # Keep track of the current run
    curr_run_idx = 1

    # Store timing results in a defaultdict, with one list of timings per configuration,
    # to make it easy to average across runs
    results = defaultdict(list)

    # First, profile video loading with opencv. Since this has fewer configuration options
    # than dali, we run it as a separate loop
    mode = "cv2"
    for batch_size in args.batch_sizes:
        for sequence_length in args.sequence_lengths:
            for _ in range(args.num_runs):
                tag = f"{mode}-batch_size-{batch_size}-seq_len-{sequence_length}"
                print(f"{tag} run {len(results[tag])} [{curr_run_idx}/{total_runs}]")
                avg_hz = profile(
                    video_path=args.video_path,
                    test_vid_width=args.test_vid_width,
                    test_vid_height=args.test_vid_height,
                    batch_size=batch_size,
                    sequence_length=sequence_length,
                    num_threads=1,
                    initial_prefetch_size=args.initial_prefetch_size,
                    warmup=args.warmup,
                    mode=mode,
                )
                results[tag].append(avg_hz)
                curr_run_idx += 1

    # Second, repeat the same configurations with DALI, with the addition of an extra
    # set of configurations using different numbers of threads
    mode = "dali"
    for batch_size in args.batch_sizes:
        for sequence_length in args.sequence_lengths:
            for _ in range(args.num_runs):
                for num_thread in args.num_threads:
                    tag = (f"{mode}-batch_size-{batch_size}-seq_len-{sequence_length}"
                           f"-num_threads-{num_thread}")
                    print(f"{tag} run {len(results[tag])} [{curr_run_idx}/{total_runs}]")
                    avg_hz = profile(
                        video_path=args.video_path,
                        test_vid_width=args.test_vid_width,
                        test_vid_height=args.test_vid_height,
                        batch_size=batch_size,
                        sequence_length=sequence_length,
                        num_threads=num_thread,
                        initial_prefetch_size=args.initial_prefetch_size,
                        warmup=args.warmup,
                        mode=mode,
                    )
                    results[tag].append(avg_hz)
                    curr_run_idx += 1

    logging.info("===========================================")
    logging.info("Hardware summary (note, only using one GPU)")
    logging.info(subprocess.check_output(["nvidia-smi", "-L"]))
    logging.info("===========================================")
    logging.info("Platform summary:")
    logging.info(platform.sys.version)
    logging.info("===========================================")
    logging.info(f"DALI version: {nvidia.dali.__version__}")
    logging.info(f"Torch cuda: {torch.version.cuda}")
    logging.info("===========================================")
    logging.info(f"Timing results (averages across {args.num_runs}):")
    for tag, speeds in results.items():
        logging.info(f"{tag}: {np.mean(speeds):.1f} Hz +/- {np.std(speeds):.1f}")


if __name__ == "__main__":
    main()
albanie commented May 13, 2021

Outputs:

===========================================
Hardware summary (note, only using one GPU)
b'GPU 0: GeForce GTX 1080 Ti (UUID: GPU-a1823874-987c-b285-32e6-f91b3dd502eb)\nGPU 1: GeForce GTX 1080 Ti (UUID: GPU-f73e44b8-6002-e665-81f1-bdf9f0270663)\nGPU 2: GeForce GTX 1080 Ti (UUID: GPU-82c2b8d4-4b3d-e086-bf34-ad1ee7f95632)\nGPU 3: GeForce GTX 1080 Ti (UUID: GPU-4904b74a-b7eb-d607-89e2-3dc19af768e0)\n'
===========================================
Platform summary:
3.7.7 (default, May  7 2020, 21:25:33) 
[GCC 7.3.0]
===========================================
DALI version: 1.1.0
Torch cuda: 10.1
===========================================
Timing results (averages across 3):
cv2-batch_size-1-seq_len-8: 1956.2 Hz +/- 57.9
cv2-batch_size-1-seq_len-16: 1973.8 Hz +/- 39.0
cv2-batch_size-1-seq_len-32: 2099.1 Hz +/- 55.0
cv2-batch_size-1-seq_len-64: 1989.5 Hz +/- 22.4
cv2-batch_size-10-seq_len-8: 2014.3 Hz +/- 37.7
cv2-batch_size-10-seq_len-16: 2086.0 Hz +/- 13.3
cv2-batch_size-10-seq_len-32: 2089.2 Hz +/- 27.8
cv2-batch_size-10-seq_len-64: 2081.2 Hz +/- 41.6
dali-batch_size-1-seq_len-8-num_threads-1: 380.8 Hz +/- 0.8
dali-batch_size-1-seq_len-8-num_threads-4: 380.4 Hz +/- 0.7
dali-batch_size-1-seq_len-8-num_threads-8: 378.9 Hz +/- 0.2
dali-batch_size-1-seq_len-8-num_threads-16: 380.0 Hz +/- 0.5
dali-batch_size-1-seq_len-16-num_threads-1: 707.7 Hz +/- 2.3
dali-batch_size-1-seq_len-16-num_threads-4: 706.8 Hz +/- 1.4
dali-batch_size-1-seq_len-16-num_threads-8: 705.7 Hz +/- 1.3
dali-batch_size-1-seq_len-16-num_threads-16: 703.5 Hz +/- 0.8
dali-batch_size-1-seq_len-32-num_threads-1: 1233.7 Hz +/- 2.4
dali-batch_size-1-seq_len-32-num_threads-4: 1229.5 Hz +/- 1.5
dali-batch_size-1-seq_len-32-num_threads-8: 1231.5 Hz +/- 1.9
dali-batch_size-1-seq_len-32-num_threads-16: 1231.0 Hz +/- 4.0
dali-batch_size-1-seq_len-64-num_threads-1: 2027.1 Hz +/- 15.8
dali-batch_size-1-seq_len-64-num_threads-4: 2017.5 Hz +/- 20.7
dali-batch_size-1-seq_len-64-num_threads-8: 2004.3 Hz +/- 6.0
dali-batch_size-1-seq_len-64-num_threads-16: 2003.3 Hz +/- 12.8
dali-batch_size-10-seq_len-8-num_threads-1: 373.1 Hz +/- 1.1
dali-batch_size-10-seq_len-8-num_threads-4: 372.1 Hz +/- 0.1
dali-batch_size-10-seq_len-8-num_threads-8: 373.0 Hz +/- 1.2
dali-batch_size-10-seq_len-8-num_threads-16: 372.0 Hz +/- 0.2
dali-batch_size-10-seq_len-16-num_threads-1: 719.7 Hz +/- 0.7
dali-batch_size-10-seq_len-16-num_threads-4: 720.3 Hz +/- 1.1
dali-batch_size-10-seq_len-16-num_threads-8: 720.4 Hz +/- 0.7
dali-batch_size-10-seq_len-16-num_threads-16: 719.5 Hz +/- 0.5
dali-batch_size-10-seq_len-32-num_threads-1: 1300.1 Hz +/- 2.8
dali-batch_size-10-seq_len-32-num_threads-4: 1304.8 Hz +/- 3.0
dali-batch_size-10-seq_len-32-num_threads-8: 1300.0 Hz +/- 2.1
dali-batch_size-10-seq_len-32-num_threads-16: 1301.2 Hz +/- 3.0
dali-batch_size-10-seq_len-64-num_threads-1: 2058.8 Hz +/- 3.7
dali-batch_size-10-seq_len-64-num_threads-4: 2053.5 Hz +/- 3.6
dali-batch_size-10-seq_len-64-num_threads-8: 2059.3 Hz +/- 5.4
dali-batch_size-10-seq_len-64-num_threads-16: 2065.4 Hz +/- 10.5
