@albanie
Created May 13, 2021 09:12
"""A minimal profiler to compare speed differences between reading a video with opencv
and DALI.
For both dataloaders, the goal is to read an mp4 video and then load batches of frames
onto the GPU.
"""
import os
import logging
import time
import subprocess
import argparse
import platform
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import cv2
import numpy as np
import torch
import nvidia.dali
import nvidia.dali.fn as fn
from nvidia.dali import pipeline_def
from nvidia.dali.plugin.pytorch import DALIGenericIterator


def fetch_and_prep_movie(
    url: str,
    dest_path: Path,
    refresh: bool,
    height: int,
    width: int,
    trim: str,
):
    """Fetch a video from the web and prepare it for profiling.

    Args:
        url: the location of the video
        dest_path: location where the prepared video will be stored
        refresh: whether to overwrite an existing video
        height: the height (in pixels) that the video will be resized to
        width: the width (in pixels) that the video will be resized to
        trim: the video will be cropped to this duration, specified as an ffmpeg-style
            formatted string (i.e. the `HH:MM:SS.MMM` format)
    """
    if dest_path.exists() and not refresh:
        print(f"Found existing sintel movie at {dest_path}")
        return
    print(f"Fetching original sintel movie from {url}")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    raw_video = dest_path.parent / f"{dest_path.stem}-raw{dest_path.suffix}"
    os.system(f"wget {url} -O {raw_video}")
    # Resize and trim the raw download with ffmpeg (the scale filter expects WxH)
    prep = f'ffmpeg -y -i {raw_video} -vf "scale={width}x{height}" -to {trim} {dest_path}'
    os.system(prep)
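
# With the defaults in main() below (height=width=256, trim="00:03:00"), the prep command
# above expands to something like:
#   ffmpeg -y -i data/dali_mwp/sintel-raw.mp4 -vf "scale=256x256" -to 00:03:00 data/dali_mwp/sintel.mp4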


def mini_cv2_dataloader(
    video_path: Path,
    batch_size: int,
    sequence_length: int,
    height: int,
    width: int,
):
    """A minimalist opencv2 dataloader.

    Args:
        video_path: location of the video to be read
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        height: the height (in pixels) of the video
        width: the width (in pixels) of the video

    Yields:
        A batch of video frames, stored as a pytorch tensor on the GPU in
        (batch, sequence, height, width, channel) layout.

    NOTE: This is not meant to be in any way optimised - it's the simplest way
    to load frames into a tensor (that I could think of).
    """
    video_capture = cv2.VideoCapture(str(video_path))
    while video_capture.isOpened():
        next_batch = torch.zeros((batch_size, sequence_length, height, width, 3),
                                 device="cuda")
        for batch_idx in range(batch_size):
            for seq_idx in range(sequence_length):
                _, im = video_capture.read()
                if im is None:
                    return
                # the assignment below copies the frame onto the GPU (to match DALI outputs)
                gpu_im = torch.from_numpy(im)
                next_batch[batch_idx, seq_idx] = gpu_im
        yield [{"data": next_batch}]  # mimic DALI pipeline output


def profile(
    mode: str,
    video_path: Path,
    test_vid_width: int,
    test_vid_height: int,
    batch_size: int,
    sequence_length: int,
    num_threads: int,
    initial_prefetch_size: int,
    warmup: int,
):
    """Profile video readers.

    Args:
        mode: whether to use cv2 or dali as the data loader
        video_path: location of the video to be read
        test_vid_width: the width (in pixels) of the video
        test_vid_height: the height (in pixels) of the video
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        num_threads: the number of threads to use for reading (DALI only)
        initial_prefetch_size: DALI dataloader parameter
        warmup: how many iterations to run before starting to profile speed

    Returns:
        The average speed (in frames per second) at which frames were read.
    """
    if mode == "cv2":
        dataloader = mini_cv2_dataloader(
            video_path=video_path,
            batch_size=batch_size,
            sequence_length=sequence_length,
            width=test_vid_width,
            height=test_vid_height,
        )
    elif mode == "dali":
        @pipeline_def
        def video_pipe():
            video = fn.readers.video(
                name="video",
                device="gpu",
                filenames=[str(video_path)],
                sequence_length=sequence_length,
                shard_id=0,
                num_shards=1,
                random_shuffle=False,
                pad_last_batch=True,
                initial_fill=initial_prefetch_size,
                stride=1,
                step=sequence_length,  # non-overlapping clips of `sequence_length` frames
            )
            return video

        pipe = video_pipe(  # pylint: disable=unexpected-keyword-arg
            batch_size=batch_size,
            prefetch_queue_depth=1,
            num_threads=num_threads,
            device_id=0,
            seed=0,
        )
        pipe.build()
        dataloader = DALIGenericIterator(pipe, ["data"], reader_name="video")

    # Compare video reading speed
    for ii, batch in enumerate(dataloader):
        if ii == warmup:
            start = time.time()
            total_frames = 0
        if ii > warmup:
            total_time = time.time() - start
            shape = batch[0]["data"].shape
            total_frames += shape[0] * shape[1]
            avg_hz = total_frames / max(total_time, 1E-5)
            if ii % 10 == 0:
                print(f"Speed {avg_hz:.1f} hz, {shape}")
    return avg_hz
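

# A minimal standalone check of the DALI branch above (a sketch, assuming the video has
# already been prepared by fetch_and_prep_movie and sits at data/dali_mwp/sintel.mp4):
#
#   @pipeline_def(batch_size=1, num_threads=1, device_id=0)
#   def check_pipe():
#       return fn.readers.video(name="video", device="gpu",
#                               filenames=["data/dali_mwp/sintel.mp4"], sequence_length=8)
#
#   pipe = check_pipe()
#   pipe.build()
#   loader = DALIGenericIterator(pipe, ["data"], reader_name="video")
#   print(next(loader)[0]["data"].shape)  # e.g. torch.Size([1, 8, 256, 256, 3])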


def main():
    # pylint: disable=line-too-long
    # flake8: noqa: E501
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_url", default="http://peach.themazzone.com/durian/movies/sintel-1024-surround.mp4")
    parser.add_argument("--video_path", type=Path, default="data/dali_mwp/sintel.mp4")
    parser.add_argument("--batch_sizes", type=int, nargs="+", default=[1, 10])
    parser.add_argument("--log_dir", type=Path, default="data/dali_mwp/logs")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--modes", type=str, nargs="+", default=["cv2", "dali"])
    parser.add_argument("--test_vid_height", type=int, default=256)
    parser.add_argument("--test_vid_width", type=int, default=256)
    parser.add_argument("--test_vid_duration", type=str, default="00:03:00")
    parser.add_argument("--num_threads", type=int, nargs="+", default=[1, 4, 8, 16])
    parser.add_argument("--sequence_lengths", nargs="+", type=int, default=[8, 16, 32, 64])
    parser.add_argument("--num_runs", type=int, default=3)
    parser.add_argument("--initial_prefetch_size", type=int, default=0)
    args = parser.parse_args()

    # fetch the sintel movie for profiling
    fetch_and_prep_movie(
        url=args.src_url,
        dest_path=args.video_path,
        refresh=args.refresh,
        width=args.test_vid_width,
        height=args.test_vid_height,
        trim=args.test_vid_duration,
    )

    # Set up logging
    log_name = f"timing-log-{datetime.now().strftime('%y-%m-%d_%H:%M:%S')}.txt"
    args.log_dir.mkdir(exist_ok=True, parents=True)
    handlers = [logging.FileHandler(str(args.log_dir / log_name)), logging.StreamHandler()]
    logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=handlers)
    logging.info(f"Launched profiler with args:\n{args}")

    # Compute some stats about how many runs will be profiled
    cv2_runs = len(args.batch_sizes) * len(args.sequence_lengths) * args.num_runs
    dali_runs = cv2_runs * len(args.num_threads)
    total_runs = cv2_runs + dali_runs
    logging.info(f"Running the profiler on {total_runs} configurations")

    # Keep track of the current run
    curr_run_idx = 1

    # Store timing results in a defaultdict, with one list of timings per configuration,
    # to make it easy to average across runs
    results = defaultdict(list)

    # First, profile video loading with opencv. Since this has fewer configuration options
    # than dali, we run it as a separate loop
    mode = "cv2"
    for batch_size in args.batch_sizes:
        for sequence_length in args.sequence_lengths:
            for _ in range(args.num_runs):
                tag = f"{mode}-batch_size-{batch_size}-seq_len-{sequence_length}"
                print(f"{tag} run {len(results[tag])} [{curr_run_idx}/{total_runs}]")
                avg_hz = profile(
                    video_path=args.video_path,
                    test_vid_width=args.test_vid_width,
                    test_vid_height=args.test_vid_height,
                    batch_size=batch_size,
                    sequence_length=sequence_length,
                    num_threads=1,
                    initial_prefetch_size=args.initial_prefetch_size,
                    warmup=args.warmup,
                    mode=mode,
                )
                results[tag].append(avg_hz)
                curr_run_idx += 1

    # Second, repeat the same configurations with DALI, with the addition of an extra
    # set of configurations using different numbers of threads
    mode = "dali"
    for batch_size in args.batch_sizes:
        for sequence_length in args.sequence_lengths:
            for _ in range(args.num_runs):
                for num_thread in args.num_threads:
                    tag = (f"{mode}-batch_size-{batch_size}-seq_len-{sequence_length}"
                           f"-num_threads-{num_thread}")
                    print(f"{tag} run {len(results[tag])} [{curr_run_idx}/{total_runs}]")
                    avg_hz = profile(
                        video_path=args.video_path,
                        test_vid_width=args.test_vid_width,
                        test_vid_height=args.test_vid_height,
                        batch_size=batch_size,
                        sequence_length=sequence_length,
                        num_threads=num_thread,
                        initial_prefetch_size=args.initial_prefetch_size,
                        warmup=args.warmup,
                        mode=mode,
                    )
                    results[tag].append(avg_hz)
                    curr_run_idx += 1

    logging.info("===========================================")
    logging.info("Hardware summary (note, only using one GPU)")
    logging.info(subprocess.check_output(["nvidia-smi", "-L"]))
    logging.info("===========================================")
    logging.info("Platform summary:")
    logging.info(platform.sys.version)
    logging.info("===========================================")
    logging.info(f"DALI version: {nvidia.dali.__version__}")
    logging.info(f"Torch cuda: {torch.version.cuda}")
    logging.info("===========================================")
    logging.info(f"Timing results (averages across {args.num_runs}):")
    for tag, speeds in results.items():
        logging.info(f"{tag}: {np.mean(speeds):.1f} Hz +/- {np.std(speeds):.1f}")


if __name__ == "__main__":
    main()
albanie commented May 13, 2021

Outputs:

===========================================
Hardware summary (note, only using one GPU)
b'GPU 0: GeForce GTX 1080 Ti (UUID: GPU-a1823874-987c-b285-32e6-f91b3dd502eb)\nGPU 1: GeForce GTX 1080 Ti (UUID: GPU-f73e44b8-6002-e665-81f1-bdf9f0270663)\nGPU 2: GeForce GTX 1080 Ti (UUID: GPU-82c2b8d4-4b3d-e086-bf34-ad1ee7f95632)\nGPU 3: GeForce GTX 1080 Ti (UUID: GPU-4904b74a-b7eb-d607-89e2-3dc19af768e0)\n'
===========================================
Platform summary:
3.7.7 (default, May  7 2020, 21:25:33) 
[GCC 7.3.0]
===========================================
DALI version: 1.1.0
Torch cuda: 10.1
===========================================
Timing results (averages across 3):
cv2-batch_size-1-seq_len-8: 1956.2 Hz +/- 57.9
cv2-batch_size-1-seq_len-16: 1973.8 Hz +/- 39.0
cv2-batch_size-1-seq_len-32: 2099.1 Hz +/- 55.0
cv2-batch_size-1-seq_len-64: 1989.5 Hz +/- 22.4
cv2-batch_size-10-seq_len-8: 2014.3 Hz +/- 37.7
cv2-batch_size-10-seq_len-16: 2086.0 Hz +/- 13.3
cv2-batch_size-10-seq_len-32: 2089.2 Hz +/- 27.8
cv2-batch_size-10-seq_len-64: 2081.2 Hz +/- 41.6
dali-batch_size-1-seq_len-8-num_threads-1: 380.8 Hz +/- 0.8
dali-batch_size-1-seq_len-8-num_threads-4: 380.4 Hz +/- 0.7
dali-batch_size-1-seq_len-8-num_threads-8: 378.9 Hz +/- 0.2
dali-batch_size-1-seq_len-8-num_threads-16: 380.0 Hz +/- 0.5
dali-batch_size-1-seq_len-16-num_threads-1: 707.7 Hz +/- 2.3
dali-batch_size-1-seq_len-16-num_threads-4: 706.8 Hz +/- 1.4
dali-batch_size-1-seq_len-16-num_threads-8: 705.7 Hz +/- 1.3
dali-batch_size-1-seq_len-16-num_threads-16: 703.5 Hz +/- 0.8
dali-batch_size-1-seq_len-32-num_threads-1: 1233.7 Hz +/- 2.4
dali-batch_size-1-seq_len-32-num_threads-4: 1229.5 Hz +/- 1.5
dali-batch_size-1-seq_len-32-num_threads-8: 1231.5 Hz +/- 1.9
dali-batch_size-1-seq_len-32-num_threads-16: 1231.0 Hz +/- 4.0
dali-batch_size-1-seq_len-64-num_threads-1: 2027.1 Hz +/- 15.8
dali-batch_size-1-seq_len-64-num_threads-4: 2017.5 Hz +/- 20.7
dali-batch_size-1-seq_len-64-num_threads-8: 2004.3 Hz +/- 6.0
dali-batch_size-1-seq_len-64-num_threads-16: 2003.3 Hz +/- 12.8
dali-batch_size-10-seq_len-8-num_threads-1: 373.1 Hz +/- 1.1
dali-batch_size-10-seq_len-8-num_threads-4: 372.1 Hz +/- 0.1
dali-batch_size-10-seq_len-8-num_threads-8: 373.0 Hz +/- 1.2
dali-batch_size-10-seq_len-8-num_threads-16: 372.0 Hz +/- 0.2
dali-batch_size-10-seq_len-16-num_threads-1: 719.7 Hz +/- 0.7
dali-batch_size-10-seq_len-16-num_threads-4: 720.3 Hz +/- 1.1
dali-batch_size-10-seq_len-16-num_threads-8: 720.4 Hz +/- 0.7
dali-batch_size-10-seq_len-16-num_threads-16: 719.5 Hz +/- 0.5
dali-batch_size-10-seq_len-32-num_threads-1: 1300.1 Hz +/- 2.8
dali-batch_size-10-seq_len-32-num_threads-4: 1304.8 Hz +/- 3.0
dali-batch_size-10-seq_len-32-num_threads-8: 1300.0 Hz +/- 2.1
dali-batch_size-10-seq_len-32-num_threads-16: 1301.2 Hz +/- 3.0
dali-batch_size-10-seq_len-64-num_threads-1: 2058.8 Hz +/- 3.7
dali-batch_size-10-seq_len-64-num_threads-4: 2053.5 Hz +/- 3.6
dali-batch_size-10-seq_len-64-num_threads-8: 2059.3 Hz +/- 5.4
dali-batch_size-10-seq_len-64-num_threads-16: 2065.4 Hz +/- 10.5
