# reproduce cupy runtime error with pytorch

import torch
import cupy as cp
import numpy as np


def fp16_clamp(x, min=None, max=None):
    if not x.is_cuda and x.dtype == torch.float16:
        # clamp for cpu float16; fp16 tensors on cpu have no clamp implementation
        return x.float().clamp(min, max).half()
    return x.clamp(min, max)
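

# Minimal usage sketch (added as an illustration, not part of the original gist):
# fp16_clamp only takes the .float() round-trip for half-precision tensors on the CPU,
# which per the comment above lack a clamp implementation; CUDA and float32 tensors go
# through tensor.clamp() directly.
#
# >>> fp16_clamp(torch.tensor([-1.0, 0.5, 2.0]), min=0, max=1)          # float32 path
# >>> fp16_clamp(torch.tensor([-1.0, 0.5, 2.0]).half(), min=0, max=1)   # cpu fp16 path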


def torch_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.

    FP16 contributed by https://github.com/open-mmlab/mmdetection/pull/4889

    Note:
        Assume bboxes1 is M x 4 and bboxes2 is N x 4. When mode is 'iou',
        several new variables are generated when calculating IoU with the
        bbox_overlaps function:

        1) is_aligned is False
            area1: M x 1
            area2: N x 1
            lt: M x N x 2
            rb: M x N x 2
            wh: M x N x 2
            overlap: M x N x 1
            union: M x N x 1
            ious: M x N x 1

            Total memory:
                S = (9 x N x M + N + M) * 4 Byte
            When using FP16, we can reduce:
                R = (9 x N x M + N + M) * 4 / 2 Byte
            R > (N + M) * 4 * 2 always holds when N and M >= 1, since
            N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and
            N + 1 < 3 * N when N or M is 1.

            Given M = 40 (ground truths) and N = 400000 (three anchor boxes
            per grid, FPN, R-CNNs),
                R = 275 MB
            In a special case (dense detection) with M = 512 (ground truths),
                R = 3516 MB = 3.43 GB
            With batch size B, the reduction becomes B x R, so CUDA memory
            runs out frequently without it.

            Experiments on a GeForce RTX 2080 Ti (11019 MiB):

            | dtype | M   | N      | Use      | Real     | Ideal    |
            |:-----:|:---:|:------:|:--------:|:--------:|:--------:|
            | FP32  | 512 | 400000 | 8020 MiB | --       | --       |
            | FP16  | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
            | FP32  | 40  | 400000 | 1540 MiB | --       | --       |
            | FP16  | 40  | 400000 | 1264 MiB | 276 MiB  | 275 MiB  |

        2) is_aligned is True
            area1: N x 1
            area2: N x 1
            lt: N x 2
            rb: N x 2
            wh: N x 2
            overlap: N x 1
            union: N x 1
            ious: N x 1

            Total memory:
                S = 11 x N * 4 Byte
            When using FP16, we can reduce:
                R = 11 x N * 4 / 2 Byte

        The same applies to 'giou' (which uses more memory than 'iou').

        Time-wise, FP16 is generally faster than FP32. When gpu_assign_thr
        is not -1, it takes more time on the CPU but does not reduce memory.
        Therefore, we can halve the memory while keeping the speed.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 4)
        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new(batch_shape + (rows, ))
        else:
            return bboxes1.new(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = torch.max(bboxes1[..., :, None, :2],
                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = torch.min(bboxes1[..., :, None, 2:],
                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
                                    bboxes2[..., None, :, :2])
            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
                                    bboxes2[..., None, :, 2:])

    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
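

# Worked check of the memory estimate in the docstring above (a sketch added for
# illustration; the helper name and defaults are assumptions, not part of the repro):
# in the unaligned case S = (9*M*N + M + N) * 4 bytes is allocated in FP32, and FP16
# saves roughly S / 2.
def _estimate_unaligned_overlap_memory(M=40, N=400000):
    fp32_bytes = (9 * M * N + M + N) * 4
    fp16_saving_bytes = fp32_bytes / 2
    # for M=40, N=400000 this gives ~550.8 MiB allocated and ~275.4 MiB saved,
    # matching the "R = 275 MB" figure quoted in the docstring
    return fp32_bytes / 2**20, fp16_saving_bytes / 2**20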


def cupy_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes (cupy port of the above).

    Args:
        bboxes1 (cupy.ndarray): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (cupy.ndarray): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        cupy.ndarray: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2]
    cols = bboxes2.shape[-2]
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return cp.empty(batch_shape + (rows, ))
        else:
            return cp.empty(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = cp.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = cp.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        # cupy arrays have neither .is_cuda nor .clamp, so cp.clip replaces the
        # torch-only fp16_clamp used in torch_bbox_overlaps
        wh = cp.clip(rb - lt, 0, None)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = cp.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = cp.maximum(bboxes1[..., :, None, :2],
                        bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = cp.minimum(bboxes1[..., :, None, 2:],
                        bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = cp.clip(rb - lt, 0, None)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :, None, :2],
                                     bboxes2[..., None, :, :2])
            enclosed_rb = cp.maximum(bboxes1[..., :, None, 2:],
                                     bboxes2[..., None, :, 2:])

    eps = cp.array([eps], dtype=cp.float32)
    union = cp.maximum(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = cp.clip(enclosed_rb - enclosed_lt, 0, None)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = cp.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
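

# Sanity-check sketch (an addition for illustration; it is deliberately left uncalled so
# it does not change the GPU call ordering this gist is trying to reproduce): on small,
# well-formed boxes the cupy and torch IoU implementations above should agree.
def _check_iou_parity():
    boxes1 = np.array([[0, 0, 10, 10], [10, 10, 20, 20]], dtype=np.float32)
    boxes2 = np.array([[0, 0, 10, 20], [10, 10, 20, 20]], dtype=np.float32)
    torch_iou = torch_bbox_overlaps(torch.from_numpy(boxes1), torch.from_numpy(boxes2))
    cupy_iou = cupy_bbox_overlaps(cp.asarray(boxes1), cp.asarray(boxes2))
    assert np.allclose(torch_iou.numpy(), cp.asnumpy(cupy_iou), atol=1e-5)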


def cupy_fast_nms(multi_bboxes,
                  multi_scores,
                  score_thr,
                  iou_thr,
                  top_k,
                  max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_ (cupy port).

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.

    Args:
        multi_bboxes (cupy.ndarray): shape (n, #class*4) or (n, 4)
        multi_scores (cupy.ndarray): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold; bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold above which boxes are considered conflicting.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels), arrays of shape (k, 5) and (k,). Dets are boxes
            with scores. Labels are 0-based.
    """
    scores = cp.ascontiguousarray(cp.transpose(multi_scores[:, :-1], (1, 0)))  # [#class, n]
    idx = cp.argsort(-scores, axis=1)
    scores = cp.take_along_axis(scores, idx, axis=1)
    idx = cp.ascontiguousarray(idx[:, :top_k])
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.shape
    boxes = multi_bboxes[idx.reshape(-1), :].reshape(num_classes, num_dets, 4)

    iou = cupy_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou = cp.triu(iou, k=1)
    iou_max = cp.amax(iou, axis=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = cp.broadcast_to(cp.arange(num_classes)[:, None], keep.shape)
    classes = classes[keep]

    boxes = boxes[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    # scores, idx = scores.sort(0, descending=True)
    idx = cp.argsort(-scores, axis=0)
    scores = scores[idx]
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]

    cls_dets = cp.concatenate([boxes, scores[:, None]], axis=1)
    return cls_dets, classes
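

# Shape sketch (an illustration; the random inputs below are assumptions and the helper
# is left uncalled to keep the repro intact): cupy_fast_nms returns dets of shape (k, 5)
# and labels of shape (k,), where k <= #class * top_k.
def _demo_cupy_fast_nms_shapes():
    rng = np.random.RandomState(0)
    xy = rng.rand(50, 2).astype(np.float32) * 10
    wh = rng.rand(50, 2).astype(np.float32) * 5
    bboxes = cp.asarray(np.concatenate([xy, xy + wh], axis=1))  # (50, 4), x1y1x2y2
    scores = cp.asarray(rng.rand(50, 3).astype(np.float32))     # 2 classes + background
    dets, labels = cupy_fast_nms(bboxes, scores, score_thr=0.05, iou_thr=0.5, top_k=20)
    assert dets.shape[1] == 5 and labels.shape[0] == dets.shape[0]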


def torch_fast_nms(multi_bboxes,
                   multi_scores,
                   multi_coeffs,
                   score_thr,
                   iou_thr,
                   top_k,
                   max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
        score_thr (float): bbox threshold; bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold above which boxes are considered conflicting.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k, 1),
            and (k, coeffs_dim). Dets are boxes with scores.
            Labels are 0-based.
    """
    scores = multi_scores[:, :-1].t()  # [#class, n]
    scores, idx = scores.sort(1, descending=True)
    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.size()
    boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = torch_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou.triu_(diagonal=1)
    iou_max, _ = iou.max(dim=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = torch.arange(
        num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    coeffs = coeffs[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]
    coeffs = coeffs[idx]

    cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
    return cls_dets, classes, coeffs


num_class = 5
n = 1000
score_thr = 0.01
iou_thr = 0.01
top_k = 100

multi_bboxes = np.random.randn(n, 4).astype(np.float32)
multi_scores = np.random.randn(n, num_class + 1).astype(np.float32)
multi_coeffs = np.random.randn(n, num_class).astype(np.float32)

torch_multi_bboxes = torch.from_numpy(multi_bboxes).cuda()
torch_multi_scores = torch.from_numpy(multi_scores).cuda()
torch_multi_coeffs = torch.from_numpy(multi_coeffs).cuda()

# FIXME: when torch runs on CUDA first and cupy runs afterwards, cupy throws
# cudaErrorIllegalAddress
torch_cls_dets, torch_labels, _ = torch_fast_nms(
    torch_multi_bboxes, torch_multi_scores, torch_multi_coeffs, score_thr,
    iou_thr, top_k)

cp_multi_bboxes = cp.asarray(multi_bboxes)
cp_multi_scores = cp.asarray(multi_scores)

# FIXME: when the cupy fast nms runs first and the mmdet fast nms runs afterwards,
# pytest gets stuck
cls_dets, labels = cupy_fast_nms(cp_multi_bboxes, cp_multi_scores, score_thr,
                                 iou_thr, top_k)

assert cls_dets.ndim == 2
assert cls_dets.shape[1] == 5
assert labels.shape[0] == cls_dets.shape[0]
# the torch results live on the GPU, so move them to the CPU before comparing
assert np.allclose(cp.asnumpy(cls_dets), torch_cls_dets.cpu().numpy())
assert np.allclose(cp.asnumpy(labels), torch_labels.cpu().numpy())
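
# Debugging sketch (an assumption, not part of the original repro): CUDA errors such as
# cudaErrorIllegalAddress are reported asynchronously, so inserting explicit
# synchronization between the torch and cupy sections can help pinpoint which call
# actually faults. Uncomment to try:
#
# torch.cuda.synchronize()
# cp.cuda.Device().synchronize()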