Last active
November 29, 2022 14:51
-
-
Save howff/bcb104a3486fdc8a2dd6c2134ad4a0f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Redact a DICOM image to deidentify it: | |
# * can remove the overlay planes stored in the high bits of the image data | |
# * can redact rectangles from any of the image frames or overlay frames | |
# TODO: change all errors to raise exceptions | |
# NOTE: | |
# overlays may be smaller than their images. Rectangle coordinates | |
# are within the overlay, not relative to the original image, so | |
# if you want to use image coordinates you'll need to subtract | |
# the overlay origin coordinate. | |
import logging | |
import numpy as np | |
import os | |
import pydicom | |
from pydicom.pixel_data_handlers.numpy_handler import pack_bits | |
import sys | |
logger = logging.getLogger(__name__) | |
logging.basicConfig(level=logging.DEBUG) | |
# Need examples of: | |
# high-bit overlays (and multiple of them) | |
# separate overlays (and multiple of them) | |
# multiple frames in separate overlays | |
# multiple frames of images | |
# mono, inverted mono, signed ints, RGB, Palette, etc | |
# various compression | |
filename = 'XA_GE_JPEG_02_with_Overlays.dcm' # has high-bit overlays | |
filename = 'MR-SIEMENS-DICOM-WithOverlays.dcm' # has separate overlays | |
elem_OverlayBitPosition = 0x0102 | |
elem_OverlayData = 0x3000 | |
elem_OverlayRows = 0x0010 | |
elem_OverlayCols = 0x0011 | |
elem_OverlayNumFrames = 0x0015 | |
elem_OverlayOrigin = 0x0050 | |
def mark_as_uncompressed(ds): | |
""" Call this after you've uncompressed the PixelData | |
and written it back into the dataset using tobytes() | |
""" | |
if sys.byteorder == 'little': | |
ds.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRLittleEndian | |
else: | |
ds.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRBigEndian | |
def overlay_tag_group_from_index(overlay): | |
""" Convert an overlay index 0..15 into a DICOM tag group | |
They start at 0x6000 and go up in twos. | |
""" | |
return 0x6000 + 2 * overlay | |
def overlay_bit_position(ds, overlay): | |
""" Return the bit index of the overlay, if it's hidden | |
inside the high bits of the image data, or 0 if it's a | |
standalone overlay, i.e. given overlay number N, | |
returns the actual number of the bit inside the byte | |
for that overlay, e.g. overlay 0 is bit 8. | |
""" | |
overlay_group_num = overlay_tag_group_from_index(overlay) | |
if [overlay_group_num, elem_OverlayBitPosition] in ds: | |
overlay_bit = ds[overlay_group_num, elem_OverlayBitPosition].value | |
# Sometimes it is prevent but value is None! | |
if not overlay_bit: | |
overlay_bit = 0 | |
# Bit position must be >0 for a high-bit overlay | |
if overlay_bit > 0: | |
logger.debug('Found overlay %d in high-bit %d' % (overlay, overlay_bit)) | |
return overlay_bit | |
return 0 | |
def remove_overlays_in_high_bits(ds): | |
""" Mask off the high-bits of all the pixel values | |
just in case there are overlays hidden in there | |
which we want to remove (simply removing the 60xx tags | |
does not actually remove the overlay pixels). | |
""" | |
# Sanity check that image has pixels | |
if not 'PixelData' in ds: | |
logger.error('no pixel data present') | |
return | |
# bits_allocated is the physical space used, 1 or a multiple of 8. | |
# bits_stored is the number of meaningful bits within those allocated. | |
bits_stored = ds['BitsStored'].value if 'BitsStored' in ds else -1 | |
bits_allocated = ds['BitsAllocated'].value if 'BitsAllocated' in ds else -1 | |
bit_mask = (~((~0) << bits_stored)) | |
samples = ds['SamplesPerPixel'].value if 'SamplesPerPixel' in ds else -1 | |
photometric = ds['PhotometricInterpretation'].value if 'PhotometricInterpretation' in ds else 'MONOCHROME2' | |
# This code calculates a bit mask from the actual overlays in use. | |
# It is not used. | |
# Instead we simply mask off everything outside the bits_stored bits. | |
overlay_bitmask = 0 | |
for overlay_num in range(16): | |
overlay_bit = overlay_bit_position(ds, overlay_num) | |
if overlay_bit > 0: | |
logger.debug('found overlay %d in high-bit %d' % (overlay_num, overlay_bit)) | |
overlay_bitmask |= (1 << overlay_bit) | |
logger.debug('bits_stored = %d (image bits used)' % bits_stored) | |
logger.debug('bits_allocated = %d (physical space needed)' % bits_allocated) | |
logger.debug('bit_mask = %x (use & to get only image data)' % bit_mask) | |
logger.debug('overlay_bitmask = %x (for overlays in use)' % overlay_bitmask) | |
logger.debug('samples = %d' % samples) | |
# Can only handle greyscale or palette images | |
# XXX would an overlay every be present in an RGB image? Doesn't make sense? | |
if photometric not in ['MONOCHROME1', 'MONOCHROME2', 'PALETTE COLOR']: | |
logger.error('cannot remove overlays from %s' % photometric) | |
return | |
# Can only handle 1 sample per pixel | |
# XXX would an overlay be present if multiple samples per pixel? Doesn't make sense? | |
if samples > 1: | |
logger.error('cannot remove overlays from %d samples per pixel' % samples) | |
return | |
pixel_data = ds.pixel_array # this can raise an exception in some files | |
logger.debug('ndim = %d' % pixel_data.ndim) | |
# Use numpy to mask the bits, handles both 8 and 16 bits per pixel. | |
masked = (pixel_data & bit_mask) | |
# Could also do this? | |
#if pixel_data.ndim == 3: | |
# masked = (pixel_data[frame,:,:] & bit_mask) | |
#else: | |
# masked = (pixel_data[frame,:,:,:] & bit_mask) | |
ds.PixelData = masked.tobytes() | |
# XXX does not re-compress | |
mark_as_uncompressed(ds) | |
# No need to handle ICC Profile, Colour Space, Palette Lookup Table, etc | |
# No need to handle 2's complement Pixel Representation. | |
return | |
def redact_rectangles_from_high_bit_overlay(ds, overlay=0, rect_list=[]): | |
""" | |
""" | |
# bits_allocated is the physical space used, 1 or a multiple of 8. | |
# bits_stored is the number of meaningful bits within those allocated. | |
bits_stored = ds['BitsStored'].value if 'BitsStored' in ds else -1 | |
bits_allocated = ds['BitsAllocated'].value if 'BitsAllocated' in ds else -1 | |
overlay_bit = overlay_bit_position(ds, overlay) | |
bit_mask = ~(1 << overlay_bit) | |
samples = ds['SamplesPerPixel'].value if 'SamplesPerPixel' in ds else -1 | |
photometric = ds['PhotometricInterpretation'].value if 'PhotometricInterpretation' in ds else 'MONOCHROME2' | |
# Can only handle greyscale or palette images | |
# XXX would an overlay every be present in an RGB image? Doesn't make sense? | |
if photometric not in ['MONOCHROME1', 'MONOCHROME2', 'PALETTE COLOR']: | |
logger.error('cannot remove overlays from %s' % photometric) | |
return | |
# Can only handle 1 sample per pixel | |
# XXX would an overlay be present if multiple samples per pixel? Doesn't make sense? | |
if samples > 1: | |
logger.error('cannot remove overlays from %d samples per pixel' % samples) | |
return | |
pixel_data = ds.pixel_array # this can raise an exception in some files | |
#logger.debug('ndim = %d' % pixel_data.ndim) | |
# Use numpy to mask the bits, handles both 8 and 16 bits per pixel. | |
# Can't simply &=bit_mask if dtype differs so use 1-elem array. | |
bit_mask_arr = np.array([bit_mask], dtype=pixel_data.dtype) | |
for rect in rect_list: | |
x0, y0, w, h = rect | |
x1 = x0 + w | |
y1 = y0 + h | |
pixel_data[y0:y1, x0:x1] &= bit_mask_arr | |
ds.PixelData = pixel_data.tobytes() | |
# XXX does not re-compress | |
mark_as_uncompressed(ds) | |
return | |
def redact_rectangles_from_image_frame(ds, frame=0, rect_list=[]): | |
""" Redact a list of rectangles from a specific image frame | |
counting from zero. | |
""" | |
samples = ds['SamplesPerPixel'].value if 'SamplesPerPixel' in ds else -1 | |
photometric = ds['PhotometricInterpretation'].value if 'PhotometricInterpretation' in ds else 'MONOCHROME2' | |
num_frames = ds['NumberOfFrames'].value if 'NumberOfFrames' in ds else 1 | |
bits_stored = ds['BitsStored'].value if 'BitsStored' in ds else -1 | |
bit_mask = ((~0) << bits_stored) | |
if frame >= num_frames: | |
logger.error('cannot redact frame %d, max is %d' % (frame, num_frames-1)) | |
return | |
pixel_data = ds.pixel_array # this can raise an exception in some files | |
#logger.debug('ndim = %d (should be the same as samples)' % pixel_data.ndim) | |
# Use numpy to mask the bits, handles both 8 and 16 bits per pixel. | |
# Can't simply &=bit_mask if dtype differs so use 1-elem array. | |
bit_mask_arr = np.array([bit_mask], dtype=pixel_data.dtype) | |
for rect in rect_list: | |
x0, y0, w, h = rect | |
x1 = x0 + w | |
y1 = y0 + h | |
if pixel_data.ndim == 2: | |
pixel_data[y0:y1, x0:x1] &= bit_mask_arr | |
elif pixel_data.ndim == 3: | |
pixel_data[frame, y0:y1, x0:x1] &= bit_mask_arr | |
elif pixel_data.ndim == 4: | |
pixel_data[frame, :, y0:y1, x0:x1] &= bit_mask_arr | |
ds.PixelData = pixel_data.tobytes() | |
# XXX does not re-compress | |
mark_as_uncompressed(ds) | |
return | |
def redact_rectangles_from_overlay_frame(ds, frame=0, overlay=0, rect_list=[]): | |
""" Redact a list of rectangles from a specific frame of an overlay. | |
Frame and overlay count from zero. | |
""" | |
redacted_colour = 0 | |
overlay_group_num = overlay_tag_group_from_index(overlay) | |
if not [overlay_group_num, elem_OverlayData] in ds: | |
logger.error('no overlay data found for overlay %d' % overlay) | |
return | |
pixel_data = ds.overlay_array(overlay_group_num) | |
for rect in rect_list: | |
x0, y0, w, h = rect | |
x1 = x0 + w | |
y1 = y0 + h | |
if pixel_data.ndim == 2: | |
pixel_data[y0:y1, x0:x1] = redacted_colour | |
else: | |
pixel_data[frame, y0:y1, x0:x1] = redacted_colour | |
packed_bytes = pack_bits(pixel_data) | |
ds[overlay_group_num, elem_OverlayData].value = packed_bytes | |
# XXX not sure if there's a transfer syntax for overlaydata | |
return | |
def redact_rectangles(ds, frame=-1, overlay=-1, rect_list=[]): | |
""" Redact a list of rectangles from: | |
* the given image frame (if overlay == -1) | |
* or the given overlay (if frame == -1) | |
* or the given frame of the given overlay (if both >= 0) | |
* The rectangles are (x,y,width,height) where | |
x,y are from 0,0 top-left | |
""" | |
if not rect_list: | |
logger.debug('no rectangles to redact') | |
return | |
if not 'PixelData' in ds: | |
logger.error('no pixel data present') | |
return | |
if overlay == -1: | |
return redact_rectangles_from_image_frame(ds, frame, rect_list) | |
if overlay < 0 or overlay > 15: | |
logger.error('invalid overlay requested %d' % overlay) | |
return | |
if overlay_bit_position(ds, overlay) > 0: | |
if frame > 0: | |
logger.error('cannot specify an overlay frame %d when overlay %d is in image high bits' % (frame, overlay)) | |
return | |
return redact_rectangles_from_high_bit_overlay(ds, overlay, rect_list) | |
else: | |
return redact_rectangles_from_overlay_frame(ds, frame, overlay, rect_list) | |
return | |
if __name__ == '__main__': | |
if len(sys.argv)>1: | |
filename = sys.argv[1] | |
ds = pydicom.dcmread(filename) | |
remove_overlays_in_high_bits(ds) | |
ds.save_as(filename + ".nooverlays.dcm") | |
ds = pydicom.dcmread(filename) | |
#redact_rectangles(ds, frame=0, overlay=-1, rect_list=[(10,10, 20,10)]) | |
#redact_rectangles(ds, frame=1, overlay=-1, rect_list=[(20,20, 20,10)]) | |
redact_rectangles(ds, frame=0, overlay=-1, rect_list=[(200,200, 20,10)]) | |
redact_rectangles(ds, frame=1, overlay=-1, rect_list=[(220,220, 20,10)]) | |
ds.save_as(filename + ".redactedframes.dcm") | |
ds = pydicom.dcmread(filename) | |
redact_rectangles(ds, overlay=0, frame=0, rect_list=[(10,10, 20,20), (30,30, 20,10)]) | |
redact_rectangles(ds, overlay=1, frame=0, rect_list=[(10,10, 20,20), (40,40, 20,10)]) | |
redact_rectangles(ds, overlay=2, frame=0, rect_list=[(10,10, 20,20), (50,50, 20,10)]) | |
redact_rectangles(ds, overlay=3, frame=0, rect_list=[(10,10, 20,20), (60,60, 20,10)]) | |
redact_rectangles(ds, overlay=4, frame=0, rect_list=[(10,10, 20,20), (70,70, 20,10)]) | |
redact_rectangles(ds, overlay=5, frame=0, rect_list=[(10,10, 20,20), (80,80, 20,10)]) | |
redact_rectangles(ds, overlay=6, frame=0, rect_list=[(10,10, 20,20), (90,90, 20,10)]) | |
redact_rectangles(ds, overlay=7, frame=0, rect_list=[(10,10, 20,20), (100,100, 20,10)]) | |
redact_rectangles(ds, overlay=8, frame=0, rect_list=[(10,10, 20,20), (110,110, 20,10)]) | |
redact_rectangles(ds, overlay=9, frame=0, rect_list=[(10,10, 20,20), (120,120, 20,10)]) | |
ds.save_as(filename + ".redactedoverlays.dcm") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment