dicoms2df.py
"""
read dicom header and cache
url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900
"""
from diskcache import Cache, JSONDisk
import pandas as pd
from tqdm.contrib.concurrent import thread_map, process_map
import logging
from pathlib import Path
from functools import partial
from . dcm2json import header2json
cache_dir = Path(f'data/.cache')
cache = Cache(cache_dir/'dcm2dict.jsdb', disk=JSONDisk, disk_compress_level=6)
@cache.memoize()
def header2json_cache(f:str, px_summ=False):
header = header2json(f)
return header
def h2j(f, px_summ=False):
try:
return header2json_cache(f, px_summ=px_summ)
except Exception as e:
logger.error(f'error header2json on {f}: {e}')
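# Cache-maintenance note (not in the original gist): diskcache's memoize()
# exposes a `__cache_key__` helper on the wrapped function, so a single stale
# entry can be evicted without clearing the whole cache. A minimal sketch,
# assuming the same call signature used in h2j above:
#
#     key = header2json_cache.__cache_key__('path/to/scan.dcm', px_summ=False)
#     if key in cache:
#         del cache[key]
#     # or start over entirely:
#     # cache.clear()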
def dicoms2df(files: list, px_summ=False, n_workers=None) -> pd.DataFrame:
    """
    Convert a list of dicom files to a dataframe of headers.

    Usage:
        data_dir = Path('../data/raw/ProstateX/dicom/')
        fs = [str(p) for p in data_dir.glob('**/*.dcm')]
        df_dicom = dicoms2df(fs, n_workers=4)
        df_dicom

    Example output:

    |      | SpecificCharacterSet | ImageType | ImageType1 |...| Modality | Manufacturer | SeriesDescription             | st_mtime   |
    |-----:|:---------------------|:----------|:-----------|...|:---------|:-------------|:------------------------------|-----------:|
    |  202 | ISO_IR 100           | ORIGINAL  | PRIMARY    |...| MR       | SIEMENS      | tfl_3d dynamisch fast         | 1.6214e+09 |
    | 2494 | ISO_IR 100           | DERIVED   | PRIMARY    |...| MR       | SIEMENS      | ep2d_diff_tra_DYNDIST_MIX_ADC | 1.6214e+09 |
    | 1568 | ISO_IR 100           | ORIGINAL  | PRIMARY    |...| MR       | SIEMENS      | tfl_3d dynamisch fast         | 1.6214e+09 |
    """
    if n_workers == 0:
        # synchronous, easier to debug
        headers = [h2j(f, px_summ=px_summ) for f in tqdm(files)]
    else:
        headers = thread_map(partial(h2j, px_summ=px_summ), files, max_workers=n_workers)
    df_dicom = pd.DataFrame.from_records(headers)
    df_dicom['Filename'] = files
    return df_dicom
# and make a version cached to csv.gz
import hashlib
import pickle


def md5hash(s: bytes) -> str:
    return hashlib.md5(s).hexdigest()


def _pd_cache(func):
    """Cache a dataframe-returning function to a csv.gz keyed by its arguments."""
    def cache_wrapper(*args, **kw):
        # the file name contains a hash of the function's args and kwargs
        key = pickle.dumps(args, 1) + pickle.dumps(kw, 1)
        hsh = md5hash(key)[:6]
        f = cache_dir / f"{func.__name__}_{hsh}.csv.gz"
        if not f.exists():
            df = func(*args, **kw)
            logging.debug(f'writing to {f}')
            df.to_csv(f, index=False)
        return pd.read_csv(f)
    return cache_wrapper


# cached version
dicoms2df_cached = _pd_cache(dicoms2df)
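# Usage sketch (paths illustrative, not part of the gist):
#
#     fs = sorted(str(p) for p in Path('data/raw').glob('**/*.dcm'))
#     df = dicoms2df_cached(fs, n_workers=4)  # first call scans and writes data/.cache/dicoms2df_<hash>.csv.gz
#     df = dicoms2df_cached(fs, n_workers=4)  # same args, same hash: reads the csv back instead
#
# Note the round trip through csv means dtypes can differ from the uncached
# dataframe (e.g. dates and mixed columns come back as strings).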
"""
modified from https://github.com/fastai/fastai/blob/dded785401/fastai/medical/imaging.py#L363
url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900
to fail with logging then continue
and to only read header (much faster)
"""
import pydicom
from pydicom.dataset import Dataset as DcmDataset
from pydicom.tag import BaseTag as DcmTag
from pydicom.multival import MultiValue as DcmMultiValue
import scipy.stats
import logging
import json
import pandas as pd
from pathlib import Path
log = logging.getLogger('dicomheader')
cache_dir = Path(f'.cache')
# log.setLevel(logging.DEBUG)
def _split_elem(k, v):
    """Split a dicom MultiValue into separate key/value pairs."""
    if not isinstance(v, DcmMultiValue):
        yield k, v
    else:
        for i, o in enumerate(v):
            # the first element keeps the bare keyword, later ones get a numeric suffix
            k2 = f'{k}{"" if i == 0 else i}'
            yield k2, o
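# For example (a sketch): a multi-valued tag like ImageType
# ['ORIGINAL', 'PRIMARY'] is flattened into two columns, which is where the
# ImageType/ImageType1 headers in dicoms2df's example output come from:
#
#     >>> list(_split_elem('ImageType', DcmMultiValue(str, ['ORIGINAL', 'PRIMARY'])))
#     [('ImageType', 'ORIGINAL'), ('ImageType1', 'PRIMARY')]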
def _cast_dicom_special(x):
    """Attempt to convert pydicom value types to python primitives."""
    cls = type(x)
    if not cls.__module__.startswith('pydicom'): return x
    if cls.__base__ == object: return x
    if hasattr(x, '__len__') and len(x) == 0: return []
    try:
        # sometimes this needs extra args
        return cls.__base__(x)
    except Exception as e:
        log.debug(f"Error while processing tag x={x}, e={e}")
        return str(cls)
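# For instance (exact types depend on the pydicom version): decimal strings
# are wrapped as pydicom.valuerep.DSfloat, whose base class is float, so
#
#     >>> from pydicom.valuerep import DSfloat
#     >>> _cast_dicom_special(DSfloat('1.5'))
#     1.5
#
# while types that subclass object directly, or fail to cast, fall through.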
def as_dict(self: DcmDataset, px_summ: bool = False) -> dict:
    "Convert the header of a dicom into a dictionary"
    pxdata = (0x7FE0, 0x0010)  # the PixelData tag, skipped below
    res = {}
    for k in self.keys():
        if k != pxdata:
            try:
                tag = self[k]
                value = tag.value
                keyword = tag.keyword
                for keyword, value in _split_elem(keyword, value):
                    value = _cast_dicom_special(value)
                    # if it's not json serialisable, fall back to the string representation
                    try:
                        pd.io.json.dumps([keyword, value])
                    except Exception:
                        value = str(value)
                    # crop huge tags, usually private or unknown ones
                    if len(str(value)) > 1000:
                        value = str(value)[:1000] + '...'
                    res[keyword] = value
            except Exception as e:
                log.debug(f"Error while processing tag k={k}, file={self.filename}, e={e}")
    if px_summ:
        # extract a range of pixel stats
        # TODO use pmin, p05, p25, mode, p75, p95, pmax = np.quantile(x.flatten(), [0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
        stats = {
            'min': np.min,
            'max': np.max,
            'mean': np.mean,
            'std': np.std,
            'dtype': lambda x: str(x.dtype),
            'mode_bin': lambda x: np.bincount(x.flatten()).argmax(),  # most frequent value
            'mode': lambda x: scipy.stats.mode(x.flatten()).mode[0],
            'median': np.median,
            'count': lambda x: np.prod(x.shape),
            'shape_0': lambda x: x.shape[0],
            'shape_1': lambda x: x.shape[1],
            'skew': lambda x: scipy.stats.skew(x.flatten()),
            'p0.05': lambda x: np.quantile(x.flatten(), 0.05),
            'p0.25': lambda x: np.quantile(x.flatten(), 0.25),
            'p0.5': lambda x: np.quantile(x.flatten(), 0.5),
            'p0.75': lambda x: np.quantile(x.flatten(), 0.75),
            'p0.95': lambda x: np.quantile(x.flatten(), 0.95),
        }
        try:
            pxs = self.pixel_array
            for n, f in stats.items():
                res['img_' + n] = f(pxs)
        except Exception as e:
            # pixel data couldn't be read: fill with zeros so columns stay consistent
            for n in stats:
                res['img_' + n] = 0
            log.debug(f"Error while processing pixel data {self.filename}, e={e}")
    res = json.loads(pd.io.json.dumps(res))  # round-trip to json primitives
    return res
def header2json(f: str, px_summ=False, defer_size='64 KB', force=True):
    """Read one dicom header (and optionally pixel stats) into a flat dict."""
    dcm = pydicom.dcmread(f, stop_before_pixels=not px_summ, force=force, defer_size=defer_size)
    header = as_dict(dcm, px_summ=px_summ)
    header['path'] = str(f)
    s = Path(f).stat()
    header['st_size'] = s.st_size
    header['st_ctime'] = s.st_ctime
    header['st_mtime'] = s.st_mtime
    return header
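
# Minimal smoke test; the path below is a placeholder, not part of the gist.
if __name__ == '__main__':
    h = header2json('data/raw/example.dcm', px_summ=True)
    print(h.get('Modality'), h['st_size'], h.get('img_mean'))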