# el-hult/cals2tiff.py

## cals2tiff.py
"""

Python script that takes a folder, and for each CALS Raster file, it converts to a TIFF file.

https://en.wikipedia.org/wiki/CALS_Raster_file_format

file ending ".cal"

It so happens that TIFF can be compressed with Group 4 compression (as in faxes), and that is the compression format of CALS Raster images Type 1.

If the image is rotated, one must fix that.
If it is a CALS Type 2 image, it is a tiled image and one must decompress each tile on its own, and I don't think that TIFF handles that.

In such a case, you need to do some more work. But I didn't need to think more about such problems.

Ludvig Hult
2021-06-13


"""
import os
import struct
import argparse


def parse_cals(data):
    """Split a CALS raster file into its parsed header and its image payload.

    The first 0x800 bytes are treated as sixteen 128-byte text records of the
    form ``name: value``; everything after that is returned untouched as the
    image data block.

    The quick data format description is from here
    http://support.ricoh.com/bb_v1oi/pub_e/oi_view/0001060/0001060558/view/rpgl_rtiff/int/0192.htm

    Written by Ludvig Hult 2021-06-13

    Args:
        data: The complete contents of the CALS file, as bytes.

    Returns:
        A ``(header, binary_data)`` tuple. ``header`` maps the names of the
        first ten records to their stripped string values (``None`` when the
        value is the literal ``NONE``), except that ``rtype`` and ``rdensty``
        are converted to int and ``rpelcnt`` and ``rorient`` to tuples of
        ints. The text found at offsets 0x507-0x7FF is stored under
        ``notes``. ``binary_data`` is ``data`` from offset 0x800 onwards.

    Raises:
        ValueError: If one of the first ten records has no ``:`` separator
            (for example truncated or non-CALS input), or a numeric field
            does not contain valid integers.
        KeyError: If ``rtype``, ``rdensty``, ``rpelcnt`` or ``rorient`` is
            not among the first ten records.
        TypeError: If one of those numeric records holds ``NONE``.
    """
    record_len = 128
    parsed_record_count = 10
    notes_start = 0x507  # record 11 starts at 0x500; skip its "notes: " prefix
    data_block_start = 0x800
    # The original 'ANSI' codec name is an alias of the Windows-only 'mbcs'
    # codec and raises LookupError on other platforms. Latin-1 exists
    # everywhere, agrees with it on ASCII header text, and never fails.
    encoding = 'latin-1'

    def noneify(value):
        return None if value == "NONE" else value

    header = {}
    for index in range(parsed_record_count):
        start = index * record_len
        record = data[start:start + record_len].decode(encoding)
        key, separator, value = record.partition(":")
        if not separator:
            raise ValueError(
                f"CALS header record {index} has no ':' separator"
            )
        header[key] = noneify(value.strip())

    header['notes'] = noneify(data[notes_start:data_block_start].decode(encoding).strip())
    header['rtype'] = int(header['rtype'])
    header['rdensty'] = int(header['rdensty'])
    header['rpelcnt'] = tuple(int(part) for part in header['rpelcnt'].split(","))
    header['rorient'] = tuple(int(part) for part in header['rorient'].split(","))
    return header, data[data_block_start:]


#####
# These functions are from https://shreevatsa.github.io/site/ccitt.html
#######

def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4, blackIsZero=False):
    """Build a little-endian TIFF header for a single strip of CCITT data.

    The result is the 8-byte TIFF file header followed by one IFD with eight
    entries. Appending ``img_size`` bytes of CCITT-compressed data directly
    after it gives a complete TIFF file, because StripOffsets is set to the
    length of this header.

    Args:
        width: Image width in pixels (ImageWidth).
        height: Image height in pixels (ImageLength and RowsPerStrip).
        img_size: Number of compressed bytes that will follow the header
            (StripByteCounts).
        CCITT_group: Value written to the Compression tag; 4 is CCITT
            Group 4 fax encoding.
        blackIsZero: Written to PhotometricInterpretation as 1 (BlackIsZero)
            when true, 0 (WhiteIsZero) otherwise.

    Returns:
        The packed header as bytes.

    Raises:
        struct.error: If a value does not fit its unsigned TIFF field.
    """
    short, long_ = 3, 4  # TIFF field type codes
    # IFD entries must be sorted by ascending tag number.
    entries = (
        (256, long_, 1, width),              # ImageWidth
        (257, long_, 1, height),             # ImageLength
        (258, short, 1, 1),                  # BitsPerSample
        (259, short, 1, CCITT_group),        # Compression
        (262, short, 1, int(blackIsZero)),   # PhotometricInterpretation
        (273, long_, 1, None),               # StripOffsets, filled in below
        (278, long_, 1, height),             # RowsPerStrip
        (279, long_, 1, img_size),           # StripByteCounts
    )
    # TIFF SHORT and LONG are unsigned, hence H/L rather than h/l. The
    # trailing next-IFD offset is a 4-byte LONG; packing it as 2 bytes would
    # let readers interpret the first two image bytes as part of the offset.
    layout = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * len(entries) + 'L'
    header_len = struct.calcsize(layout)

    fields = []
    for tag, field_type, count, value in entries:
        fields.extend((tag, field_type, count, header_len if value is None else value))

    return struct.pack(
        layout,
        b'II',         # byte order: little-endian
        42,            # TIFF magic number
        8,             # offset of the first IFD
        len(entries),  # number of entries in the IFD
        *fields,
        0,             # offset of the next IFD: none
    )

def decode_ccitt_data(data, width, height, CCITT_group=4, blackIsZero=False):
    """Wrap raw CCITT-compressed data in a TIFF container.

    Despite the name, nothing is decompressed here: a TIFF header describing
    the data is prepended so that any TIFF reader can decode it.

    Args:
        data: The CCITT-compressed image bytes.
        width: Image width in pixels.
        height: Image height in pixels.
        CCITT_group: Compression tag value passed to the header (4 = Group 4).
        blackIsZero: Photometric interpretation passed to the header. This
            flag used to be accepted but silently dropped, so the output was
            always WhiteIsZero; it is now forwarded.

    Returns:
        Bytes of a complete TIFF file: header followed by ``data``.
    """
    tiff_header = tiff_header_for_CCITT(width, height, len(data), CCITT_group, blackIsZero)
    return tiff_header + data

###################################################################################################

def main(argv=None):
    """Convert every ``.cal`` file in ``--indir`` to a ``.tiff`` in ``--outdir``.

    The input directory is not searched recursively and the output directory
    must already exist. Only CALS Type 1 images with orientation ``(0, 270)``
    are handled; the first file that is anything else aborts the run.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv``.

    Raises:
        ValueError: If a file is not CALS Type 1 or does not have the
            supported orientation.
    """
    suffix = ".cal"

    p = argparse.ArgumentParser()
    p.add_argument("--indir",help="input dir (N.B. does not recurse)",required=True)
    p.add_argument("--outdir",help="output dir (must exist before hand)",required=True)
    args = p.parse_args(argv)

    for fname in os.listdir(args.indir):
        if not fname.endswith(suffix):
            continue
        base = fname[:-len(suffix)]
        in_path = os.path.join(args.indir, fname)
        out_path = os.path.join(args.outdir, f"{base}.tiff")

        with open(in_path, 'rb') as cals_file:
            header, data = parse_cals(cals_file.read())

        # Explicit checks rather than ``assert``: asserts are stripped under
        # ``python -O``, which would let unsupported files through.
        if header['rtype'] != 1:
            raise ValueError(f"{in_path}: I only deal with type 1")
        if header['rorient'] != (0, 270):
            raise ValueError(f"{in_path}: I only deal with simple orientation")
        width, height = header['rpelcnt']

        # WhiteIsZero (blackIsZero=False) is the usual polarity for CCITT
        # data and is what this script's output has always carried in
        # practice, so request it explicitly.
        with open(out_path, 'wb') as tiff_file:
            tiff_file.write(decode_ccitt_data(data, width, height, CCITT_group=4, blackIsZero=False))


# Run the batch conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	"""

	Python script that takes a folder, and for each CALS Raster file, it converts to a TIFF file.

	https://en.wikipedia.org/wiki/CALS_Raster_file_format

	file ending ".cal"

	It so happens that TIFF can be compressed with Group 4 compression (as in faxes), and that is the compression format of CALS Raster images Type 1.

	If the image is rotated, one must fix that.
	If it is a CALS Type 2 image, it is a tiled image and one must decompress each tile on its own, and I don't think that TIFF handles that.

	In such a case, you need to do some more work. But I didn't need to think more about such problems.

	Ludvig Hult
	2021-06-13


	"""
	import os
	import struct
	import argparse



	def parse_cals(data):
	    """Split a CALS raster file into its parsed header and its image payload.

	    The first 0x800 bytes are treated as sixteen 128-byte text records of the
	    form ``name: value``; everything after that is returned untouched as the
	    image data block.

	    The quick data format description is from here
	    http://support.ricoh.com/bb_v1oi/pub_e/oi_view/0001060/0001060558/view/rpgl_rtiff/int/0192.htm

	    Written by Ludvig Hult 2021-06-13

	    Args:
	        data: The complete contents of the CALS file, as bytes.

	    Returns:
	        A ``(header, binary_data)`` tuple. ``header`` maps the names of the
	        first ten records to their stripped string values (``None`` when the
	        value is the literal ``NONE``), except that ``rtype`` and ``rdensty``
	        are converted to int and ``rpelcnt`` and ``rorient`` to tuples of
	        ints. The text found at offsets 0x507-0x7FF is stored under
	        ``notes``. ``binary_data`` is ``data`` from offset 0x800 onwards.

	    Raises:
	        ValueError: If one of the first ten records has no ``:`` separator
	            (for example truncated or non-CALS input), or a numeric field
	            does not contain valid integers.
	        KeyError: If ``rtype``, ``rdensty``, ``rpelcnt`` or ``rorient`` is
	            not among the first ten records.
	        TypeError: If one of those numeric records holds ``NONE``.
	    """
	    record_len = 128
	    parsed_record_count = 10
	    notes_start = 0x507  # record 11 starts at 0x500; skip its "notes: " prefix
	    data_block_start = 0x800
	    # The original 'ANSI' codec name is an alias of the Windows-only 'mbcs'
	    # codec and raises LookupError on other platforms. Latin-1 exists
	    # everywhere, agrees with it on ASCII header text, and never fails.
	    encoding = 'latin-1'

	    def noneify(value):
	        return None if value == "NONE" else value

	    header = {}
	    for index in range(parsed_record_count):
	        start = index * record_len
	        record = data[start:start + record_len].decode(encoding)
	        key, separator, value = record.partition(":")
	        if not separator:
	            raise ValueError(
	                f"CALS header record {index} has no ':' separator"
	            )
	        header[key] = noneify(value.strip())

	    header['notes'] = noneify(data[notes_start:data_block_start].decode(encoding).strip())
	    header['rtype'] = int(header['rtype'])
	    header['rdensty'] = int(header['rdensty'])
	    header['rpelcnt'] = tuple(int(part) for part in header['rpelcnt'].split(","))
	    header['rorient'] = tuple(int(part) for part in header['rorient'].split(","))
	    return header, data[data_block_start:]


	#####
	# These functions are from https://shreevatsa.github.io/site/ccitt.html
	#######

	def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4, blackIsZero=False):
	    """Build a little-endian TIFF header for a single strip of CCITT data.

	    The result is the 8-byte TIFF file header followed by one IFD with eight
	    entries. Appending ``img_size`` bytes of CCITT-compressed data directly
	    after it gives a complete TIFF file, because StripOffsets is set to the
	    length of this header.

	    Args:
	        width: Image width in pixels (ImageWidth).
	        height: Image height in pixels (ImageLength and RowsPerStrip).
	        img_size: Number of compressed bytes that will follow the header
	            (StripByteCounts).
	        CCITT_group: Value written to the Compression tag; 4 is CCITT
	            Group 4 fax encoding.
	        blackIsZero: Written to PhotometricInterpretation as 1 (BlackIsZero)
	            when true, 0 (WhiteIsZero) otherwise.

	    Returns:
	        The packed header as bytes.

	    Raises:
	        struct.error: If a value does not fit its unsigned TIFF field.
	    """
	    short, long_ = 3, 4  # TIFF field type codes
	    # IFD entries must be sorted by ascending tag number.
	    entries = (
	        (256, long_, 1, width),              # ImageWidth
	        (257, long_, 1, height),             # ImageLength
	        (258, short, 1, 1),                  # BitsPerSample
	        (259, short, 1, CCITT_group),        # Compression
	        (262, short, 1, int(blackIsZero)),   # PhotometricInterpretation
	        (273, long_, 1, None),               # StripOffsets, filled in below
	        (278, long_, 1, height),             # RowsPerStrip
	        (279, long_, 1, img_size),           # StripByteCounts
	    )
	    # TIFF SHORT and LONG are unsigned, hence H/L rather than h/l. The
	    # trailing next-IFD offset is a 4-byte LONG; packing it as 2 bytes would
	    # let readers interpret the first two image bytes as part of the offset.
	    layout = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * len(entries) + 'L'
	    header_len = struct.calcsize(layout)

	    fields = []
	    for tag, field_type, count, value in entries:
	        fields.extend((tag, field_type, count, header_len if value is None else value))

	    return struct.pack(
	        layout,
	        b'II',         # byte order: little-endian
	        42,            # TIFF magic number
	        8,             # offset of the first IFD
	        len(entries),  # number of entries in the IFD
	        *fields,
	        0,             # offset of the next IFD: none
	    )

	def decode_ccitt_data(data, width, height, CCITT_group=4, blackIsZero=False):
	    """Wrap raw CCITT-compressed data in a TIFF container.

	    Despite the name, nothing is decompressed here: a TIFF header describing
	    the data is prepended so that any TIFF reader can decode it.

	    Args:
	        data: The CCITT-compressed image bytes.
	        width: Image width in pixels.
	        height: Image height in pixels.
	        CCITT_group: Compression tag value passed to the header (4 = Group 4).
	        blackIsZero: Photometric interpretation passed to the header. This
	            flag used to be accepted but silently dropped, so the output was
	            always WhiteIsZero; it is now forwarded.

	    Returns:
	        Bytes of a complete TIFF file: header followed by ``data``.
	    """
	    tiff_header = tiff_header_for_CCITT(width, height, len(data), CCITT_group, blackIsZero)
	    return tiff_header + data

	###################################################################################################

	def main(argv=None):
	    """Convert every ``.cal`` file in ``--indir`` to a ``.tiff`` in ``--outdir``.

	    The input directory is not searched recursively and the output directory
	    must already exist. Only CALS Type 1 images with orientation ``(0, 270)``
	    are handled; the first file that is anything else aborts the run.

	    Args:
	        argv: Optional list of command-line arguments. ``None`` (the default)
	            makes argparse read ``sys.argv``.

	    Raises:
	        ValueError: If a file is not CALS Type 1 or does not have the
	            supported orientation.
	    """
	    suffix = ".cal"

	    p = argparse.ArgumentParser()
	    p.add_argument("--indir",help="input dir (N.B. does not recurse)",required=True)
	    p.add_argument("--outdir",help="output dir (must exist before hand)",required=True)
	    args = p.parse_args(argv)

	    for fname in os.listdir(args.indir):
	        if not fname.endswith(suffix):
	            continue
	        base = fname[:-len(suffix)]
	        in_path = os.path.join(args.indir, fname)
	        out_path = os.path.join(args.outdir, f"{base}.tiff")

	        with open(in_path, 'rb') as cals_file:
	            header, data = parse_cals(cals_file.read())

	        # Explicit checks rather than ``assert``: asserts are stripped under
	        # ``python -O``, which would let unsupported files through.
	        if header['rtype'] != 1:
	            raise ValueError(f"{in_path}: I only deal with type 1")
	        if header['rorient'] != (0, 270):
	            raise ValueError(f"{in_path}: I only deal with simple orientation")
	        width, height = header['rpelcnt']

	        # WhiteIsZero (blackIsZero=False) is the usual polarity for CCITT
	        # data and is what this script's output has always carried in
	        # practice, so request it explicitly.
	        with open(out_path, 'wb') as tiff_file:
	            tiff_file.write(decode_ccitt_data(data, width, height, CCITT_group=4, blackIsZero=False))


	# Run the batch conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()