seeb0h/ preprocess_sd19_text.py

## preprocess_sd19_text.py
#! /usr/bin/python
# -- coding: utf-8 --

# Naming-style checks for method / function names etc. are ignored
# pylint: disable=C0103

'''
Read and pre-process SD19 characters text file.
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/

Characters in txt file are in 128x128 images with much padded zeros.
It may be suitable for learning to have smaller, deskewed, trimmed, squared ones

Following preprocessing is applied to the dataset:
 - Read glyph (see read_glyph())
 - Moment-based image deskew (see deskew())
 - Trim zeros rows and columns (see trim_padding())
 - Resize image while keeping aspect ratio (see resize_with_constant_ratio())
 - Pad zeros in order to get a square image (see pad_digit())

Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/

Usage:
   preprocess_sd19_text.py
'''
#

import os
import re
import sys
import pickle
import cv2
import numpy as np
import math

def read_glyph(_line):
    """Parse one line of the SD19 text file into an image and its label.

    A line is expected to look like ``<label> <pixels>``: one decimal digit
    character, a single space, then a run of digit characters, one per pixel
    (the file name suggests 0/1 values; this is not enforced here).

    Parameters
    ----------
    _line : string
        current line in SD19 text file

    Returns
    -------
    digit : np.array
        2D float32 array of shape (128, 128); each pixel value ``v`` read
        from the line is stored as ``1 - v``
    label : int
        the digit class, 0 to 9

    Raises
    ------
    ValueError
        if the line does not have the ``<label> <pixels>`` layout, if the
        label is not a single decimal digit, or if the number of pixels is
        not 128*128
    """
    match = re.search(r"^(\S+) (\d+)", _line)
    if match is None:
        raise ValueError("malformed SD19 line: {0!r}".format(_line[:40]))
    symbol, pixels = match.groups()

    # The label is decoded locally so the function also works when this
    # module is imported (no reliance on a table built by the script part).
    if len(symbol) != 1 or symbol not in "0123456789":
        raise ValueError("label is not a single digit: {0!r}".format(symbol))
    label = int(symbol)

    values = np.array([int(char) for char in pixels], 'float32')
    # Invert the polarity (0 <-> 1); reshape raises ValueError on a bad count.
    digit = (1.0 - values).reshape(128, 128)

    return digit, label

def deskew(img):
    """Straighten a slanted glyph with a moment-based horizontal shear.

    The slant is estimated as ``mu11 / mu02`` from ``cv2.moments`` and undone
    by shearing the rows around the horizontal mid-line of the image.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : np.array
        deskewed digit, same size as ``img``; an unmodified copy of ``img``
        when ``|mu02|`` is below 1e-2
    """
    moments = cv2.moments(img)
    if abs(moments['mu02']) < 1e-2:
        # Too little vertical spread: the ratio below would be unstable.
        return img.copy()

    skew = moments['mu11'] / moments['mu02']
    height, width = img.shape[0], img.shape[1]
    # The offset keeps the row at half the image height in place.
    shear = np.float32([[1, skew, -0.5 * height * skew], [0, 1, 0]])
    # warpAffine expects the output size as (width, height).
    return cv2.warpAffine(img, shear, (width, height),
                          flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)


def resize_with_constant_ratio(img, char_dim):
    """Resize image while keeping aspect ratio.

    The longer side becomes ``char_dim - 2`` pixels; the 2 pixels left over
    are the margin the original author reserves so that the final image is
    derivative friendly. The shorter side is scaled by the same factor,
    truncated to an integer and never allowed to drop below 1 pixel.

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        dst dim

    Returns
    -------
    dst : np.array
        resized digit

    Raises
    ------
    ValueError
        if ``img`` has no rows or no columns
    """
    roi_h = img.shape[0]
    roi_w = img.shape[1]
    if roi_h == 0 or roi_w == 0:
        raise ValueError("cannot resize an empty image")

    pad_dim = 2
    scale = float(char_dim - pad_dim) / max(roi_w, roi_h)
    if roi_w >= roi_h:
        new_w = int(char_dim - pad_dim)
        # A very flat glyph would otherwise truncate to a height of 0.
        new_h = max(1, int(roi_h * scale))
    else:
        # A very thin glyph would otherwise truncate to a width of 0.
        new_w = max(1, int(roi_w * scale))
        new_h = int(char_dim - pad_dim)

    # cv2.resize takes the target size as (width, height).
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

def trim_padding(img):
    """Drop every row and every column that contains only zeros.

    Rows are filtered first, then columns of what is left. All-zero rows or
    columns are removed wherever they are, not only on the border.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : np.array
        trimmed digit
    """
    rows_with_ink = np.any(np.not_equal(img, 0), axis=1)
    without_blank_rows = img[rows_with_ink]

    cols_with_ink = np.any(np.not_equal(without_blank_rows, 0), axis=0)
    return without_blank_rows[:, cols_with_ink]

def pad_digit(img, char_dim):
    """Pad zeros in order to get a square char_dim x char_dim image.

    The image is centred; when the padding is odd, the extra row goes on top
    and the extra column on the left.

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        image dim

    Returns
    -------
    dst : np.array
        padded digit, float64, shape (char_dim, char_dim)

    Raises
    ------
    ValueError
        if ``img`` is larger than ``char_dim`` in either direction
    """
    height, width = img.shape[0], img.shape[1]
    pad_h = char_dim - height
    pad_w = char_dim - width
    if pad_h < 0 or pad_w < 0:
        raise ValueError("image {0}x{1} does not fit in {2}x{2}".format(
            height, width, char_dim))

    # Integer floor division keeps the offsets integral on Python 2 and 3.
    top = pad_h - pad_h // 2
    left = pad_w - pad_w // 2

    dst = np.zeros((char_dim, char_dim))
    dst[top:top + height, left:left + width] = img

    return dst


def print_overwrite(text):
    """Print with overwrite (for progression counter).

    Writes ``len(text) + 1`` backspaces, then the text and one trailing
    space, without a newline, and flushes stdout so the counter shows up
    immediately. The extra backspace matches the trailing space left by the
    previous call.

    Parameters
    ----------
    text : string
        text to display
    """
    delete = "\b" * (len(text) + 1)
    # sys.stdout.write behaves the same on Python 2 and Python 3.
    sys.stdout.write("{0}{1} ".format(delete, text))
    sys.stdout.flush()

if __name__ == '__main__':
    # Script entry point: read every glyph of the SD19 text file, run the
    # preprocessing chain on it and pickle the [image, label] pairs.
    print(__doc__)

    sd19_filename = "sd19-binary_digits.txt"
    dataset = []
    # Code point -> character table for 0-9, A-Z and a-z.
    symbol_map = dict((x, chr(x)) for x in
                      list(range(48, 58)) + list(range(65, 91)) +
                      list(range(97, 123)))

    current_dir = os.curdir

    num_records = 0
    # Expected number of lines; used for the percentage and the final name.
    num_lines = 402953

    char_dim = 28
    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"

    with open(sd19_filename, "r") as data:
        for line in data:
            digit, label = read_glyph(line)
            digit_deskewed = deskew(digit)
            digit_trimmed = trim_padding(digit_deskewed)
            digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
            digit_padded = pad_digit(digit_resized, char_dim)

            dataset.append([(digit_padded*255).astype('uint8'), label])
            num_records += 1

            # Checkpoint after the record is stored, so the file named
            # ..._<N>.pickle really holds N records. The list is never
            # cleared, so each checkpoint contains everything read so far.
            if num_records % 20000 == 0:
                checkpoint = os.path.join(
                    current_dir, pickle_name + str(num_records) + ".pickle")
                with open(checkpoint, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

            print_overwrite("num_records : {}/{} - {:5.2f}%"\
                .format(num_records, num_lines, num_records*1./num_lines*100))

    # NOTE: the final file is named after the expected line count, not after
    # the number of records actually read.
    with open(os.path.join(current_dir, pickle_name +\
                str(num_lines) + ".pickle"), 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
	#! /usr/bin/python
	# -- coding: utf-8 --

	# Naming-style checks for method / function names etc. are ignored
	# pylint: disable=C0103

	'''
	Read and pre-process SD19 characters text file.
	Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/

	Characters in txt file are in 128x128 images with much padded zeros.
	It may be suitable for learning to have smaller, deskewed, trimmed, squared ones

	Following preprocessing is applied to the dataset:
	- Read glyph (see read_glyph())
	- Moment-based image deskew (see deskew())
	- Trim zeros rows and columns (see trim_padding())
	- Resize image while keeping aspect ratio (see resize_with_constant_ratio())
	- Pad zeros in order to get a square image (see pad_digit())

	Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/

	Usage:
	preprocess_sd19_text.py
	'''
	#

	import os
	import re
	import sys
	import pickle
	import cv2
	import numpy as np
	import math

	def read_glyph(_line):
	    """Parse one line of the SD19 text file into an image and its label.

	    A line is expected to look like ``<label> <pixels>``: one decimal digit
	    character, a single space, then a run of digit characters, one per pixel
	    (the file name suggests 0/1 values; this is not enforced here).

	    Parameters
	    ----------
	    _line : string
	        current line in SD19 text file

	    Returns
	    -------
	    digit : np.array
	        2D float32 array of shape (128, 128); each pixel value ``v`` read
	        from the line is stored as ``1 - v``
	    label : int
	        the digit class, 0 to 9

	    Raises
	    ------
	    ValueError
	        if the line does not have the ``<label> <pixels>`` layout, if the
	        label is not a single decimal digit, or if the number of pixels is
	        not 128*128
	    """
	    match = re.search(r"^(\S+) (\d+)", _line)
	    if match is None:
	        raise ValueError("malformed SD19 line: {0!r}".format(_line[:40]))
	    symbol, pixels = match.groups()

	    # The label is decoded locally so the function also works when this
	    # module is imported (no reliance on a table built by the script part).
	    if len(symbol) != 1 or symbol not in "0123456789":
	        raise ValueError("label is not a single digit: {0!r}".format(symbol))
	    label = int(symbol)

	    values = np.array([int(char) for char in pixels], 'float32')
	    # Invert the polarity (0 <-> 1); reshape raises ValueError on a bad count.
	    digit = (1.0 - values).reshape(128, 128)

	    return digit, label

	def deskew(img):
	    """Straighten a slanted glyph with a moment-based horizontal shear.

	    The slant is estimated as ``mu11 / mu02`` from ``cv2.moments`` and undone
	    by shearing the rows around the horizontal mid-line of the image.

	    Parameters
	    ----------
	    img : np.array
	        2D digit array

	    Returns
	    -------
	    dst : np.array
	        deskewed digit, same size as ``img``; an unmodified copy of ``img``
	        when ``|mu02|`` is below 1e-2
	    """
	    moments = cv2.moments(img)
	    if abs(moments['mu02']) < 1e-2:
	        # Too little vertical spread: the ratio below would be unstable.
	        return img.copy()

	    skew = moments['mu11'] / moments['mu02']
	    height, width = img.shape[0], img.shape[1]
	    # The offset keeps the row at half the image height in place.
	    shear = np.float32([[1, skew, -0.5 * height * skew], [0, 1, 0]])
	    # warpAffine expects the output size as (width, height).
	    return cv2.warpAffine(img, shear, (width, height),
	                          flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)


	def resize_with_constant_ratio(img, char_dim):
	    """Resize image while keeping aspect ratio.

	    The longer side becomes ``char_dim - 2`` pixels; the 2 pixels left over
	    are the margin the original author reserves so that the final image is
	    derivative friendly. The shorter side is scaled by the same factor,
	    truncated to an integer and never allowed to drop below 1 pixel.

	    Parameters
	    ----------
	    img : np.array
	        2D digit array
	    char_dim : int
	        dst dim

	    Returns
	    -------
	    dst : np.array
	        resized digit

	    Raises
	    ------
	    ValueError
	        if ``img`` has no rows or no columns
	    """
	    roi_h = img.shape[0]
	    roi_w = img.shape[1]
	    if roi_h == 0 or roi_w == 0:
	        raise ValueError("cannot resize an empty image")

	    pad_dim = 2
	    scale = float(char_dim - pad_dim) / max(roi_w, roi_h)
	    if roi_w >= roi_h:
	        new_w = int(char_dim - pad_dim)
	        # A very flat glyph would otherwise truncate to a height of 0.
	        new_h = max(1, int(roi_h * scale))
	    else:
	        # A very thin glyph would otherwise truncate to a width of 0.
	        new_w = max(1, int(roi_w * scale))
	        new_h = int(char_dim - pad_dim)

	    # cv2.resize takes the target size as (width, height).
	    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

	def trim_padding(img):
	    """Drop every row and every column that contains only zeros.

	    Rows are filtered first, then columns of what is left. All-zero rows or
	    columns are removed wherever they are, not only on the border.

	    Parameters
	    ----------
	    img : np.array
	        2D digit array

	    Returns
	    -------
	    dst : np.array
	        trimmed digit
	    """
	    rows_with_ink = np.any(np.not_equal(img, 0), axis=1)
	    without_blank_rows = img[rows_with_ink]

	    cols_with_ink = np.any(np.not_equal(without_blank_rows, 0), axis=0)
	    return without_blank_rows[:, cols_with_ink]

	def pad_digit(img, char_dim):
	    """Pad zeros in order to get a square char_dim x char_dim image.

	    The image is centred; when the padding is odd, the extra row goes on top
	    and the extra column on the left.

	    Parameters
	    ----------
	    img : np.array
	        2D digit array
	    char_dim : int
	        image dim

	    Returns
	    -------
	    dst : np.array
	        padded digit, float64, shape (char_dim, char_dim)

	    Raises
	    ------
	    ValueError
	        if ``img`` is larger than ``char_dim`` in either direction
	    """
	    height, width = img.shape[0], img.shape[1]
	    pad_h = char_dim - height
	    pad_w = char_dim - width
	    if pad_h < 0 or pad_w < 0:
	        raise ValueError("image {0}x{1} does not fit in {2}x{2}".format(
	            height, width, char_dim))

	    # Integer floor division keeps the offsets integral on Python 2 and 3.
	    top = pad_h - pad_h // 2
	    left = pad_w - pad_w // 2

	    dst = np.zeros((char_dim, char_dim))
	    dst[top:top + height, left:left + width] = img

	    return dst


	def print_overwrite(text):
	    """Print with overwrite (for progression counter).

	    Writes ``len(text) + 1`` backspaces, then the text and one trailing
	    space, without a newline, and flushes stdout so the counter shows up
	    immediately. The extra backspace matches the trailing space left by the
	    previous call.

	    Parameters
	    ----------
	    text : string
	        text to display
	    """
	    delete = "\b" * (len(text) + 1)
	    # sys.stdout.write behaves the same on Python 2 and Python 3.
	    sys.stdout.write("{0}{1} ".format(delete, text))
	    sys.stdout.flush()

	if __name__ == '__main__':
	    # Script entry point: read every glyph of the SD19 text file, run the
	    # preprocessing chain on it and pickle the [image, label] pairs.
	    print(__doc__)

	    sd19_filename = "sd19-binary_digits.txt"
	    dataset = []
	    # Code point -> character table for 0-9, A-Z and a-z.
	    symbol_map = dict((x, chr(x)) for x in
	                      list(range(48, 58)) + list(range(65, 91)) +
	                      list(range(97, 123)))

	    current_dir = os.curdir

	    num_records = 0
	    # Expected number of lines; used for the percentage and the final name.
	    num_lines = 402953

	    char_dim = 28
	    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"

	    with open(sd19_filename, "r") as data:
	        for line in data:
	            digit, label = read_glyph(line)
	            digit_deskewed = deskew(digit)
	            digit_trimmed = trim_padding(digit_deskewed)
	            digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
	            digit_padded = pad_digit(digit_resized, char_dim)

	            dataset.append([(digit_padded*255).astype('uint8'), label])
	            num_records += 1

	            # Checkpoint after the record is stored, so the file named
	            # ..._<N>.pickle really holds N records. The list is never
	            # cleared, so each checkpoint contains everything read so far.
	            if num_records % 20000 == 0:
	                checkpoint = os.path.join(
	                    current_dir, pickle_name + str(num_records) + ".pickle")
	                with open(checkpoint, 'wb') as f:
	                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

	            print_overwrite("num_records : {}/{} - {:5.2f}%"\
	                .format(num_records, num_lines, num_records*1./num_lines*100))

	    # NOTE: the final file is named after the expected line count, not after
	    # the number of records actually read.
	    with open(os.path.join(current_dir, pickle_name +\
	                str(num_lines) + ".pickle"), 'wb') as f:
	        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)