#! /usr/bin/python
# -*- coding: utf-8 -*-
# Naming-style checks are disabled for method / function names, etc.
# pylint: disable=C0103
"""
Read and pre-process SD19 characters text file.

Blog post :

Characters in the txt file are stored as 128x128 images with a lot of zero padding.
For learning, it may be preferable to have smaller, deskewed, trimmed, squared ones.

The following preprocessing is applied to the dataset:
- Read glyph (see read_glyph())
- Moment-based image deskew (see deskew())
- Trim zero rows and columns (see trim_padding())
- Resize image while keeping aspect ratio (see resize_with_constant_ratio())
- Pad zeros in order to get a square image (see pad_digit())

Extends original code from
"""
import os
import re
import sys
import pickle
import cv2
import numpy as np
import math
def read_glyph(_line):
    """Extract one glyph and its label from a line of the SD19 text file.

    Parameters
    ----------
    _line : string
        current line in SD19 text file: a non-blank label token, one space,
        then a run of decimal digits (one digit per pixel)

    Returns
    -------
    digit : np.array
        2D float32 glyph, 128x128, with pixel values inverted (1 - value)
    label : int
        the label

    Raises
    ------
    ValueError
        if the line does not match the expected "<label> <pixels>" layout,
        or if the pixel run cannot be reshaped to 128x128
    """
    # NOTE(review): the regex call and the two group extractions were
    # truncated in the source; reconstructed as re.search + group(1)/group(2).
    # Confirm against the original script.
    match = re.search(r"^(\S+) (\d+)", _line)
    if match is None:
        raise ValueError("Malformed SD19 line: {0!r}".format(_line[:40]))

    label = match.group(1)
    vector = [int(x) for x in match.group(2)]

    # symbol_map is a module-level dict (ASCII code -> character) built by
    # the script entry point; it must exist before this function is called.
    label = ord(label)
    label = int(symbol_map[label])

    digit = np.array(vector, 'float32')
    # Invert the pixel values (0 <-> 1) and restore the 2D layout.
    digit = (1. - digit).reshape(128, 128)
    return digit, label
def deskew(img):
    """Deskew a glyph with a moment-based horizontal shear.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : Deskewed digit (a copy of the input when the mu02 central moment
        is too small to estimate the skew)
    """
    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        # Skew is mu11 / mu02; avoid dividing by a near-zero moment.
        return img.copy()

    skew = m['mu11'] / m['mu02']
    height, width = img.shape[0], img.shape[1]
    # NOTE(review): the shear offset uses max(height, width); identical for
    # square input, but for non-square images the height may be the intended
    # reference -- confirm.
    rot_mat = np.float32([[1, skew, -0.5 * max(height, width) * skew],
                          [0, 1, 0]])
    # cv2.warpAffine expects dsize as (width, height), i.e. the reverse of
    # the numpy shape order.
    return cv2.warpAffine(img, rot_mat, (width, height),
                          flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
def resize_with_constant_ratio(img, char_dim, pad_dim=2):
    """Resize image while keeping aspect ratio.

    The longest side of the result is char_dim - pad_dim, which leaves a
    margin so that the final padded image is derivative friendly.

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        dst dim
    pad_dim : int, optional
        margin subtracted from char_dim to get the longest output side
        (defaults to 2, the value previously hard-coded)

    Returns
    -------
    dst : resized digit

    Raises
    ------
    ZeroDivisionError
        if img has a zero-sized longest side
    """
    roi_h = img.shape[0]
    roi_w = img.shape[1]
    max_dim = max(roi_w, roi_h)
    scale = float(char_dim - pad_dim) / max_dim

    # The longest side is pinned to the target size and the other one is
    # scaled, clamped to at least one pixel so that very thin glyphs do not
    # produce an empty destination size.
    if roi_w >= roi_h:
        new_w = int(char_dim - pad_dim)
        new_h = max(1, int(roi_h * scale))
    else:
        new_w = max(1, int(roi_w * scale))
        new_h = int(char_dim - pad_dim)

    dst = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return dst
def trim_padding(img):
    """Drop every row and every column that contains only zeros.

    Rows are filtered first; columns are then evaluated on the row-filtered
    array. All-zero rows/columns are removed wherever they are, not only on
    the borders.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : trimmed digit
    """
    keep_rows = np.any(img != 0, axis=1)
    rows_trimmed = img[keep_rows]
    keep_cols = np.any(rows_trimmed != 0, axis=0)
    return rows_trimmed[:, keep_cols]
def pad_digit(img, char_dim):
    """Pad zeros in order to get a square char_dim x char_dim image.

    The glyph is centred; when the padding is odd, the extra row goes to the
    top and the extra column to the left.

    Parameters
    ----------
    img : np.array
        2D digit array, no larger than char_dim on either axis
    char_dim : int
        image dim

    Returns
    -------
    dst : padded digit (float64 array of shape (char_dim, char_dim))

    Raises
    ------
    ValueError
        if img is larger than char_dim along either axis
    """
    img_h, img_w = img.shape[0], img.shape[1]
    pad_h = char_dim - img_h
    pad_w = char_dim - img_w
    if pad_h < 0 or pad_w < 0:
        raise ValueError(
            "image of shape {0}x{1} does not fit in {2}x{2}".format(
                img_h, img_w, char_dim))

    # Integer floor division: math.floor() returns a float on Python 2,
    # which is not a valid array dimension / index.
    pad_h_t = pad_h - pad_h // 2
    pad_w_l = pad_w - pad_w // 2

    # Paste the glyph onto a zero canvas (float64, like the zero blocks the
    # stacking-based version concatenated).
    dst = np.zeros((char_dim, char_dim))
    dst[pad_h_t:pad_h_t + img_h, pad_w_l:pad_w_l + img_w] = img
    return dst
def print_overwrite(text):
    """Print with overwrite (for progression counter).

    Emits enough backspaces to move back over a previous message of the same
    length, then the text followed by a single space, without a newline.

    Parameters
    ----------
    text : string
        text to display
    """
    # One backspace per character plus one for the trailing separator space.
    delete = "\b" * (len(text) + 1)
    # sys.stdout.write works on both Python 2 and 3; the trailing space
    # stands in for the separator the comma-terminated print statement
    # inserted between successive messages.
    sys.stdout.write("{0}{1} ".format(delete, text))
    # No newline is written, so flush explicitly or a buffered stdout may
    # not show the counter until much later.
    sys.stdout.flush()
if __name__ == '__main__':
    # Script entry point: read the SD19 text file line by line, pre-process
    # every glyph and pickle the resulting dataset (with periodic snapshots).
    print(__doc__)

    sd19_filename = "sd19-binary_digits.txt"
    dataset = []
    # ASCII code -> character for 0-9, A-Z and a-z. Module-level on purpose:
    # read_glyph() looks this name up as a global.
    symbol_map = {code: chr(code)
                  for codes in (range(48, 58), range(65, 91), range(97, 123))
                  for code in codes}
    current_dir = os.curdir
    num_records = 0
    # Expected number of lines in the input file; used for the progress
    # percentage and for the name of the final pickle.
    num_lines = 402953
    # NOTE(review): the definition of char_dim is missing from the source as
    # received; 28 is a placeholder -- confirm the intended output size.
    char_dim = 28
    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"

    # Context manager so the input file is closed even if processing fails.
    with open(sd19_filename, "r") as data:
        for line in data:
            num_records += 1

            # Periodic snapshot of everything processed so far.
            if num_records % 20000 == 0:
                snapshot_path = os.path.join(
                    current_dir, pickle_name + str(num_records) + ".pickle")
                with open(snapshot_path, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

            # NOTE(review): indentation was lost in the source; the progress
            # counter is assumed to be refreshed for every record -- confirm.
            print_overwrite("num_records : {}/{} - {:5.2f}%"
                            .format(num_records, num_lines,
                                    num_records * 1. / num_lines * 100))

            digit, label = read_glyph(line)
            digit_deskewed = deskew(digit)
            digit_trimmed = trim_padding(digit_deskewed)
            digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
            digit_padded = pad_digit(digit_resized, char_dim)

            # NOTE(review): the source stops at ``item = []`` and never adds
            # anything to ``dataset``; storing [image, label] pairs is the
            # presumed intent -- confirm the record layout.
            dataset.append([digit_padded, label])

    final_path = os.path.join(
        current_dir, pickle_name + str(num_lines) + ".pickle")
    with open(final_path, 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
