Skip to content

Instantly share code, notes, and snippets.

@seeb0h
Last active October 4, 2017 13:35
Show Gist options
  • Save seeb0h/838bfad11852f4d2123f2c46253ea3ac to your computer and use it in GitHub Desktop.
Save seeb0h/838bfad11852f4d2123f2c46253ea3ac to your computer and use it in GitHub Desktop.
Read and pre-process SD19 characters text file
#! /usr/bin/python
# -​- coding: utf-8 -​-
# On ignore le style pour le nom des méthode / fonctions etc...
# pylint: disable=C0103
'''
Read and pre-process SD19 characters text file.
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/
Characters in txt file are in 128x128 images with much padded zeros.
It may be suitable for learning to have smaller, deskewed, trimmed, squared ones
Following preprocessing is applied to the dataset:
- Read glyph (see read_glyph())
- Moment-based image deskew (see deskew())
- Trim zeros rows and columns (see trim_padding())
- Resize image while keeping aspect ratio (see resize_with_constant_ratio())
- Pad zeros in order to get a square image (see pad_digit())
Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/
Usage:
preprocess_sd19_text.py
'''
#
import os
import re
import sys
import pickle
import cv2
import numpy as np
import math
def read_glyph(_line):
"""Extract digit from the text file
Parameters
----------
_line : string
current line in SD19 text file
Returns
-------
digit : np.array
2D digit 128x128
label : int
the label
"""
match = re.search("^(\S+) (\d+)", _line)
label = match.group(1)
vector = list(match.group(2))
vector = [int(x) for x in vector]
label = ord(label)
label = int(symbol_map[label])
digit = np.array(vector, 'float32')
digit = (digit*-1.+1.).reshape(128, 128)
return digit, label
def deskew(img):
"""Deskew digit
Parameters
----------
img : np.array
2D digit array
Returns
-------
dst : Deskewed digit
"""
m = cv2.moments(img)
if abs(m['mu02']) < 1e-2:
return img.copy()
skew = m['mu11']/m['mu02']
rot_mat = np.float32([[1, skew, -0.5*max(img.shape[0], img.shape[1])*skew], [0, 1, 0]])
img = cv2.warpAffine(img, rot_mat, (img.shape[0], img.shape[1]), flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
return img
def resize_with_constant_ratio(img, char_dim):
"""Resize image while keeping aspect ratio. Max dim is char_dim
pad_dim is applied in order to have derivative friendly image
Parameters
----------
img : np.array
2D digit array
char_dim : int
dst dim
Returns
-------
dst : resized digit
"""
roi_h = img.shape[0]
roi_w = img.shape[1]
max_dim = max(roi_w, roi_h)
pad_dim = 2
scale = float(char_dim-pad_dim) / max_dim
if roi_w >= roi_h:
new_w = int(char_dim-pad_dim)
new_h = int(roi_h * scale)
else:
new_w = int(roi_w * scale)
new_h = int(char_dim-pad_dim)
dst = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
return dst
def trim_padding(img):
"""Trim zeros rows and columns
Parameters
----------
img : np.array
2D digit array
Returns
-------
dst : trimmed digit
"""
mask_row = np.all(np.equal(img, 0), axis=1)
dst = img[~mask_row]
mask_col = np.all(np.equal(dst, 0), axis=0)
dst = dst[:, ~mask_col]
return dst
def pad_digit(img, char_dim):
"""Pad zeros in order to get a square char_dimxchar_dim image
Parameters
----------
img : np.array
2D digit array
char_dim : int
image dim
Returns
-------
dst : padded digit
"""
pad_h = char_dim-img.shape[0]
pad_w = char_dim-img.shape[1]
pad_h_b = math.floor(pad_h/2)
pad_h_t = pad_h - pad_h_b
pad_w_r = math.floor(pad_w/2)
pad_w_l = pad_w - pad_w_r
dst = np.hstack(( img, np.zeros((img.shape[0], pad_w_r))))
dst = np.hstack(( np.zeros((dst.shape[0], pad_w_l)), dst))
dst = np.vstack(( dst, np.zeros((pad_h_b, dst.shape[1]))))
dst = np.vstack(( np.zeros((pad_h_t, dst.shape[1])), dst))
return dst
def print_overwrite(text):
"""Print with overwrite (for progression counter)
Parameters
----------
text : string
text to display
"""
delete = "\b" * (len (text)+1)
print "{0}{1}".format(delete, text),
if __name__ == '__main__':
print __doc__
sd19_filename = "sd19-binary_digits.txt"
data = open(sd19_filename, "r")
dataset = []
symbol_map = dict([(x, chr(x)) for x in range(48, 58) + range (65, 91) + range(97, 123)])
current_dir = os.curdir
num_records = 0
num_lines = 402953
char_dim=28
pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"
for line in data:
num_records += 1
if num_records%20000 == 0:
with open(os.path.join(current_dir, pickle_name +\
str(num_records) + ".pickle"), 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print_overwrite("num_records : {}/{} - {:5.2f}%"\
.format(num_records, num_lines, num_records*1./num_lines*100))
digit, label = read_glyph(line)
digit_deskewed = deskew(digit)
digit_trimmed = trim_padding(digit_deskewed)
digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
digit_padded = pad_digit(digit_resized, char_dim)
item = []
item.append((digit_padded*255).astype('uint8'))
item.append(label)
dataset.append(item)
with open(os.path.join(current_dir, pickle_name +\
str(num_lines) + ".pickle"), 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
@Link009
Copy link

Link009 commented Oct 4, 2017

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment