Last active
October 4, 2017 13:35
-
-
Save seeb0h/838bfad11852f4d2123f2c46253ea3ac to your computer and use it in GitHub Desktop.
Read and pre-process SD19 characters text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
# -*- coding: utf-8 -*-
# Naming-style checks are deliberately ignored for method / function names, etc.
# pylint: disable=C0103 | |
''' | |
Read and pre-process SD19 characters text file. | |
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/ | |
Characters in the text file are stored as 128x128 images with a lot of zero padding.
For learning, smaller, deskewed, trimmed and square images may be more suitable.
Following preprocessing is applied to the dataset: | |
- Read glyph (see read_glyph()) | |
- Moment-based image deskew (see deskew()) | |
- Trim zeros rows and columns (see trim_padding()) | |
- Resize image while keeping aspect ratio (see resize_with_constant_ratio()) | |
- Pad zeros in order to get a square image (see pad_digit()) | |
Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/ | |
Usage: | |
preprocess_sd19_text.py | |
''' | |
# | |
import os | |
import re | |
import sys | |
import pickle | |
import cv2 | |
import numpy as np | |
import math | |
def read_glyph(_line):
    """Parse one record of the SD19 text file into an image and its label.

    A record is matched as: a label token, one space, then a run of decimal
    characters (one per pixel, reshaped row by row into 128x128).

    Parameters
    ----------
    _line : string
        current line in SD19 text file

    Returns
    -------
    digit : np.array
        2D float32 array of shape (128, 128); every pixel value p read from
        the file is stored as 1 - p
    label : int
        the digit label (0-9)

    Raises
    ------
    ValueError
        if the line does not match the record layout, if the label is not a
        single decimal digit, or if the pixel run cannot be reshaped to
        128x128
    """
    match = re.search(r"^(\S+) (\d+)", _line)
    if match is None:
        raise ValueError("malformed SD19 record: {0!r}".format(_line[:40]))
    label_text, pixels = match.groups()

    # The label is decoded here instead of through a module-level lookup
    # table, so the function also works when this file is imported rather
    # than run as a script.
    if len(label_text) != 1 or label_text not in "0123456789":
        raise ValueError("unsupported label: {0!r}".format(label_text))
    label = int(label_text)

    digit = np.array([int(x) for x in pixels], 'float32')
    # Swap the two binary levels (0 <-> 1) and give the flat run its 2D shape.
    digit = (1. - digit).reshape(128, 128)
    return digit, label
def deskew(img):
    """Deskew a digit with a moment-based horizontal shear.

    The shear factor is the ratio of the central moments mu11 / mu02 of the
    image. When mu02 is (almost) zero the ratio is not usable and an
    unmodified copy of the input is returned.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : np.array
        Deskewed digit, same size as the input
    """
    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        return img.copy()
    skew = m['mu11'] / m['mu02']
    height, width = img.shape[:2]
    # NOTE(review): the horizontal offset uses the larger image side; for the
    # square glyphs produced by read_glyph() this is simply the side length.
    shear = np.float32([[1, skew, -0.5 * max(height, width) * skew],
                        [0, 1, 0]])
    # warpAffine expects the output size as (width, height), i.e. the reverse
    # of numpy's (rows, cols) shape order.
    return cv2.warpAffine(img, shear, (width, height),
                          flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
def resize_with_constant_ratio(img, char_dim, pad_dim=2):
    """Resize an image while keeping its aspect ratio.

    The longer side of the result is ``char_dim - pad_dim`` pixels; the
    shorter side is scaled by the same factor (truncated, but never below
    one pixel so that very thin glyphs still produce a valid image).

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        dst dim
    pad_dim : int, optional
        number of pixels of char_dim left free for later padding
        (default 2, the value previously hard-coded here)

    Returns
    -------
    dst : np.array
        resized digit

    Raises
    ------
    ValueError
        if img has no pixels or ``char_dim - pad_dim`` is smaller than 1
    """
    roi_h, roi_w = img.shape[:2]
    if roi_h == 0 or roi_w == 0:
        raise ValueError("cannot resize an empty image")
    target = int(char_dim - pad_dim)
    if target < 1:
        raise ValueError("char_dim must be larger than pad_dim")

    scale = float(char_dim - pad_dim) / max(roi_w, roi_h)
    if roi_w >= roi_h:
        new_w = target
        # int() truncates; clamp so a flat glyph never collapses to 0 rows.
        new_h = max(1, int(roi_h * scale))
    else:
        # Same clamp for a narrow glyph (e.g. a thin "1").
        new_w = max(1, int(roi_w * scale))
        new_h = target
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
def trim_padding(img):
    """Drop every row and every column that contains only zeros.

    Rows are filtered first, then columns are filtered on the remaining
    rows. An image without any non-zero value yields an empty array.

    Parameters
    ----------
    img : np.array
        2D digit array

    Returns
    -------
    dst : np.array
        trimmed digit
    """
    rows_with_content = np.not_equal(img, 0).any(axis=1)
    kept_rows = img[rows_with_content]
    cols_with_content = np.not_equal(kept_rows, 0).any(axis=0)
    return kept_rows[:, cols_with_content]
def pad_digit(img, char_dim):
    """Pad zeros in order to get a square char_dim x char_dim image.

    The input is placed in the middle of the output. When the amount of
    padding along an axis is odd, the extra row goes to the top and the
    extra column goes to the left.

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        image dim

    Returns
    -------
    dst : np.array
        padded digit of shape (char_dim, char_dim); the dtype is the input
        dtype promoted with float64

    Raises
    ------
    ValueError
        if img is larger than char_dim along either axis
    """
    img_h, img_w = img.shape[0], img.shape[1]
    pad_h = char_dim - img_h
    pad_w = char_dim - img_w
    if pad_h < 0 or pad_w < 0:
        raise ValueError(
            "image of shape {0}x{1} does not fit into {2}x{2}".format(
                img_h, img_w, char_dim))

    # Integer floor division keeps the offsets integral on every Python
    # version (math.floor returns a float on Python 2, which numpy does not
    # accept as a shape).
    top = pad_h - pad_h // 2
    left = pad_w - pad_w // 2

    # One zero canvas plus a slice assignment replaces four concatenations.
    dst = np.zeros((char_dim, char_dim),
                   dtype=np.result_type(img.dtype, np.float64))
    dst[top:top + img_h, left:left + img_w] = img
    return dst
def print_overwrite(text):
    """Print with overwrite (for progression counter).

    Writes ``len(text) + 1`` backspace characters followed by the text to
    standard output, without a trailing newline, and flushes so the counter
    shows up immediately.

    Parameters
    ----------
    text : string
        text to display
    """
    delete = "\b" * (len(text) + 1)
    # sys.stdout.write behaves the same on Python 2 and Python 3, unlike the
    # print statement with a trailing comma.
    sys.stdout.write("{0}{1}".format(delete, text))
    # No newline is written, so a buffered stream would otherwise hold the
    # text back.
    sys.stdout.flush()
if __name__ == '__main__':
    # Script entry point: read every record of the SD19 text file, run the
    # preprocessing chain on it and pickle the resulting [image, label] pairs.
    print(__doc__)

    sd19_filename = "sd19-binary_digits.txt"
    dataset = []
    # Character code -> character for 0-9, A-Z and a-z. Kept as a
    # module-level name in case other code in this file looks it up.
    symbol_map = dict(
        (x, chr(x))
        for x in list(range(48, 58)) + list(range(65, 91)) + list(range(97, 123)))
    current_dir = os.curdir
    num_records = 0
    # Expected number of records; used for the progress display and for the
    # name of the final pickle file.
    num_lines = 402953
    char_dim = 28
    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"

    # The context manager closes the input file even if a record fails.
    with open(sd19_filename, "r") as data:
        for line in data:
            num_records += 1
            print_overwrite("num_records : {}/{} - {:5.2f}%"\
                .format(num_records, num_lines, num_records*1./num_lines*100))

            digit, label = read_glyph(line)
            digit_deskewed = deskew(digit)
            digit_trimmed = trim_padding(digit_deskewed)
            digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
            digit_padded = pad_digit(digit_resized, char_dim)
            dataset.append([(digit_padded*255).astype('uint8'), label])

            # Intermediate dump every 20000 records. It runs after the
            # append so that the file named N really holds N records.
            if num_records % 20000 == 0:
                checkpoint = os.path.join(
                    current_dir, pickle_name + str(num_records) + ".pickle")
                with open(checkpoint, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

    # Final dump of the complete dataset.
    with open(os.path.join(current_dir, pickle_name +\
            str(num_lines) + ".pickle"), 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ported to Python 3.
https://github.com/ad3f/preprocess_sd19_text.py