# black7375/create_dataset.py

## create_dataset.py
import h5py
import PIL, PIL.ImageFont, PIL.Image, PIL.ImageDraw, PIL.ImageChops, PIL.ImageOps
import os
import random
import string
import numpy
import sys

# -------------------- Convert to Numpy Array --------------------
# Width and height (pixels) of the square canvas each glyph ends up on.
w = h = 64
# Font size passed to the loader; also the unit of the oversized draw canvas.
w0 = h0 = 256

# Character sets to render: Latin letters plus digits, and Hangul characters.
s_ascii = string.ascii_uppercase + string.ascii_lowercase + string.digits
# s_hangul = "가나다라마바사아자차카타파하"
s_hangul = "독창적인스물네자로만들어진표음문자로서고유하고특별해지켜져야한다"
# All-white reference canvas; diffing a drawn canvas against it locates the ink.
blank = PIL.Image.new('L', (5 * w0, 3 * h0), 255)


def read_font(fn, chars=s_ascii):
    """Render every character of ``chars`` with the font file ``fn``.

    Each character is drawn with font size ``min(w0, h0)`` at offset
    ``(w0, h0)`` on a white ``5*w0`` x ``3*h0`` canvas. All glyphs are then
    cropped to one shared vertical band and shrunk by one shared scale
    factor, so their relative size and vertical alignment are preserved.
    Finally each glyph is centred on a white ``w`` x ``h`` canvas and
    inverted, turning the white background into 0.

    Args:
        fn: Path of the font file handed to ``PIL.ImageFont.truetype``.
        chars: Characters to render; one sample is produced per character.

    Returns:
        ``numpy`` array of shape ``(len(chars), h, w)``.

    Raises:
        ValueError: If a character leaves the canvas untouched, so it has
            no bounding box to crop to.
        Exception: Whatever ``PIL.ImageFont.truetype`` raises for a font it
            cannot load is propagated unchanged.
    """
    font = PIL.ImageFont.truetype(fn, min(w0, h0))

    # We need to make sure we scale down the fonts but preserve the vertical alignment
    min_ly = float('inf')
    max_hy = float('-inf')
    max_width = 0
    imgs = []

    for char in chars:
        print('...', char)
        # Draw character
        img = PIL.Image.new("L", (w0*5, h0*3), 255)
        draw = PIL.ImageDraw.Draw(img)
        draw.text((w0, h0), char, font=font)

        # Get bounding box. getbbox() returns None when nothing differs from
        # the blank canvas; report that clearly instead of failing to unpack.
        bbox = PIL.ImageChops.difference(img, blank).getbbox()
        if bbox is None:
            raise ValueError('font %s draws nothing for %r' % (fn, char))
        lx, ly, hx, hy = bbox
        min_ly = min(min_ly, ly)
        max_hy = max(max_hy, hy)
        max_width = max(max_width, hx - lx)
        imgs.append((lx, hx, img))

    print('crop dims:', max_hy - min_ly, max_width)
    # One factor for all glyphs: the tallest band and the widest glyph must
    # both fit inside the w x h target.
    scale_factor = min(1.0 * h / (max_hy - min_ly), 1.0 * w / max_width)
    data = []

    for lx, hx, img in imgs:
        # Crop horizontally to the glyph, vertically to the shared band.
        img = img.crop((lx, min_ly, hx, max_hy))

        # Resize to smaller
        new_width = (hx-lx) * scale_factor
        new_height = (max_hy - min_ly) * scale_factor
        # Clamp to one pixel so a very thin glyph does not truncate to a
        # zero-sized image. LANCZOS is the filter ANTIALIAS used to alias;
        # the ANTIALIAS name no longer exists in Pillow >= 10.
        img = img.resize((max(1, int(new_width)),
                          max(1, int(new_height))), PIL.Image.LANCZOS)

        # Expand to square
        img_sq = PIL.Image.new('L', (w, h), 255)
        offset_x = (w - new_width)/2
        offset_y = (h - new_height)/2
        print(offset_x, offset_y)
        img_sq.paste(img, (int(offset_x), int(offset_y)))

        # Convert to numpy array and invert (white background -> 0)
        matrix = numpy.array(img_sq.getdata()).reshape((h, w))
        matrix = 255 - matrix
        data.append(matrix)

    return numpy.array(data)


# -------------------- Get Files --------------------
def get_ttfs(d='../dataset/fonts'):
    """Yield the path of every ``.ttf`` / ``.otf`` file found under ``d``.

    The tree rooted at ``d`` is walked recursively with ``os.walk``. The
    extension is matched case-insensitively, so files such as ``Arial.TTF``
    are picked up as well.

    Args:
        d: Root directory to search.

    Yields:
        ``os.path.join(dirpath, filename)`` for each matching file. Nothing
        is yielded when ``d`` does not exist.
    """
    for dirpath, _dirnames, filenames in os.walk(d):
        for filename in filenames:
            if filename.lower().endswith(('.ttf', '.otf')):
                yield os.path.join(dirpath, filename)

def new_dataset(f, dshape, label):
    """Create a dataset called ``label`` in ``f`` and hand ``f`` back.

    The dataset starts with shape ``dshape``, uses ``dshape`` as its chunk
    shape and stores unsigned bytes (``'u1'``).

    Args:
        f: Object exposing ``create_dataset`` (an open ``h5py.File`` here).
        dshape: Initial shape tuple of the dataset.
        label: Name of the dataset to create.

    Returns:
        The same ``f`` that was passed in.
    """
    # Leave the first axis unbounded so rows can be appended later; every
    # other axis stays fixed at its initial size.
    growable_shape = (None,) + dshape[1:]
    options = dict(chunks=dshape, maxshape=growable_shape, dtype='u1')
    f.create_dataset(label, dshape, **options)
    return f

def new_file(path, dshape, label):
    """Create the HDF5 file ``path`` holding one dataset named ``label``.

    The file is opened in ``'w'`` mode and the dataset is added through
    ``new_dataset`` with initial shape ``dshape``.

    Returns:
        The open ``h5py.File``; closing it is the caller's job.
    """
    # new_dataset returns the file it was given, so its result is the handle.
    return new_dataset(h5py.File(path, 'w'), dshape, label)

def get_h5py(path='fonts.hdf5', dshape=(1,), label=None):
    """Open the HDF5 file at ``path``, creating it when it is missing.

    An existing file is opened in ``'r+'`` mode so its contents are kept.
    Otherwise a new file is built with ``new_file``, which also creates a
    dataset ``label`` of shape ``dshape``.

    Args:
        path: Location of the HDF5 file.
        dshape: Initial dataset shape, used only when the file is created.
        label: Dataset name, used only when the file is created.

    Returns:
        The open ``h5py.File``; closing it is the caller's job.
    """
    # Decide on ``path`` itself. Testing the literal 'fonts.hdf5' here would
    # re-create, and thereby truncate, an existing file at any other path
    # whenever no 'fonts.hdf5' happens to sit in the working directory.
    if os.path.exists(path):
        return h5py.File(path, 'r+')
    return new_file(path, dshape, label)

# -------------------- Create Dataset --------------------
def create_dataset(path='../dataset/fonts', chars=s_ascii, label='ascii',
                   h5dfP = 'fonts.hdf5',):
    """Render every font under ``path`` and append it to dataset ``label``.

    Each font found by ``get_ttfs`` is rendered with ``read_font`` and
    stored as one row of shape ``(len(chars), h, w)``. Fonts that cannot be
    rendered are reported on stdout and skipped.

    Args:
        path: Directory tree searched for font files.
        chars: Characters rendered for every font.
        label: Name of the dataset inside the HDF5 file.
        h5dfP: Path of the HDF5 file; created when missing.
    """
    dshape = (1, len(chars), h, w)
    f = get_h5py(h5dfP, dshape, label)

    # try/finally so the HDF5 handle is released even when a write fails.
    try:
        try:
            dset = f[label]
        except KeyError:
            # The file exists but does not hold this label yet.
            f = new_dataset(f, dshape, label)
            dset = f[label]

        # A one-row dataset is taken to be the untouched placeholder and is
        # overwritten from row 0; anything longer is appended to.
        # NOTE(review): a file holding exactly one real font looks the same
        # as a fresh placeholder, so that row is overwritten on the next run.
        i = 0 if dset.shape[0] == 1 else dset.shape[0]

        for fn in get_ttfs(path):
            print(fn)
            try:
                data = read_font(fn, chars=chars)
            except Exception:
                # Not a bare except: Ctrl-C / SystemExit must still be able
                # to stop a long run.
                print('was not able to read', fn)
                continue

            print(data.shape)
            # Grow the first axis by one row, then fill it.
            dset.resize((i+1,) + dshape[1:])
            dset[i] = data
            i += 1
            # Flush per font so an interrupted run keeps what was written.
            f.flush()
    finally:
        f.close()

def file_dataset(path,  h5dfP = 'fonts.hdf5', chars=s_ascii, label='fonts'):
    """Render the single font ``path`` into row 0 of dataset ``label``.

    Args:
        path: Font file to render with ``read_font``.
        h5dfP: Path of the HDF5 file; created when missing.
        chars: Characters to render. Defaults to ``s_ascii``, which is also
            the default of ``read_font``.
        label: Name of the dataset that receives the glyphs.

    If the font cannot be rendered a message is printed and the dataset is
    left unchanged.
    """
    dshape = (1, len(chars), h, w)
    f = get_h5py(h5dfP, dshape, label)

    # try/finally so the HDF5 handle is released on every path.
    try:
        try:
            dset = f[label]
        except KeyError:
            # Same fallback as create_dataset: the file exists but does not
            # hold this label yet.
            f = new_dataset(f, dshape, label)
            dset = f[label]

        try:
            data = read_font(path, chars=chars)
        except Exception:
            print("Can't able to read")
            # Nothing to store; falling through would hit an unbound ``data``.
            return

        dset[0] = data
        f.flush()
    finally:
        f.close()

if __name__ == "__main__":
    # Build one dataset per character set from the same font directory; both
    # go into the default HDF5 file, each under its own label.
    create_dataset(path='../font/fonts', chars=s_ascii, label='ascii')
    create_dataset(path='../font/fonts', chars=s_hangul, label='hangul')