{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating an image dataset using character images from Stanford OCR and Tokens from Gutenberg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sets import Set \n",
"from collections import defaultdict\n",
"import pickle\n",
"import nltk\n",
"import re\n",
"import os\n",
"import pickle\n",
"from string import lower\n",
"from sklearn.model_selection import train_test_split\n",
"from collections import Counter\n",
"import matplotlib.pyplot as plt\n",
"from string import lower"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating a character image variants dictionary"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Character : a\n",
"Number of variants: 4021\n",
"Character : c\n",
"Number of variants: 2072\n",
"Character : b\n",
"Number of variants: 1282\n",
"Character : e\n",
"Number of variants: 4945\n",
"Character : d\n",
"Number of variants: 1441\n",
"Character : g\n",
"Number of variants: 2471\n",
"Character : f\n",
"Number of variants: 921\n",
"Character : i\n",
"Number of variants: 4027\n",
"Character : h\n",
"Number of variants: 861\n",
"Character : k\n",
"Number of variants: 909\n",
"Character : j\n",
"Number of variants: 189\n",
"Character : m\n",
"Number of variants: 1587\n",
"Character : l\n",
"Number of variants: 1696\n",
"Character : o\n",
"Number of variants: 3854\n",
"Character : n\n",
"Number of variants: 4988\n",
"Character : q\n",
"Number of variants: 341\n",
"Character : p\n",
"Number of variants: 1377\n",
"Character : s\n",
"Number of variants: 1394\n",
"Character : r\n",
"Number of variants: 2634\n",
"Character : u\n",
"Number of variants: 2538\n",
"Character : t\n",
"Number of variants: 2126\n",
"Character : w\n",
"Number of variants: 520\n",
"Character : v\n",
"Number of variants: 661\n",
"Character : y\n",
"Number of variants: 1221\n",
"Character : x\n",
"Number of variants: 413\n",
"Character : z\n",
"Number of variants: 1091\n"
]
}
],
"source": [
"data_file = open('letter.data', 'r')\n",
"char_pixel_dict = defaultdict(Set)\t# key : character, value : set of pixel array variants for that character\n",
"\n",
"\n",
"for line in data_file:\n",
"\tdata_arr = line.split()\n",
"\tchar = data_arr[1]\n",
"\n",
"\tchar_label = ord(char) - 97\n",
"\n",
"\tchar_features = map(lambda x : int(x), data_arr[6:])\n",
"\n",
"\tchar_set = char_pixel_dict.get(char, Set())\n",
"\n",
"\tchar_feature_tup = tuple(char_features)\n",
"\n",
"\tchar_set.add(char_feature_tup)\n",
"\n",
"\tchar_pixel_dict[char] = char_set\n",
"\n",
" \n",
"\n",
"for key, val in char_pixel_dict.items():\n",
"\tprint \"Character : \" + str(key)\n",
"\tprint \"Number of variants: \" + str(len(val))\n",
"\n",
"\n",
"with open(\"./letter_variants.npy\", \"w\") as f:\n",
"\tpickle.dump(char_pixel_dict, f)"
]
},
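{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check on the dictionary built above (a sketch, assuming each variant is a flat 16x8 binary image, consistent with the `reshape(..., 16, -1)` used later), pull one variant of `'a'` and render it:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# inspect one stored variant: each tuple should reshape to a 16x8 binary image\n",
"sample_variant = next(iter(char_pixel_dict['a']))\n",
"sample_img = np.array(sample_variant).reshape(16, -1)\n",
"\n",
"print \"Pixel array shape : \" + str(sample_img.shape)\n",
"plt.imshow(sample_img, cmap='gray')\n",
"plt.show()"
]
},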
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Script to create images and labels for tokens extracted from the book Moby Dick"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def get_dir(dir_name):\n",
"\tdir_path = os.path.join(os.getcwd(), dir_name)\n",
"\t\n",
"\tif (not os.path.exists(dir_path)):\n",
"\t\tos.makedirs(dir_path)\n",
"\n",
"\treturn dir_path\n",
"\n",
"def init_gutenberg():\n",
"\tnltk.download('gutenberg')\n",
"\n",
"\n",
"def get_book_tokens_gt_len(book_name, length):\n",
"\tbook_tokens = nltk.corpus.gutenberg.words(book_name)\n",
"\tbook_token_set = Set(book_tokens)\n",
"\tbook_token_list = map(lambda x : x.strip(), book_token_set)\n",
"\tbook_words = filter(lambda x : re.match(r'^[a-zA-Z]+$', x), book_token_list)\n",
"\tbook_words_gt_eq_len = filter(lambda x : len(x) >= length, book_words)\n",
"\n",
"\tbook_words_gt_eq_len = map(lambda x : lower(x), book_words_gt_eq_len)\n",
"\n",
"\treturn np.array(book_words_gt_eq_len)\n",
"\n",
"def _get_file_name(img_num, file_type = \"img\"):\n",
"\tif (file_type == \"img\"):\n",
"\t\tfile_name = \"img_\" + str(img_num) + \".png\"\n",
"\telse:\n",
"\t\tfile_name = \"img_\" + str(img_num) + \".txt\"\n",
"\n",
"\treturn file_name\n",
"\n",
"def _save_img_in_folder(dir_path, img_num, word_arr):\n",
"\tfile_name = _get_file_name(img_num)\n",
"\n",
"\tfile_path = os.path.join(dir_path, file_name)\n",
"\n",
"\tplt.imsave(file_path, word_arr)\n",
"\n",
"\n",
"def _draw_and_save_word_in_dir(img_num, img_arr, dir_path):\n",
"\timg_arr = img_arr.reshape(img_arr.shape[0], 16,- 1)\n",
"\n",
"\tword_arr = img_arr[0]\n",
"\n",
"\tfor i in range(1, len(img_arr)):\n",
"\t\tword_arr = np.hstack((word_arr, np.zeros((16,2), dtype = int)))\n",
"\t\tword_arr = np.hstack((word_arr, img_arr[i]))\n",
"\n",
"\tword_arr*=255\n",
"\tword_arr = word_arr[:,8:]\t\t# first 8 cols are zeros, 2 col padding at start and end\n",
"\n",
"\t_save_img_in_folder(dir_path, img_num, word_arr)\n",
"\n",
"\n",
"def _write_word_arr_in_dir(img_num, img_arr, dir_path):\n",
"\tfile_name = _get_file_name(img_num, \"txt\")\n",
"\n",
"\tfile_path = os.path.join(dir_path, file_name)\n",
"\n",
"\timg_arr = img_arr[1:]\n",
"\n",
"\twith open(file_path, \"w\") as f:\n",
"\t\tfor img in img_arr:\n",
"\t\t\tline = \" \".join([`num` for num in img]) + \"\\n\"\n",
"\t\t\tf.write(line)\n",
"\n",
"def save_word_to_dir(word_arr, word_num, dir_path):\n",
"\t_draw_and_save_word_in_dir(word_num, word_arr, dir_path)\n",
"\t_write_word_arr_in_dir(word_num, word_arr, dir_path)\n",
"\n",
"\n",
"def get_char_img_len(char_to_img_arr_dict):\n",
"\tlist_of_a_variants = list(char_to_img_arr_dict['a'])\n",
"\n",
"\treturn len(list_of_a_variants[0])\n",
"\n",
"\n",
"def get_random_variant_of_char(char_to_img_arr_dict, char):\n",
"\tchar_variants_list = list(char_to_img_arr_dict[char])\n",
"\tnum_variants = len(char_variants_list)\n",
"\n",
"\t#print \"Char : \" + char\n",
"\t#rint \"Number of variants : \" + str(num_variants)\n",
"\n",
"\trandom_variant_idx = np.random.choice(num_variants)\n",
"\n",
"\trandom_variant = char_variants_list[random_variant_idx]\n",
"\n",
"\treturn np.array(random_variant)\n",
"\n",
"\n",
"def save_word_imgs_to_dir(words, dir_path, char_to_img_arr_dict):\n",
"\tchar_feature_len = get_char_img_len(char_to_img_arr_dict)\n",
"\n",
"\t#print \"last word \" + str(words[-1])\n",
"\n",
"\tfor word_num, word in enumerate(words):\n",
"\t\t#print \"Writing \" + str(word) + \" to file\"\n",
"\t\tword_arr = np.zeros((1,char_feature_len), dtype = int)\n",
"\n",
"\t\tfor char in word:\n",
"\t\t\tvariant_of_char = get_random_variant_of_char(char_to_img_arr_dict, char)\n",
"\t\t\tword_arr = np.vstack((word_arr, variant_of_char))\n",
"\n",
"\t\tsave_word_to_dir(word_arr, word_num, dir_path)\n",
"\t\t#print \"Write successful!\"\n",
"\n",
"\n",
"def write_words_list_to_dir(words, file_path):\n",
"\twith open(file_path, 'w') as word_file:\n",
"\t\tfor word_num, word in enumerate(words):\n",
"\t\t\tword_file.write(str(word_num) + \" \" + word + \"\\n\")\n",
"\n",
"\n",
"def filter_words_for_stratification(words_list, test_split):\n",
"\tlen_words_list = map(lambda x : len(x), words_list)\n",
"\n",
"\tmin_words_req_per_len = int(np.ceil(test_split * 10))\n",
"\n",
"\tlen_words_counter = Counter(len_words_list)\n",
"\n",
"\tprint \"Len words counter \" + str(len_words_counter)\n",
"\n",
"\tnot_allowed_lengths = [x[0] for x in len_words_counter.items() if x[1] < min_words_req_per_len]\n",
"\n",
"\tprint \"Not allowed lengths \" + str(not_allowed_lengths)\n",
"\n",
"\tlen_words_list = np.array(len_words_list)\n",
"\n",
"\tmask = np.zeros(len(words_list), dtype=bool)\n",
"\n",
"\tfor n_length in not_allowed_lengths:\n",
"\t\tn_len_mask = (len_words_list == n_length)\n",
"\t\tmask = np.ma.mask_or(mask, n_len_mask)\n",
"\n",
"\tinverted_mask = np.invert(mask)\n",
"\n",
"\treturn words_list[inverted_mask]\n",
"\n",
"\n",
"def create_data_set(words_list, char_to_img_arr_dict, test_split):\n",
"\twords_list = filter_words_for_stratification(words_list, test_split)\n",
"\n",
"\tprint \"Number of tokens after filtering \" + str(len(words_list))\n",
"\n",
"\tlen_words_list = map(lambda x : len(x), words_list)\n",
"\n",
"\twords_train, words_test, _ , _ = train_test_split(words_list, len_words_list, test_size = test_split, shuffle = True, stratify = len_words_list)\n",
"\t\n",
"\tdata_dir = get_dir('data')\n",
"\n",
"\ttrain_dir_path = get_dir('data/train_words')\n",
"\ttest_dir_path = get_dir('data/test_words')\n",
"\n",
"\ttrain_words_file_path = os.path.join(data_dir, 'train_words.txt')\n",
"\ttest_words_file_path = os.path.join(data_dir, 'test_words.txt')\n",
"\n",
"\tsave_word_imgs_to_dir(words_train, train_dir_path, char_to_img_arr_dict)\n",
"\tsave_word_imgs_to_dir(words_test, test_dir_path, char_to_img_arr_dict)\n",
"\n",
"\twrite_words_list_to_dir(words_train, train_words_file_path)\n",
"\twrite_words_list_to_dir(words_test, test_words_file_path)"
]
},
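{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy illustration of the stratification filter (hypothetical words, not part of the real pipeline): with `test_split = 0.2`, at least `ceil(0.2 * 10) = 2` words are required per length, so any length that occurs only once is dropped before the stratified split."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 'elephant' is the only length-8 word, so it should be filtered out\n",
"toy_words = np.array(['cat', 'dog', 'bird', 'fish', 'elephant'])\n",
"print filter_words_for_stratification(toy_words, 0.2)"
]
},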
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Running the data preparation script"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package gutenberg to /Users/ady/nltk_data...\n",
"[nltk_data] Package gutenberg is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of extracted tokens 18862\n",
"Len words counter Counter({7: 3005, 6: 2941, 8: 2708, 5: 2393, 9: 2174, 4: 1701, 10: 1440, 11: 881, 3: 589, 12: 550, 13: 286, 14: 122, 15: 48, 16: 13, 17: 9, 18: 1, 20: 1})\n",
"Not allowed lengths [18, 20]\n",
"Number of tokens after filtering 18860\n",
"Dataset preparation successful.. Check ./data\n"
]
}
],
"source": [
"init_gutenberg()\n",
"\n",
"mb_dick_tokens = get_book_tokens_gt_len('melville-moby_dick.txt', 3)\n",
"\n",
"print \"Number of extracted tokens \" + str(len(mb_dick_tokens))\n",
"\n",
"with open(\"letter_variants.npy\") as f:\n",
" char_to_img_arr_dict = pickle.load(f)\n",
"\n",
"create_data_set(mb_dick_tokens, char_to_img_arr_dict, 0.2)\n",
"\n",
"print \"Dataset preparation successful.. Check ./data\""
]
}
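,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After the run above, `./data` should contain `train_words/` and `test_words/` (one `img_<n>.png` plus one `img_<n>.txt` per token) and the label files `train_words.txt` and `test_words.txt` mapping image numbers to words. A minimal spot check (assuming `img_0.png` was generated) reloads one training image:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# spot check: reload the first generated training image\n",
"img = plt.imread(os.path.join('data', 'train_words', 'img_0.png'))\n",
"print \"Image shape : \" + str(img.shape)\n",
"plt.imshow(img, cmap='gray')\n",
"plt.show()"
]
}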
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}