Last active
October 31, 2021 16:40
-
-
Save femioladeji/d250a0e9ff6d05a9e518bf6facd04d49 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import numpy as np | |
from sklearn.svm import SVC | |
from sklearn.model_selection import cross_val_score | |
from sklearn.externals import joblib | |
from skimage.io import imread | |
from skimage.filters import threshold_otsu | |
# Character classes the classifier can predict: the ten digits plus the
# uppercase alphabet without 'I' and 'O', which licence plates omit
# because they are easily confused with '1' and '0'.
letters = list('0123456789ABCDEFGHJKLMNPQRSTUVWXYZ')
def read_training_data(training_directory, samples_per_letter=10):
    """Load the per-character training images as flat feature vectors.

    Expects ``training_directory`` to contain one sub-directory per
    character in ``letters``, each holding images named
    ``<letter>_<index>.jpg`` for index 0 .. samples_per_letter - 1.

    Parameters
    ----------
    training_directory : str
        Root directory of the training images.
    samples_per_letter : int, optional
        Number of sample images to read per character (default 10,
        matching the original hard-coded dataset layout).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The flattened binary image features and the matching
        character labels.
    """
    image_data = []
    target_data = []
    for each_letter in letters:
        for each in range(samples_per_letter):
            image_path = os.path.join(
                training_directory, each_letter,
                each_letter + '_' + str(each) + '.jpg')
            # read each image of each character in grayscale
            img_details = imread(image_path, as_gray=True)
            # binarise: pixels darker than the Otsu threshold become True
            binary_image = img_details < threshold_otsu(img_details)
            # the 2D array of each image is flattened because the
            # classifier requires each sample to be a 1D array; a 20*20
            # image therefore becomes 400 features, one per pixel
            flat_bin_image = binary_image.reshape(-1)
            image_data.append(flat_bin_image)
            target_data.append(each_letter)
    return (np.array(image_data), np.array(target_data))
def cross_validation(model, num_of_fold, train_data, train_label):
    """Print the k-fold cross-validation accuracy of ``model``.

    ``num_of_fold`` selects the validation scheme: with 4, the data is
    split into four parts, each used once for testing while the
    remaining three quarters train the model.
    """
    scores = cross_val_score(model, train_data, train_label,
                             cv=num_of_fold)
    print("Cross Validation Result for ", str(num_of_fold), " -fold")
    print(scores * 100)
# Resolve paths relative to this script's location.
# NOTE(review): __file__ is undefined in interactive environments such as
# Jupyter — run this as a script, or set current_dir to an absolute path.
current_dir = os.path.dirname(os.path.realpath(__file__))
training_dataset_dir = os.path.join(current_dir, 'train')

image_data, target_data = read_training_data(training_dataset_dir)

# the kernel can be 'linear', 'poly' or 'rbf'; probability=True makes the
# trained model able to report how sure it is of each prediction
svc_model = SVC(kernel='linear', probability=True)
cross_validation(svc_model, 4, image_data, target_data)

# let's train the model with all the input data
svc_model.fit(image_data, target_data)

# persist the model with joblib so the next prediction run does not need
# to retrain. NOTE(review): `from sklearn.externals import joblib` was
# removed in scikit-learn 0.23+; use `import joblib` on modern versions.
# os.path.join avoids the hard-coded '/' (and the double slash the
# original 'models/svc/' + '/svc.pkl' concatenation produced).
save_directory = os.path.join(current_dir, 'models', 'svc')
# exist_ok avoids the check-then-create race of the original exists() test
os.makedirs(save_directory, exist_ok=True)
joblib.dump(svc_model, os.path.join(save_directory, 'svc.pkl'))
@amzad21115 Are you running it with Jupyter? If so, you'll need to specify the full path — I think running os.path.dirname
on __file__ in a Jupyter environment throws an error, because __file__ is undefined there.
image_data, target_data = read_training_data(training_dataset_dir)
Line 53 shows an error. Can anybody help, please?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
current_dir = os.path.dirname(os.path.realpath(__file__))
This line shows an error. How can I solve it?