-
-
Save gerwim/1247a691f20a1b524784c2d73caada02 to your computer and use it in GitHub Desktop.
Machine learning Python example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install autokeras==1.0.16 tensorflow==2.4.3 keras-tuner==1.0.3 | |
import nltk | |
nltk.download('stopwords') | |
from nltk.corpus import stopwords | |
STOPWORDS = set(stopwords.words('dutch')) | |
import re | |
def preprocess_text(sen, stop_words=None):
    """Normalise a product description for the text classifier.

    The text is reduced to lower-case ASCII words: every character outside
    ``a-z``/``A-Z`` becomes a separator, stop words and one-letter tokens are
    dropped, and the remaining words are joined with single spaces.

    Filtering whole tokens (instead of replacing ``' word '`` substrings)
    also removes stop words that are capitalised, that sit at the start or
    end of the text, that touch punctuation, or that directly follow another
    stop word -- all of which the substring approach left behind.

    Args:
        sen: Raw input text.
        stop_words: Optional collection of lower-case words to remove.
            Defaults to the module-level ``STOPWORDS``.

    Returns:
        The cleaned text without leading or trailing whitespace; an empty
        string when nothing is left.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    # Punctuation, digits and non-ASCII letters all act as word separators.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', sen)

    # Lower-case before comparing so that "De" matches the stop word "de".
    tokens = letters_only.lower().split()

    kept = [
        token for token in tokens
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept)
import csv | |
from pathlib import Path | |
MAIN_CATEGORY="main" # in my case MAIN_CATEGORY was part of an array, but for sake of simplicity I've removed the array | |
targetDir = Path(f"drive/MyDrive/Models/{MAIN_CATEGORY}") | |
if targetDir.exists(): | |
print(f"Skipping category {MAIN_CATEGORY} because it already exists") | |
continue | |
print(f"Starting training category {MAIN_CATEGORY}") | |
!wget "https://mystorage.endpoint/public/deeplearning/"$MAIN_CATEGORY".csv" | |
# Product texts and their category labels, filled while reading the CSV.
products = []
categories = []

# Clean data
import shutil

# Remove the 'text_classifier' folder if one is present.
# NOTE(review): presumably the AutoKeras project directory of an earlier
# run -- confirm.
try:
    shutil.rmtree('text_classifier')
except FileNotFoundError:
    # Only a missing folder is expected here; any other failure
    # (permissions, a plain file in the way, ...) should surface.
    print("No folder text_classifier found")
# Load the downloaded CSV: column 0 is the product text, column 1 its category.
# newline='' is what the csv module asks for, so that it can handle line
# endings inside quoted fields itself.
# NOTE(review): the file is assumed to be UTF-8 -- confirm against the export.
with open(f"{MAIN_CATEGORY}.csv", 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\\')
    # Skip the first row (presumably a header); tolerate an empty file.
    next(reader, None)
    for row in reader:
        # Blank lines come back as [] and truncated lines as a single field;
        # skip them instead of failing with an IndexError.
        if len(row) < 2:
            continue
        product, category = row[0], row[1]
        categories.append(category)
        products.append(preprocess_text(product))
import numpy as np
from sklearn.utils import shuffle

# Report how many distinct labels the data set contains.
uniqueCategories = set(categories)
print(f"Unique categories: {len(uniqueCategories)}")

# Shuffle texts and labels together so every text keeps its own label.
products, categories = shuffle(np.array(products), np.array(categories))

# Split point: the first part is used for training, the rest for testing.
training_portion = 0.8  # train on 80% of the data
train_size = int(len(products) * training_portion)

train_products, test_products = products[:train_size], products[train_size:]
train_categories, test_categories = categories[:train_size], categories[train_size:]

# Arrays handed to the classifier.
x_train, y_train = np.array(train_products), np.array(train_categories)
x_test, y_test = np.array(test_products), np.array(test_categories)

# Quick sanity check: both shapes, then the start of the first training text.
print(x_train.shape, y_train.shape, sep="\n")
print(x_train[0][:50])
import autokeras as ak
import kerastuner

# The search should maximise accuracy on the validation data.
search_objective = kerastuner.Objective('val_accuracy', direction='max')

# Set up the text classifier search: at most 15 trials, overwriting any
# existing project.
clf = ak.TextClassifier(
    max_trials=15,
    objective=search_objective,
    overwrite=True,
)

# Run the search on the training data; 15% of it is held back for validation.
clf.fit(x_train, y_train, validation_split=0.15)

# Predict the held-out test texts with the best model found.
predicted_y = clf.predict(x_test)

# Print the best model's evaluation result on the test data.
print(clf.evaluate(x_test, y_test))
import shutil
from pathlib import Path

# Save the model to disk.
model = clf.export_model()
try:
    # Preferred format: a TensorFlow SavedModel written to the directory "1".
    model.save("1", save_format="tf")
    saved_artifact = "1"
except Exception as save_error:
    # Fall back to HDF5, but report why instead of hiding the error (and
    # without swallowing KeyboardInterrupt, as a bare `except:` would).
    print(f"SavedModel export failed ({save_error!r}); falling back to HDF5")
    saved_artifact = f"{MAIN_CATEGORY}.h5"
    model.save(saved_artifact)

# Move model to Drive folder
# Whatever was actually written -- the "1" directory or the .h5 file -- is
# what gets moved; moving only "1" would leave an HDF5 fallback behind.
staging_dir = Path(MAIN_CATEGORY)
staging_dir.mkdir(exist_ok=True)
shutil.move(saved_artifact, str(staging_dir))

# Make sure the destination folder exists so the move cannot fail (or rename
# the staging folder to "Models") on a first run.
models_root = Path("drive/MyDrive/Models")
models_root.mkdir(parents=True, exist_ok=True)
shutil.move(str(staging_dir), str(models_root / staging_dir.name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment