Skip to content

Instantly share code, notes, and snippets.

@gerwim
Created January 29, 2022 15:27
Show Gist options
  • Save gerwim/1247a691f20a1b524784c2d73caada02 to your computer and use it in GitHub Desktop.
Save gerwim/1247a691f20a1b524784c2d73caada02 to your computer and use it in GitHub Desktop.
Machine learning python example
# Install pinned versions of AutoKeras, TensorFlow and KerasTuner.
# The leading "!" runs the line as a shell command in an IPython/Colab notebook.
!pip install autokeras==1.0.16 tensorflow==2.4.3 keras-tuner==1.0.3
import nltk
# Download the NLTK stopword corpus so stopwords.words() below can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Dutch stopwords, held in a set; preprocess_text() reads this constant.
STOPWORDS = set(stopwords.words('dutch'))
import re
def preprocess_text(sen, stopwords=None):
    """Normalise a piece of text before it is fed to the classifier.

    The text is reduced to lowercase ASCII words: every character that is
    not in ``a-z``/``A-Z`` (digits, punctuation, accented letters, ...) acts
    as a word separator, after which stopwords and single-letter tokens are
    dropped.

    Args:
        sen: Raw input text.
        stopwords: Optional collection of lowercase words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The remaining words, lowercased and joined by single spaces, with
        no leading or trailing whitespace. An empty string is returned when
        nothing remains.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Strip punctuation and digits first so a stopword glued to punctuation
    # ("de," or "(het)") is seen as a plain word by the filter below.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', sen)

    # Lowercase before the stopword test so capitalised stopwords ("De",
    # "HET") are matched; split() also collapses runs of whitespace and
    # handles words at the very start or end of the text.
    words = letters_only.lower().split()

    # Token-wise filtering removes every single-letter token, including
    # consecutive ones ("a b c"), which a whitespace-anchored regex misses
    # because neighbouring matches would have to share a space.
    kept = [word for word in words if len(word) > 1 and word not in stopwords]
    return ' '.join(kept)
import csv
from pathlib import Path
MAIN_CATEGORY="main" # in my case MAIN_CATEGORY was part of an array, but for sake of simplicity I've removed the array
targetDir = Path(f"drive/MyDrive/Models/{MAIN_CATEGORY}")
if targetDir.exists():
print(f"Skipping category {MAIN_CATEGORY} because it already exists")
continue
print(f"Starting training category {MAIN_CATEGORY}")
!wget "https://mystorage.endpoint/public/deeplearning/"$MAIN_CATEGORY".csv"
products = []
categories = []
# Clean data
import shutil
try:
shutil.rmtree('text_classifier')
except:
print("No folder text_classifier found")
with open(f"{MAIN_CATEGORY}.csv", 'r') as csvfile:
reader = csv.reader(csvfile, delimiter='\\')
next(reader)
for row in reader:
category = row[1]
categories.append(category)
product = row[0]
product = preprocess_text(product)
products.append(product)
# Print unique categories
uniqueCategories = set(categories)
print(f"Unique categories: {len(uniqueCategories)}")
# Shuffle products
from sklearn.utils import shuffle
import numpy as np
products, categories = shuffle(np.array(products), np.array(categories))
# Create Training and Validation Set
training_portion = 0.8 # train on 80% of the data
train_size = int(len(products) * training_portion)
train_products = products[:train_size]
train_categories = categories[:train_size]
test_products = products[train_size:]
test_categories = categories[train_size:]
x_train = np.array(train_products)
y_train = np.array(train_categories)
x_test = np.array(test_products)
y_test = np.array(test_categories)
print(x_train.shape)
print(y_train.shape)
print(x_train[0][:50])
import autokeras as ak
import kerastuner
# Initialize the text classifier.
clf = ak.TextClassifier(
overwrite=True,
max_trials=15,
objective=kerastuner.Objective('val_accuracy', direction='max'),
)
# Feed the text classifier with training data.
clf.fit(x_train, y_train, validation_split=0.15)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))
# Save the model to disk.
model = clf.export_model()
try:
model.save("1", save_format="tf")
except:
model.save(f"{MAIN_CATEGORY}.h5")
# Move model to Drive folder
!mkdir $MAIN_CATEGORY
!mv 1 $MAIN_CATEGORY
!mv $MAIN_CATEGORY drive/MyDrive/Models/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment