gerwim/example.py Secret

## example.py
# Notebook shell escape: install the pinned training dependencies.
!pip install autokeras==1.0.16 tensorflow==2.4.3 keras-tuner==1.0.3

import nltk
# Download the NLTK 'stopwords' corpus that is read just below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Dutch stopwords held as a set; consumed by preprocess_text.
STOPWORDS = set(stopwords.words('dutch'))

import re

def preprocess_text(sen, stopwords=None):
    """Clean a raw text string before it is used as classifier input.

    Steps, in order:

    1. Drop whitespace-delimited tokens that are stopwords (compared
       case-insensitively).
    2. Replace every character outside ``a-z``/``A-Z`` with a space. This
       also strips digits, punctuation and accented letters.
    3. Drop single-letter tokens and collapse whitespace.
    4. Lower-case the result.

    Args:
        sen: The raw text.
        stopwords: Optional collection of lower-case words to remove.
            Defaults to the module-level ``STOPWORDS``.

    Returns:
        The cleaned text: lower-case words joined by single spaces, with
        no leading or trailing whitespace.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Filter whole tokens instead of replacing ' word ' substrings: the
    # substring approach missed capitalised stopwords, stopwords at the
    # start or end of the text, and the same stopword repeated back to back.
    kept = [token for token in sen.split() if token.lower() not in stopwords]

    # Remove punctuation and numbers.
    letters_only = re.sub('[^a-zA-Z]', ' ', ' '.join(kept))

    # split() collapses whitespace runs; filtering by length removes every
    # single-letter token, including adjacent ones and those at the edges.
    words = [word for word in letters_only.split() if len(word) > 1]

    return ' '.join(words).lower()

import csv
from pathlib import Path

# The original notebook took MAIN_CATEGORY from a list of categories; a
# single value is used here for simplicity.
MAIN_CATEGORY = "main"
# Folder in which the trained model for this category is stored.
targetDir = Path(f"drive/MyDrive/Models/{MAIN_CATEGORY}")
if targetDir.exists():
    print(f"Skipping category {MAIN_CATEGORY} because it already exists")
    # `continue` is only legal inside a loop; with a single category the
    # equivalent of "skip to the next one" is to stop the run here.
    raise SystemExit

# Announce which category is about to be trained.
print(f"Starting training category {MAIN_CATEGORY}")
# Notebook shell escape: download "<MAIN_CATEGORY>.csv" into the working
# directory; $MAIN_CATEGORY is filled in from the Python variable.
!wget "https://mystorage.endpoint/public/deeplearning/"$MAIN_CATEGORY".csv"
# Parallel lists filled while reading the CSV: products[i] is the text
# whose label is categories[i].
products = []
categories = []

# Clean data: remove the 'text_classifier' folder left by an earlier run
# (presumably the AutoKeras project directory - TODO confirm).
import shutil
try:
    shutil.rmtree('text_classifier')
except FileNotFoundError:
    print("No folder text_classifier found")
except OSError as err:
    # Still best-effort, but do not report other failures as "not found".
    print(f"Could not remove folder text_classifier: {err}")

# Read "<MAIN_CATEGORY>.csv": column 0 is the product text, column 1 its
# category, and fields are separated by a backslash.
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects.
with open(f"{MAIN_CATEGORY}.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\\')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    for row in reader:
        if len(row) < 2:
            # Blank or malformed line: there is no (text, category) pair.
            continue
        product, category = row[0], row[1]
        categories.append(category)
        products.append(preprocess_text(product))

# Report how many distinct category labels were read.
uniqueCategories = {*categories}
print(f"Unique categories: {len(uniqueCategories)}")

# Convert both columns to arrays, then shuffle them in a single call so
# texts and labels are permuted together.
import numpy as np
from sklearn.utils import shuffle
products = np.array(products)
categories = np.array(categories)
products, categories = shuffle(products, categories)

# Split the shuffled data: the first 80% is used for training and the
# remainder is held out for testing.
training_portion = 0.8
train_size = int(training_portion * len(products))

train_products, test_products = products[:train_size], products[train_size:]
train_categories, test_categories = categories[:train_size], categories[train_size:]

x_train, y_train = np.array(train_products), np.array(train_categories)
x_test, y_test = np.array(test_products), np.array(test_categories)

# Sanity check: both training shapes, then the first 50 characters of the
# first training text.
print(x_train.shape, y_train.shape, sep="\n")
print(x_train[0][:50])

import autokeras as ak
import kerastuner

# Set up the AutoKeras text classifier: at most 15 trials, ranked by
# validation accuracy (higher is better).
clf = ak.TextClassifier(
    objective=kerastuner.Objective('val_accuracy', direction='max'),
    max_trials=15,
    overwrite=True,
)

# Run the search on the training data; 15% of it is set aside for validation.
clf.fit(x=x_train, y=y_train, validation_split=0.15)

# Predictions of the best model for the held-out texts.
predicted_y = clf.predict(x_test)

# Score the best model on the held-out test data and show the result.
print(clf.evaluate(x_test, y_test))

# Save the model to disk.
import shutil
from pathlib import Path

model = clf.export_model()
# Preferred format is a TensorFlow SavedModel directory named "1".
saved_model = Path("1")
try:
    model.save(str(saved_model), save_format="tf")
except Exception:
    # SavedModel export failed: fall back to a single HDF5 file and
    # remember its name so the move below picks up the right artifact.
    saved_model = Path(f"{MAIN_CATEGORY}.h5")
    model.save(str(saved_model))

# Move model to Drive folder: drive/MyDrive/Models/<MAIN_CATEGORY>/<artifact>.
# Moving whichever artifact was written keeps the category folder from
# ending up empty, which would make later runs skip the category.
model_dir = Path("drive/MyDrive/Models") / MAIN_CATEGORY
model_dir.mkdir(parents=True, exist_ok=True)
shutil.move(str(saved_model), str(model_dir / saved_model.name))
	# Notebook shell escape: install the pinned training dependencies.
	!pip install autokeras==1.0.16 tensorflow==2.4.3 keras-tuner==1.0.3

	import nltk
	# Download the NLTK 'stopwords' corpus that is read just below.
	nltk.download('stopwords')
	from nltk.corpus import stopwords

	# Dutch stopwords held as a set; consumed by preprocess_text.
	STOPWORDS = set(stopwords.words('dutch'))

	import re

	def preprocess_text(sen, stopwords=None):
		"""Clean a raw text string before it is used as classifier input.

		Steps, in order:

		1. Drop whitespace-delimited tokens that are stopwords (compared
		   case-insensitively).
		2. Replace every character outside ``a-z``/``A-Z`` with a space.
		   This also strips digits, punctuation and accented letters.
		3. Drop single-letter tokens and collapse whitespace.
		4. Lower-case the result.

		Args:
			sen: The raw text.
			stopwords: Optional collection of lower-case words to remove.
				Defaults to the module-level ``STOPWORDS``.

		Returns:
			The cleaned text: lower-case words joined by single spaces,
			with no leading or trailing whitespace.
		"""
		if stopwords is None:
			stopwords = STOPWORDS

		# Filter whole tokens instead of replacing ' word ' substrings: the
		# substring approach missed capitalised stopwords, stopwords at the
		# start or end of the text, and the same stopword repeated back to back.
		kept = [token for token in sen.split() if token.lower() not in stopwords]

		# Remove punctuation and numbers.
		letters_only = re.sub('[^a-zA-Z]', ' ', ' '.join(kept))

		# split() collapses whitespace runs; filtering by length removes every
		# single-letter token, including adjacent ones and those at the edges.
		words = [word for word in letters_only.split() if len(word) > 1]

		return ' '.join(words).lower()

	import csv
	from pathlib import Path

	# The original notebook took MAIN_CATEGORY from a list of categories; a
	# single value is used here for simplicity.
	MAIN_CATEGORY = "main"
	# Folder in which the trained model for this category is stored.
	targetDir = Path(f"drive/MyDrive/Models/{MAIN_CATEGORY}")
	if targetDir.exists():
		print(f"Skipping category {MAIN_CATEGORY} because it already exists")
		# `continue` is only legal inside a loop; with a single category the
		# equivalent of "skip to the next one" is to stop the run here.
		raise SystemExit

	# Announce which category is about to be trained.
	print(f"Starting training category {MAIN_CATEGORY}")
	# Notebook shell escape: download "<MAIN_CATEGORY>.csv" into the working
	# directory; $MAIN_CATEGORY is filled in from the Python variable.
	!wget "https://mystorage.endpoint/public/deeplearning/"$MAIN_CATEGORY".csv"
	# Parallel lists filled while reading the CSV: products[i] is the text
	# whose label is categories[i].
	products = []
	categories = []

	# Clean data: remove the 'text_classifier' folder left by an earlier run
	# (presumably the AutoKeras project directory - TODO confirm).
	import shutil
	try:
		shutil.rmtree('text_classifier')
	except FileNotFoundError:
		print("No folder text_classifier found")
	except OSError as err:
		# Still best-effort, but do not report other failures as "not found".
		print(f"Could not remove folder text_classifier: {err}")

	# Read "<MAIN_CATEGORY>.csv": column 0 is the product text, column 1 its
	# category, and fields are separated by a backslash.
	# newline='' lets the csv module do its own newline handling, as its
	# documentation requires for file objects.
	with open(f"{MAIN_CATEGORY}.csv", 'r', newline='') as csvfile:
		reader = csv.reader(csvfile, delimiter='\\')
		# Skip the header row; the default keeps an empty file from raising
		# StopIteration.
		next(reader, None)
		for row in reader:
			if len(row) < 2:
				# Blank or malformed line: there is no (text, category) pair.
				continue
			product, category = row[0], row[1]
			categories.append(category)
			products.append(preprocess_text(product))

	# Report how many distinct category labels were read.
	uniqueCategories = {*categories}
	print(f"Unique categories: {len(uniqueCategories)}")

	# Convert both columns to arrays, then shuffle them in a single call so
	# texts and labels are permuted together.
	import numpy as np
	from sklearn.utils import shuffle
	products = np.array(products)
	categories = np.array(categories)
	products, categories = shuffle(products, categories)

	# Split the shuffled data: the first 80% is used for training and the
	# remainder is held out for testing.
	training_portion = 0.8
	train_size = int(training_portion * len(products))

	train_products, test_products = products[:train_size], products[train_size:]
	train_categories, test_categories = categories[:train_size], categories[train_size:]

	x_train, y_train = np.array(train_products), np.array(train_categories)
	x_test, y_test = np.array(test_products), np.array(test_categories)

	# Sanity check: both training shapes, then the first 50 characters of the
	# first training text.
	print(x_train.shape, y_train.shape, sep="\n")
	print(x_train[0][:50])

	import autokeras as ak
	import kerastuner

	# Set up the AutoKeras text classifier: at most 15 trials, ranked by
	# validation accuracy (higher is better).
	clf = ak.TextClassifier(
		objective=kerastuner.Objective('val_accuracy', direction='max'),
		max_trials=15,
		overwrite=True,
	)

	# Run the search on the training data; 15% of it is set aside for validation.
	clf.fit(x=x_train, y=y_train, validation_split=0.15)

	# Predictions of the best model for the held-out texts.
	predicted_y = clf.predict(x_test)

	# Score the best model on the held-out test data and show the result.
	print(clf.evaluate(x_test, y_test))

	# Save the model to disk.
	import shutil
	from pathlib import Path

	model = clf.export_model()
	# Preferred format is a TensorFlow SavedModel directory named "1".
	saved_model = Path("1")
	try:
		model.save(str(saved_model), save_format="tf")
	except Exception:
		# SavedModel export failed: fall back to a single HDF5 file and
		# remember its name so the move below picks up the right artifact.
		saved_model = Path(f"{MAIN_CATEGORY}.h5")
		model.save(str(saved_model))

	# Move model to Drive folder: drive/MyDrive/Models/<MAIN_CATEGORY>/<artifact>.
	# Moving whichever artifact was written keeps the category folder from
	# ending up empty, which would make later runs skip the category.
	model_dir = Path("drive/MyDrive/Models") / MAIN_CATEGORY
	model_dir.mkdir(parents=True, exist_ok=True)
	shutil.move(str(saved_model), str(model_dir / saved_model.name))