@dougdroper · Last active October 29, 2020
TensorFlow model to classify SMS requests

A Ruby script labels raw SMS messages as positive/negative training examples, a Python script trains a TensorFlow binary text classifier on them, the trained model is converted to TensorFlow.js format (the model.json below), and a small React page loads it in the browser for inference.
require 'csv'
require 'fileutils'

class Classifier
  attr_reader :csv
  attr_accessor :results

  def initialize(csv)
    @csv = csv
    @results = {true: [], false: []}
  end

  # Label each message in the CSV: positive if it reads like a request
  # for a new kit/test, negative otherwise.
  def parse
    CSV.read(csv).each do |e|
      if e[1] =~ /new\s?(blood)?\s?(kit|test)/i
        results[:true] << e[1]
      elsif e[1] =~ /send a new (HIV )?test kit/i
        results[:true] << e[1]
      elsif e[1] =~ /send a new one/i
        results[:true] << e[1]
      elsif e[1] =~ /request a new one/i
        results[:true] << e[1]
      else
        results[:false] << e[1]
      end
    end
    nil
  end

  def directory
    Dir.home # File.open does not expand '~'
  end

  # Write one message per file, one subdirectory per class: the layout
  # tf.keras.preprocessing.text_dataset_from_directory expects.
  def print
    FileUtils.mkdir_p("#{directory}/train/pos")
    FileUtils.mkdir_p("#{directory}/train/neg")
    results[:true].each_with_index do |e, i|
      File.open("#{directory}/train/pos/#{i}_1.txt", 'w') { |f| f.write(e) }
    end
    results[:false].each_with_index do |e, i|
      File.open("#{directory}/train/neg/#{i}_2.txt", 'w') { |f| f.write(e) }
    end
    puts results[:true].count
    puts results[:false].count
  end
end
c = Classifier.new('messages.csv')
c.parse
c.print
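The script writes one message per file, one subdirectory per class, which is the layout tf.keras.preprocessing.text_dataset_from_directory expects. A minimal sketch for sanity-checking that layout in Python, assuming the files were written under ~/train as above:

import os

train_dir = os.path.expanduser('~/train')
for label in ('pos', 'neg'):
    # count the per-message files the Ruby script wrote for this class
    count = len(os.listdir(os.path.join(train_dir, label)))
    print(label, count, 'files')

The model.json that follows is the TensorFlow.js conversion of the model trained on this data.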
{
  "format": "layers-model",
  "generatedBy": "keras v2.4.0",
  "convertedBy": "TensorFlow.js Converter v2.7.0",
  "modelTopology": {
    "keras_version": "2.4.0",
    "backend": "tensorflow",
    "model_config": {
      "class_name": "Sequential",
      "config": {
        "name": "sequential",
        "layers": [
          {
            "class_name": "InputLayer",
            "config": {
              "batch_input_shape": [null, null],
              "dtype": "float32",
              "sparse": false,
              "ragged": false,
              "name": "embedding_input"
            }
          },
          {
            "class_name": "Embedding",
            "config": {
              "name": "embedding",
              "trainable": true,
              "batch_input_shape": [null, null],
              "dtype": "float32",
              "input_dim": 10001,
              "output_dim": 16,
              "embeddings_initializer": {
                "class_name": "RandomUniform",
                "config": {"minval": -0.05, "maxval": 0.05, "seed": null}
              },
              "embeddings_regularizer": null,
              "activity_regularizer": null,
              "embeddings_constraint": null,
              "mask_zero": false,
              "input_length": null
            }
          },
          {
            "class_name": "Dropout",
            "config": {
              "name": "dropout",
              "trainable": true,
              "dtype": "float32",
              "rate": 0.2,
              "noise_shape": null,
              "seed": null
            }
          },
          {
            "class_name": "GlobalAveragePooling1D",
            "config": {
              "name": "global_average_pooling1d",
              "trainable": true,
              "dtype": "float32",
              "data_format": "channels_last"
            }
          },
          {
            "class_name": "Dropout",
            "config": {
              "name": "dropout_1",
              "trainable": true,
              "dtype": "float32",
              "rate": 0.2,
              "noise_shape": null,
              "seed": null
            }
          },
          {
            "class_name": "Dense",
            "config": {
              "name": "dense",
              "trainable": true,
              "dtype": "float32",
              "units": 1,
              "activation": "linear",
              "use_bias": true,
              "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}},
              "bias_initializer": {"class_name": "Zeros", "config": null},
              "kernel_regularizer": null,
              "bias_regularizer": null,
              "activity_regularizer": null,
              "kernel_constraint": null,
              "bias_constraint": null
            }
          }
        ]
      }
    },
    "training_config": {
      "loss": {
        "class_name": "BinaryCrossentropy",
        "config": {
          "reduction": "auto",
          "name": "binary_crossentropy",
          "from_logits": true,
          "label_smoothing": 0
        }
      },
      "metrics": {
        "class_name": "BinaryAccuracy",
        "config": {"name": "binary_accuracy", "dtype": "float32", "threshold": 0.0}
      },
      "weighted_metrics": null,
      "loss_weights": null,
      "optimizer_config": {
        "class_name": "Adam",
        "config": {
          "name": "Adam",
          "learning_rate": 0.0010000000474974513,
          "decay": 0.0,
          "beta_1": 0.8999999761581421,
          "beta_2": 0.9990000128746033,
          "epsilon": 1e-07,
          "amsgrad": false
        }
      }
    }
  },
  "weightsManifest": [
    {
      "paths": ["group1-shard1of1.bin"],
      "weights": [
        {"name": "dense/kernel", "shape": [16, 1], "dtype": "float32"},
        {"name": "dense/bias", "shape": [1], "dtype": "float32"},
        {"name": "embedding/embeddings", "shape": [10001, 16], "dtype": "float32"}
      ]
    }
  ]
}
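The manifest above ("convertedBy": "TensorFlow.js Converter v2.7.0") is what the converter emits alongside a binary weights shard. One way to produce it is the tensorflowjs Python package; a sketch, where the Sequential stack is a stand-in with the same shapes as the trained model from the training script below (in practice you would pass that model directly):

import tensorflow as tf
import tensorflowjs as tfjs  # pip install tensorflowjs

# Stand-in matching the trained model's architecture.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10001, 16),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1),
])
model.build(input_shape=(None, None))

# Writes model.json plus group1-shard1of1.bin into tsjs_out/,
# the directory the frontend below fetches model.json from.
tfjs.converters.save_keras_model(model, 'tsjs_out')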
import os
import re
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

print(tf.__version__)
# Directories with training and testing data:
# one subdirectory per class (neg/pos), one file per message
dataset_dir = os.path.expanduser('~/train')  # Python does not expand '~' in path strings
test_dir = os.path.expanduser('~/test')
batch_size = 32
seed = 42
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_digits = tf.strings.regex_replace(lowercase, r'\d', '')  # drop digits
    return tf.strings.regex_replace(stripped_digits,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')
max_features = 10000
sequence_length = 250
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
embedding_dim = 16
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)])  # no activation: outputs logits
model.summary()
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)
# Export a model that maps raw strings straight to probabilities:
# vectorization + trained model + sigmoid over the logits.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)
examples = [
    "new blood test kit",
    "can I have a new kit",
    "new blood please",
    "nothing to do with test kits",
]
export_model.predict(examples)
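The bare predict call above returns an array of sigmoid scores. A small follow-on sketch, using the objects defined above, that reads each score as a probability and thresholds at 0.5 to recover a class name:

import numpy as np

probs = export_model.predict(examples)              # sigmoid outputs in [0, 1]
for text, p in zip(examples, np.ravel(probs)):
    label = raw_train_ds.class_names[int(p > 0.5)]  # index 0/1 maps to neg/pos
    print(f"{p:.3f}  {label}  {text!r}")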
import * as use from '@tensorflow-models/universal-sentence-encoder'
import * as tf from '@tensorflow/tfjs'
import { useEffect, useState } from 'react'

const loadModel = async () => {
  const model = await tf.loadLayersModel('http://localhost:3000/tsjs_out/model.json')
  const encoder = await use.load()
  // NOTE: the converted model was trained on TextVectorization token ids;
  // universal-sentence-encoder embeddings are a different input representation.
  const embedded = await encoder.embed(['new test kit'])
  const p = await model.predict(embedded).data()
  console.log('prediction', p[0])
  return model
}

const Index = () => {
  const [model, setModel] = useState()
  useEffect(() => {
    // loadModel is async; resolve the promise before storing the model in state
    loadModel().then(setModel)
  }, [])
  return (
    <>
      <a href='/'>Reload to see values in console</a>
    </>
  )
}

export default Index