Last active
October 29, 2020 10:10
-
-
Save dougdroper/ed62d94cafab0f0b504456991af13b70 to your computer and use it in GitHub Desktop.
TensorFlow model trained to classify SMS requests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'fileutils'
# Labels the messages of a CSV export as "asks for a new test kit" (:true)
# or "anything else" (:false) using keyword patterns, and writes one text
# file per message under <directory>/train/pos and <directory>/train/neg.
class Classifier
  # A message matching ANY of these patterns is labelled :true.
  # Groups are non-capturing; alternation uses (?:a|b) -- a bracket
  # expression such as [kit|test] would match a single character only.
  REQUEST_PATTERNS = [
    /new\s?(?:blood)?\s?(?:kit|test)/i,
    # The space belongs to the optional group so that both
    # "send a new test kit" and "send a new HIV test kit" match.
    /send a new (?:HIV )?test kit/i,
    /send a new one/i,
    /request a new one/i
  ].freeze

  attr_reader :csv
  attr_accessor :results

  # csv - path of the CSV file to read; the message text is expected in the
  #       second column of every row.
  def initialize(csv)
    @csv = csv
    @results = { true: [], false: [] }
  end

  # Reads the CSV file and appends every message to results[:true] or
  # results[:false]. Rows without a second column are skipped.
  #
  # Returns nil.
  def parse
    CSV.foreach(csv) do |row|
      message = row[1]
      next if message.nil?

      matched = REQUEST_PATTERNS.any? { |pattern| pattern.match?(message) }
      results[matched ? :true : :false] << message
    end
    nil
  end

  # Base directory that receives the train/pos and train/neg folders.
  # Dir.home is used because File APIs do not expand a literal "~".
  def directory
    Dir.home
  end

  # Writes every classified message to its own file ("<index>_1.txt" for
  # positives, "<index>_2.txt" for negatives) and prints both counts.
  def print
    write_examples(results[:true], 'pos', 1)
    write_examples(results[:false], 'neg', 2)
    puts results[:true].count
    puts results[:false].count
  end

  private

  # Writes each message to <directory>/train/<label_dir>/<index>_<suffix>.txt,
  # creating the target directory first when it does not exist.
  def write_examples(messages, label_dir, suffix)
    target = File.join(directory, 'train', label_dir)
    FileUtils.mkdir_p(target)
    messages.each_with_index do |message, index|
      File.write(File.join(target, "#{index}_#{suffix}.txt"), message)
    end
  end
end
# Classify every message in messages.csv, then write the training files
# and print the positive/negative counts.
Classifier.new('messages.csv').tap(&:parse).print
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"format": "layers-model", | |
"generatedBy": "keras v2.4.0", | |
"convertedBy": "TensorFlow.js Converter v2.7.0", | |
"modelTopology": { | |
"keras_version": "2.4.0", | |
"backend": "tensorflow", | |
"model_config": { | |
"class_name": "Sequential", | |
"config": { | |
"name": "sequential", | |
"layers": [ | |
{ | |
"class_name": "InputLayer", | |
"config": { | |
"batch_input_shape": [ | |
null, | |
null | |
], | |
"dtype": "float32", | |
"sparse": false, | |
"ragged": false, | |
"name": "embedding_input" | |
} | |
}, | |
{ | |
"class_name": "Embedding", | |
"config": { | |
"name": "embedding", | |
"trainable": true, | |
"batch_input_shape": [ | |
null, | |
null | |
], | |
"dtype": "float32", | |
"input_dim": 10001, | |
"output_dim": 16, | |
"embeddings_initializer": { | |
"class_name": "RandomUniform", | |
"config": { | |
"minval": -0.05, | |
"maxval": 0.05, | |
"seed": null | |
} | |
}, | |
"embeddings_regularizer": null, | |
"activity_regularizer": null, | |
"embeddings_constraint": null, | |
"mask_zero": false, | |
"input_length": null | |
} | |
}, | |
{ | |
"class_name": "Dropout", | |
"config": { | |
"name": "dropout", | |
"trainable": true, | |
"dtype": "float32", | |
"rate": 0.2, | |
"noise_shape": null, | |
"seed": null | |
} | |
}, | |
{ | |
"class_name": "GlobalAveragePooling1D", | |
"config": { | |
"name": "global_average_pooling1d", | |
"trainable": true, | |
"dtype": "float32", | |
"data_format": "channels_last" | |
} | |
}, | |
{ | |
"class_name": "Dropout", | |
"config": { | |
"name": "dropout_1", | |
"trainable": true, | |
"dtype": "float32", | |
"rate": 0.2, | |
"noise_shape": null, | |
"seed": null | |
} | |
}, | |
{ | |
"class_name": "Dense", | |
"config": { | |
"name": "dense", | |
"trainable": true, | |
"dtype": "float32", | |
"units": 1, | |
"activation": "linear", | |
"use_bias": true, | |
"kernel_initializer": { | |
"class_name": "GlorotUniform", | |
"config": { | |
"seed": null | |
} | |
}, | |
"bias_initializer": { | |
"class_name": "Zeros", | |
"config": null | |
}, | |
"kernel_regularizer": null, | |
"bias_regularizer": null, | |
"activity_regularizer": null, | |
"kernel_constraint": null, | |
"bias_constraint": null | |
} | |
} | |
] | |
} | |
}, | |
"training_config": { | |
"loss": { | |
"class_name": "BinaryCrossentropy", | |
"config": { | |
"reduction": "auto", | |
"name": "binary_crossentropy", | |
"from_logits": true, | |
"label_smoothing": 0 | |
} | |
}, | |
"metrics": { | |
"class_name": "BinaryAccuracy", | |
"config": { | |
"name": "binary_accuracy", | |
"dtype": "float32", | |
"threshold": 0.0 | |
} | |
}, | |
"weighted_metrics": null, | |
"loss_weights": null, | |
"optimizer_config": { | |
"class_name": "Adam", | |
"config": { | |
"name": "Adam", | |
"learning_rate": 0.0010000000474974513, | |
"decay": 0.0, | |
"beta_1": 0.8999999761581421, | |
"beta_2": 0.9990000128746033, | |
"epsilon": 1e-07, | |
"amsgrad": false | |
} | |
} | |
} | |
}, | |
"weightsManifest": [ | |
{ | |
"paths": [ | |
"group1-shard1of1.bin" | |
], | |
"weights": [ | |
{ | |
"name": "dense/kernel", | |
"shape": [ | |
16, | |
1 | |
], | |
"dtype": "float32" | |
}, | |
{ | |
"name": "dense/bias", | |
"shape": [ | |
1 | |
], | |
"dtype": "float32" | |
}, | |
{ | |
"name": "embedding/embeddings", | |
"shape": [ | |
10001, | |
16 | |
], | |
"dtype": "float32" | |
} | |
] | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import os | |
import re | |
import shutil | |
import string | |
import tensorflow as tf | |
from tensorflow.keras import layers | |
from tensorflow.keras import losses | |
from tensorflow.keras import preprocessing | |
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization | |
print(tf.__version__)

# Directories with the training and testing data: one sub-directory per
# class (e.g. neg/ and pos/) holding one text file per example.
# "~" is a shell convention that file APIs treat as a literal directory
# name, so it has to be expanded explicitly.
dataset_dir = os.path.expanduser('~/train')
test_dir = os.path.expanduser('~/test')

batch_size = 32
seed = 42  # shared by both splits so training and validation do not overlap

# 80% of the files under dataset_dir are used for training ...
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

# ... and the remaining 20% for validation.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

# Held-out examples used only for the final evaluation.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)
def custom_standardization(input_data):
    """Normalise raw text before tokenisation.

    Lowercases ``input_data``, removes every digit, then removes every
    character listed in ``string.punctuation``.

    Args:
        input_data: string tensor to normalise.

    Returns:
        The normalised string tensor.
    """
    lowercase = tf.strings.lower(input_data)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    without_digits = tf.strings.regex_replace(lowercase, r'\d', '')
    return tf.strings.regex_replace(without_digits,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')
# Vocabulary cap and fixed length (in tokens) of every vectorized message.
max_features = 10000
sequence_length = 250

# Maps raw strings to integer token ids using custom_standardization.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    standardize=custom_standardization)

# Build the vocabulary from the training texts only (labels are dropped).
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)
def vectorize_text(text, label):
    """Turn ``text`` into token ids with ``vectorize_layer``; pass ``label`` through.

    A trailing axis is appended to ``text`` before it is handed to the layer.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    expanded = tf.expand_dims(text, -1)
    vectorized = vectorize_layer(expanded)
    return vectorized, label
# Sanity check: show the first training example before and after vectorization.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

# Look up a couple of token ids. The vocabulary is fetched once, and the
# lookups are guarded because a small corpus may have fewer entries than
# the sampled indices.
vocabulary = vectorize_layer.get_vocabulary()
for token_index in (1287, 313):
    if token_index < len(vocabulary):
        print("{:>4} ---> ".format(token_index), vocabulary[token_index])
    else:
        print("{:>4} ---> ".format(token_index), "(not in vocabulary)")
print('Vocabulary size: {}'.format(len(vocabulary)))

# Vectorize every split.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Cache the vectorized data and prefetch batches while the model trains.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
# Size of the learned vector for every token.
embedding_dim = 16

# Token embedding -> average over the sequence -> single output unit.
# The output unit has no activation, so the model emits a logit.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim),
    layers.Dropout(rate=0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(rate=0.2),
    layers.Dense(units=1),
])
model.summary()

# from_logits=True and threshold=0.0 both match the logit output above.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

epochs = 20
history = model.fit(
    x=train_ds,
    epochs=epochs,
    validation_data=val_ds)
# End-to-end model: raw strings in, probability out. It chains the text
# vectorizer, the trained model and a sigmoid that turns the model's logit
# into a value between 0 and 1.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])
# from_logits=False because the sigmoid is now part of the model.
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Evaluate on the raw (unvectorized) test set.
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

examples = [
    "new blood test kit",
    "can I have a new kit",
    "new blood please",
    "nothing to do with test kits",
]
# Print the scores: outside a notebook the return value of predict() would
# otherwise be silently discarded.
predictions = export_model.predict(tf.constant(examples))
for example, score in zip(examples, predictions):
    print('{:.3f}  {}'.format(float(score[0]), example))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as use from '@tensorflow-models/universal-sentence-encoder' | |
import * as tf from '@tensorflow/tfjs' | |
import * as tfviz from '@tensorflow/tfjs-vis' | |
import {loadGraphModel} from '@tensorflow/tfjs-converter'; | |
import { useEffect, useState} from 'react' | |
/**
 * Loads the converted Keras layers model and the Universal Sentence Encoder,
 * runs one sample prediction and logs its first value to the console.
 *
 * NOTE(review): the model topology published with this code starts with an
 * Embedding layer, which looks like it expects integer token ids rather than
 * sentence-encoder embeddings -- confirm that feeding `encoder.embed` output
 * into `model.predict` is intended.
 *
 * @param {string} [modelUrl] - URL of the model.json to load.
 * @param {string} [sampleText] - Text used for the sample prediction.
 * @returns {Promise<object>} The loaded layers model.
 */
const loadModel = async (
  modelUrl = 'http://localhost:3000/tsjs_out/model.json',
  sampleText = 'new test kit',
) => {
  // The two downloads are independent, so fetch them in parallel.
  const [model, encoder] = await Promise.all([
    tf.loadLayersModel(modelUrl),
    use.load(),
  ])

  const embedded = await encoder.embed([sampleText])
  try {
    const output = model.predict(embedded)
    try {
      const p = await output.data()
      console.log('prediction', p[0])
    } finally {
      // Tensors are not garbage collected; release them explicitly.
      output.dispose()
    }
  } finally {
    embedded.dispose()
  }
  return model
}
const Index = () => { | |
const [model, setModel] = useState() | |
useEffect(() => { | |
const f = async () => await loadModel() | |
const mo = f() | |
setModel(mo) | |
}, []) | |
return ( | |
<> | |
<a href='/'>Reload to see values in console</a> | |
</> | |
) | |
} | |
export default Index |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment