dougdroper/classifier.rb

## classifier.rb
require 'csv'
require 'fileutils'

# Splits the messages of a CSV file into "new test kit" requests and
# everything else, then writes every message to its own training file.
class Classifier
  # A message is positive when any of these patterns matches it.
  # Groups (not character classes) are used so that only the words
  # "kit"/"test" match, and "HIV " is optional together with its space.
  POSITIVE_PATTERNS = [
    /new\s?(?:blood)?\s?(?:kit|test)/i,
    /send a new (?:HIV )?test kit/i,
    /send a new one/i,
    /request a new one/i
  ].freeze

  attr_reader :csv
  attr_accessor :results

  # @param csv [String] path of the CSV file to classify
  def initialize(csv)
    @csv = csv
    @results = { true: [], false: [] }
  end

  # Reads the CSV and appends the second column of every row to
  # results[:true] (positive) or results[:false] (everything else,
  # including rows without a second column).
  #
  # @return [nil]
  def parse
    CSV.foreach(csv) do |row|
      text = row[1]
      bucket = positive?(text) ? :true : :false
      results[bucket] << text
    end
    nil
  end

  # Base directory under which train/pos and train/neg are written.
  #
  # @return [String]
  def directory
    '~'
  end

  # Writes one file per message (pos/<i>_1.txt, neg/<i>_2.txt) and prints
  # the number of positive and negative messages.
  def print
    write_samples(results[:true], 'pos', 1)
    write_samples(results[:false], 'neg', 2)
    puts results[:true].count
    puts results[:false].count
  end

  private

  # @param text [String, nil] message text
  # @return [Boolean] whether the text matches any positive pattern
  def positive?(text)
    return false if text.nil?

    POSITIVE_PATTERNS.any? { |pattern| pattern.match?(text) }
  end

  # Writes every sample to <directory>/train/<label_dir>/<index>_<suffix>.txt.
  def write_samples(samples, label_dir, suffix)
    # File.open does not expand "~", so resolve it explicitly, and make
    # sure the target directory exists before writing into it.
    target = File.join(File.expand_path(directory), 'train', label_dir)
    FileUtils.mkdir_p(target)
    samples.each_with_index do |text, index|
      File.write(File.join(target, "#{index}_#{suffix}.txt"), text.to_s)
    end
  end
end

# Classify messages.csv and write the resulting training files.
classifier = Classifier.new('messages.csv')
classifier.parse
classifier.print

## model.json
{
  "format": "layers-model",
  "generatedBy": "keras v2.4.0",
  "convertedBy": "TensorFlow.js Converter v2.7.0",
  "modelTopology": {
    "keras_version": "2.4.0",
    "backend": "tensorflow",
    "model_config": {
      "class_name": "Sequential",
      "config": {
        "name": "sequential",
        "layers": [
          {
            "class_name": "InputLayer",
            "config": {
              "batch_input_shape": [
                null,
                null
              ],
              "dtype": "float32",
              "sparse": false,
              "ragged": false,
              "name": "embedding_input"
            }
          },
          {
            "class_name": "Embedding",
            "config": {
              "name": "embedding",
              "trainable": true,
              "batch_input_shape": [
                null,
                null
              ],
              "dtype": "float32",
              "input_dim": 10001,
              "output_dim": 16,
              "embeddings_initializer": {
                "class_name": "RandomUniform",
                "config": {
                  "minval": -0.05,
                  "maxval": 0.05,
                  "seed": null
                }
              },
              "embeddings_regularizer": null,
              "activity_regularizer": null,
              "embeddings_constraint": null,
              "mask_zero": false,
              "input_length": null
            }
          },
          {
            "class_name": "Dropout",
            "config": {
              "name": "dropout",
              "trainable": true,
              "dtype": "float32",
              "rate": 0.2,
              "noise_shape": null,
              "seed": null
            }
          },
          {
            "class_name": "GlobalAveragePooling1D",
            "config": {
              "name": "global_average_pooling1d",
              "trainable": true,
              "dtype": "float32",
              "data_format": "channels_last"
            }
          },
          {
            "class_name": "Dropout",
            "config": {
              "name": "dropout_1",
              "trainable": true,
              "dtype": "float32",
              "rate": 0.2,
              "noise_shape": null,
              "seed": null
            }
          },
          {
            "class_name": "Dense",
            "config": {
              "name": "dense",
              "trainable": true,
              "dtype": "float32",
              "units": 1,
              "activation": "linear",
              "use_bias": true,
              "kernel_initializer": {
                "class_name": "GlorotUniform",
                "config": {
                  "seed": null
                }
              },
              "bias_initializer": {
                "class_name": "Zeros",
                "config": null
              },
              "kernel_regularizer": null,
              "bias_regularizer": null,
              "activity_regularizer": null,
              "kernel_constraint": null,
              "bias_constraint": null
            }
          }
        ]
      }
    },
    "training_config": {
      "loss": {
        "class_name": "BinaryCrossentropy",
        "config": {
          "reduction": "auto",
          "name": "binary_crossentropy",
          "from_logits": true,
          "label_smoothing": 0
        }
      },
      "metrics": {
        "class_name": "BinaryAccuracy",
        "config": {
          "name": "binary_accuracy",
          "dtype": "float32",
          "threshold": 0.0
        }
      },
      "weighted_metrics": null,
      "loss_weights": null,
      "optimizer_config": {
        "class_name": "Adam",
        "config": {
          "name": "Adam",
          "learning_rate": 0.0010000000474974513,
          "decay": 0.0,
          "beta_1": 0.8999999761581421,
          "beta_2": 0.9990000128746033,
          "epsilon": 1e-07,
          "amsgrad": false
        }
      }
    }
  },
  "weightsManifest": [
    {
      "paths": [
        "group1-shard1of1.bin"
      ],
      "weights": [
        {
          "name": "dense/kernel",
          "shape": [
            16,
            1
          ],
          "dtype": "float32"
        },
        {
          "name": "dense/bias",
          "shape": [
            1
          ],
          "dtype": "float32"
        },
        {
          "name": "embedding/embeddings",
          "shape": [
            10001,
            16
          ],
          "dtype": "float32"
        }
      ]
    }
  ]
}

## training.py
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

print(tf.__version__)

# Directories with the training and testing data; each one holds one
# sub-directory per category (e.g. neg/ and pos/).
# '~' is a shell convention, so expand it here instead of passing a literal
# '~' directory name on to the dataset loader.
dataset_dir = os.path.expanduser('~/train')
test_dir = os.path.expanduser('~/test')

batch_size = 32
seed = 42

# 80% of dataset_dir is used for training ...
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

# ... and the remaining 20% (same seed, same split) for validation.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

def custom_standardization(input_data):
  """Lower-cases the text and removes digits and ASCII punctuation.

  Args:
    input_data: string tensor (or a value TensorFlow can convert to one).

  Returns:
    The lower-cased text with every digit and every character of
    `string.punctuation` removed.
  """
  lowercase = tf.strings.lower(input_data)
  # Raw string: '\d' in a plain literal is an invalid escape sequence.
  without_digits = tf.strings.regex_replace(lowercase, r'\d', '')
  return tf.strings.regex_replace(without_digits,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

# Vocabulary cap and fixed token length of every vectorized sample.
max_features = 10000
sequence_length = 250

# Turns raw strings into integer token ids, normalising the text with
# custom_standardization first.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
  """Runs `text` through `vectorize_layer` and passes `label` through."""
  # Add a trailing axis before handing the text to the layer.
  expanded = tf.expand_dims(text, axis=-1)
  tokens = vectorize_layer(expanded)
  return tokens, label

# Inspect the first sample of the first training batch.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]

print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

# Fetch the vocabulary once. The sample indices only exist when the
# vocabulary is large enough, so skip them rather than raise IndexError.
vocabulary = vectorize_layer.get_vocabulary()
for index in (1287, 313):
  if index < len(vocabulary):
    print("{:4d} ---> ".format(index), vocabulary[index])
print('Vocabulary size: {}'.format(len(vocabulary)))

# Vectorize every split with the adapted layer.
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds))

AUTOTUNE = tf.data.experimental.AUTOTUNE

# Cache each split and prefetch batches.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> average over the sequence -> single logit, with dropout
# around the pooling step.
embedding_dim = 16
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))
model.summary()

# The Dense layer has no activation, so the loss works on logits and the
# accuracy threshold is 0.0.
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

epochs = 20
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

# Wrap the trained model so that it accepts raw strings and applies a
# sigmoid to the logit it produces.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

examples = [
  "new blood test kit",
  "can I have a new kit",
  "new blood please",
  "nothing to do with test kits",
]
# Print the predictions: as a bare expression the result is silently
# discarded when this file runs as a script.
print(export_model.predict(examples))

## useTrainedModel.js
import * as use from '@tensorflow-models/universal-sentence-encoder'
import * as tf from '@tensorflow/tfjs'
import * as tfviz from '@tensorflow/tfjs-vis'
import {loadGraphModel} from '@tensorflow/tfjs-converter';

import { useEffect, useState} from 'react'

// Loads the converted layers model, runs one sample sentence through it,
// logs the first output value and resolves with the loaded model.
// NOTE(review): the input here is a universal-sentence-encoder embedding;
// confirm that the model served at this URL expects that kind of input.
const loadModel = async () => {
  const model = await tf.loadLayersModel('http://localhost:3000/tsjs_out/model.json')
  const encoder = await use.load()
  const embedded = await encoder.embed(['new test kit'])
  let prediction
  try {
    prediction = model.predict(embedded)
    const p = await prediction.data()
    console.log('prediction', p[0])
  } finally {
    // Tensors are not garbage collected; release them explicitly.
    embedded.dispose()
    if (prediction) {
      prediction.dispose()
    }
  }
  return model
}

const Index = () => {
  const [model, setModel] = useState()

  useEffect(() => {
    const f = async () => await loadModel()
    const mo = f()
    setModel(mo)
  }, [])

  return (
    <>
      <a href='/'>Reload to see values in console</a>
    </>
  )
}

export default Index
	require 'csv'

	# Splits the messages of a CSV file into "new test kit" requests and
	# everything else, then writes every message to its own training file.
	class Classifier
	  # A message is positive when any of these patterns matches it.
	  # Groups (not character classes) are used so that only the words
	  # "kit"/"test" match, and "HIV " is optional together with its space.
	  POSITIVE_PATTERNS = [
	    /new\s?(?:blood)?\s?(?:kit|test)/i,
	    /send a new (?:HIV )?test kit/i,
	    /send a new one/i,
	    /request a new one/i
	  ].freeze

	  attr_reader :csv
	  attr_accessor :results

	  # @param csv [String] path of the CSV file to classify
	  def initialize(csv)
	    @csv = csv
	    @results = { true: [], false: [] }
	  end

	  # Reads the CSV and appends the second column of every row to
	  # results[:true] (positive) or results[:false] (everything else,
	  # including rows without a second column).
	  #
	  # @return [nil]
	  def parse
	    CSV.foreach(csv) do |row|
	      text = row[1]
	      bucket = positive?(text) ? :true : :false
	      results[bucket] << text
	    end
	    nil
	  end

	  # Base directory under which train/pos and train/neg are written.
	  #
	  # @return [String]
	  def directory
	    '~'
	  end

	  # Writes one file per message (pos/<i>_1.txt, neg/<i>_2.txt) and prints
	  # the number of positive and negative messages.
	  def print
	    write_samples(results[:true], 'pos', 1)
	    write_samples(results[:false], 'neg', 2)
	    puts results[:true].count
	    puts results[:false].count
	  end

	  private

	  # @param text [String, nil] message text
	  # @return [Boolean] whether the text matches any positive pattern
	  def positive?(text)
	    return false if text.nil?

	    POSITIVE_PATTERNS.any? { |pattern| pattern.match?(text) }
	  end

	  # Writes every sample to <directory>/train/<label_dir>/<index>_<suffix>.txt.
	  def write_samples(samples, label_dir, suffix)
	    # File.open does not expand "~", so resolve it explicitly, and make
	    # sure the target directory exists before writing into it.
	    target = File.join(File.expand_path(directory), 'train', label_dir)
	    FileUtils.mkdir_p(target)
	    samples.each_with_index do |text, index|
	      File.write(File.join(target, "#{index}_#{suffix}.txt"), text.to_s)
	    end
	  end
	end

	# Classify messages.csv and write the resulting training files.
	classifier = Classifier.new('messages.csv')
	classifier.parse
	classifier.print
	{
	"format": "layers-model",
	"generatedBy": "keras v2.4.0",
	"convertedBy": "TensorFlow.js Converter v2.7.0",
	"modelTopology": {
	"keras_version": "2.4.0",
	"backend": "tensorflow",
	"model_config": {
	"class_name": "Sequential",
	"config": {
	"name": "sequential",
	"layers": [
	{
	"class_name": "InputLayer",
	"config": {
	"batch_input_shape": [
	null,
	null
	],
	"dtype": "float32",
	"sparse": false,
	"ragged": false,
	"name": "embedding_input"
	}
	},
	{
	"class_name": "Embedding",
	"config": {
	"name": "embedding",
	"trainable": true,
	"batch_input_shape": [
	null,
	null
	],
	"dtype": "float32",
	"input_dim": 10001,
	"output_dim": 16,
	"embeddings_initializer": {
	"class_name": "RandomUniform",
	"config": {
	"minval": -0.05,
	"maxval": 0.05,
	"seed": null
	}
	},
	"embeddings_regularizer": null,
	"activity_regularizer": null,
	"embeddings_constraint": null,
	"mask_zero": false,
	"input_length": null
	}
	},
	{
	"class_name": "Dropout",
	"config": {
	"name": "dropout",
	"trainable": true,
	"dtype": "float32",
	"rate": 0.2,
	"noise_shape": null,
	"seed": null
	}
	},
	{
	"class_name": "GlobalAveragePooling1D",
	"config": {
	"name": "global_average_pooling1d",
	"trainable": true,
	"dtype": "float32",
	"data_format": "channels_last"
	}
	},
	{
	"class_name": "Dropout",
	"config": {
	"name": "dropout_1",
	"trainable": true,
	"dtype": "float32",
	"rate": 0.2,
	"noise_shape": null,
	"seed": null
	}
	},
	{
	"class_name": "Dense",
	"config": {
	"name": "dense",
	"trainable": true,
	"dtype": "float32",
	"units": 1,
	"activation": "linear",
	"use_bias": true,
	"kernel_initializer": {
	"class_name": "GlorotUniform",
	"config": {
	"seed": null
	}
	},
	"bias_initializer": {
	"class_name": "Zeros",
	"config": null
	},
	"kernel_regularizer": null,
	"bias_regularizer": null,
	"activity_regularizer": null,
	"kernel_constraint": null,
	"bias_constraint": null
	}
	}
	]
	}
	},
	"training_config": {
	"loss": {
	"class_name": "BinaryCrossentropy",
	"config": {
	"reduction": "auto",
	"name": "binary_crossentropy",
	"from_logits": true,
	"label_smoothing": 0
	}
	},
	"metrics": {
	"class_name": "BinaryAccuracy",
	"config": {
	"name": "binary_accuracy",
	"dtype": "float32",
	"threshold": 0.0
	}
	},
	"weighted_metrics": null,
	"loss_weights": null,
	"optimizer_config": {
	"class_name": "Adam",
	"config": {
	"name": "Adam",
	"learning_rate": 0.0010000000474974513,
	"decay": 0.0,
	"beta_1": 0.8999999761581421,
	"beta_2": 0.9990000128746033,
	"epsilon": 1e-07,
	"amsgrad": false
	}
	}
	}
	},
	"weightsManifest": [
	{
	"paths": [
	"group1-shard1of1.bin"
	],
	"weights": [
	{
	"name": "dense/kernel",
	"shape": [
	16,
	1
	],
	"dtype": "float32"
	},
	{
	"name": "dense/bias",
	"shape": [
	1
	],
	"dtype": "float32"
	},
	{
	"name": "embedding/embeddings",
	"shape": [
	10001,
	16
	],
	"dtype": "float32"
	}
	]
	}
	]
	}
	import matplotlib.pyplot as plt
	import os
	import re
	import shutil
	import string
	import tensorflow as tf

	from tensorflow.keras import layers
	from tensorflow.keras import losses
	from tensorflow.keras import preprocessing
	from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

	print(tf.__version__)

	# Directories with the training and testing data; each one holds one
	# sub-directory per category (e.g. neg/ and pos/).
	# '~' is a shell convention, so expand it here instead of passing a literal
	# '~' directory name on to the dataset loader.
	dataset_dir = os.path.expanduser('~/train')
	test_dir = os.path.expanduser('~/test')

	batch_size = 32
	seed = 42

	# 80% of dataset_dir is used for training ...
	raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
	    dataset_dir,
	    batch_size=batch_size,
	    validation_split=0.2,
	    subset='training',
	    seed=seed)

	print("Label 0 corresponds to", raw_train_ds.class_names[0])
	print("Label 1 corresponds to", raw_train_ds.class_names[1])

	# ... and the remaining 20% (same seed, same split) for validation.
	raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
	    dataset_dir,
	    batch_size=batch_size,
	    validation_split=0.2,
	    subset='validation',
	    seed=seed)

	raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
	    test_dir,
	    batch_size=batch_size)

	def custom_standardization(input_data):
	  """Lower-cases the text and removes digits and ASCII punctuation.

	  Args:
	    input_data: string tensor (or a value TensorFlow can convert to one).

	  Returns:
	    The lower-cased text with every digit and every character of
	    `string.punctuation` removed.
	  """
	  lowercase = tf.strings.lower(input_data)
	  # Raw string: '\d' in a plain literal is an invalid escape sequence.
	  without_digits = tf.strings.regex_replace(lowercase, r'\d', '')
	  return tf.strings.regex_replace(without_digits,
	                                  '[%s]' % re.escape(string.punctuation),
	                                  '')

	# Vocabulary cap and fixed token length of every vectorized sample.
	max_features = 10000
	sequence_length = 250

	# Turns raw strings into integer token ids, normalising the text with
	# custom_standardization first.
	vectorize_layer = TextVectorization(
	    standardize=custom_standardization,
	    max_tokens=max_features,
	    output_mode='int',
	    output_sequence_length=sequence_length,
	)

	# Build the vocabulary from the training text only (labels are dropped).
	train_text = raw_train_ds.map(lambda text, _label: text)
	vectorize_layer.adapt(train_text)

	def vectorize_text(text, label):
	  """Runs `text` through `vectorize_layer` and passes `label` through."""
	  # The body is indented again; without indentation the definition is
	  # not valid Python.
	  text = tf.expand_dims(text, -1)
	  return vectorize_layer(text), label

	# Inspect the first sample of the first training batch.
	text_batch, label_batch = next(iter(raw_train_ds))
	first_review, first_label = text_batch[0], label_batch[0]

	print("Review", first_review)
	print("Label", raw_train_ds.class_names[first_label])
	print("Vectorized review", vectorize_text(first_review, first_label))

	# Fetch the vocabulary once. The sample indices only exist when the
	# vocabulary is large enough, so skip them rather than raise IndexError.
	vocabulary = vectorize_layer.get_vocabulary()
	for index in (1287, 313):
	  if index < len(vocabulary):
	    print("{:4d} ---> ".format(index), vocabulary[index])
	print('Vocabulary size: {}'.format(len(vocabulary)))

	# Vectorize every split with the adapted layer.
	train_ds, val_ds, test_ds = (
	    split.map(vectorize_text)
	    for split in (raw_train_ds, raw_val_ds, raw_test_ds))

	AUTOTUNE = tf.data.experimental.AUTOTUNE

	# Cache each split and prefetch batches.
	train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
	val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
	test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

	# Embedding -> average over the sequence -> single logit, with dropout
	# around the pooling step.
	embedding_dim = 16
	model = tf.keras.Sequential()
	model.add(layers.Embedding(max_features + 1, embedding_dim))
	model.add(layers.Dropout(0.2))
	model.add(layers.GlobalAveragePooling1D())
	model.add(layers.Dropout(0.2))
	model.add(layers.Dense(1))
	model.summary()

	# The Dense layer has no activation, so the loss works on logits and the
	# accuracy threshold is 0.0.
	model.compile(
	    loss=losses.BinaryCrossentropy(from_logits=True),
	    optimizer='adam',
	    metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

	epochs = 20
	history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

	# Wrap the trained model so that it accepts raw strings and applies a
	# sigmoid to the logit it produces.
	export_model = tf.keras.Sequential([
	  vectorize_layer,
	  model,
	  layers.Activation('sigmoid')
	])
	export_model.compile(
	    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
	)
	loss, accuracy = export_model.evaluate(raw_test_ds)
	print(accuracy)

	examples = [
	  "new blood test kit",
	  "can I have a new kit",
	  "new blood please",
	  "nothing to do with test kits",
	]
	# Print the predictions: as a bare expression the result is silently
	# discarded when this file runs as a script.
	print(export_model.predict(examples))
	import * as use from '@tensorflow-models/universal-sentence-encoder'
	import * as tf from '@tensorflow/tfjs'
	import * as tfviz from '@tensorflow/tfjs-vis'
	import {loadGraphModel} from '@tensorflow/tfjs-converter';

	import { useEffect, useState} from 'react'

	// Loads the converted layers model, runs one sample sentence through it,
	// logs the first output value and resolves with the loaded model.
	// NOTE(review): the input here is a universal-sentence-encoder embedding;
	// confirm that the model served at this URL expects that kind of input.
	const loadModel = async () => {
	  const model = await tf.loadLayersModel('http://localhost:3000/tsjs_out/model.json')
	  const encoder = await use.load()
	  const embedded = await encoder.embed(['new test kit'])
	  let prediction
	  try {
	    prediction = model.predict(embedded)
	    const p = await prediction.data()
	    console.log('prediction', p[0])
	  } finally {
	    // Tensors are not garbage collected; release them explicitly.
	    embedded.dispose()
	    if (prediction) {
	      prediction.dispose()
	    }
	  }
	  return model
	}

	const Index = () => {
	const [model, setModel] = useState()

	useEffect(() => {
	const f = async () => await loadModel()
	const mo = f()
	setModel(mo)
	}, [])

	return (
	<>
	<a href='/'>Reload to see values in console</a>
	</>
	)
	}

	export default Index