Code used to export an ONNX model of the TCN-based audio-visual speech recognition model. This is part of OpenCV GSoC 2022.
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!git clone --recursive https://github.com/mpc001/Lipreading_using_Temporal_Convolutional_Networks.git\n",
"%cd Lipreading_using_Temporal_Convolutional_Networks/\n",
"!git checkout 47872c9a7a357b70a4adc97e51658c1e43fde8d9\n",
"!pip install -r requirements.txt\n",
"!gdown --id 12mHlNQKCE2AXkFHzvRyqSbsmOMEs259i\n",
"!gdown --id 16asCjDdGnnP3AFJZtDlYehHe7qrQ5AXq\n",
"!unzip LRW_landmarks.zip -o ./landmarks\n",
"!cd models && gdown --id 1tYNYOiJhVNQgf8Rt-X64uzso3Py-RSvu\n",
"!cd models && gdown --id 1h6JVCAoLlq-StCkT_a7n_-qmViHK-iUT"
],
"metadata": {
"id": "yPDzOQdV7PQp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cT7SCY4e7ExD"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import json\n",
"from pathlib import Path\n",
"from lipreading.model import *\n",
"import torch.onnx"
]
},
{
"cell_type": "code",
"source": [
"class AVLipreading(nn.Module):\n",
" def __init__( self, comb_wt=0.5, margin=20):\n",
" # comb_wt is the wt given to audio. Should be between 0 and 1\n",
" self.margin=margin\n",
" if(comb_wt<0 or comb_wt>1):\n",
" raise Exception()\n",
" super(AVLipreading, self).__init__()\n",
" with open('configs/lrw_resnet18_mstcn.json') as fp:\n",
" config = json.load(fp)\n",
" tcn_options = {\n",
" 'num_layers': config['tcn_num_layers'],\n",
" 'kernel_size': config['tcn_kernel_size'],\n",
" 'dropout': config['tcn_dropout'],\n",
" 'dwpw': config['tcn_dwpw'],\n",
" 'width_mult': config['tcn_width_mult'],\n",
" }\n",
" self.audio_model = Lipreading(\n",
" num_classes=500,\n",
" tcn_options=tcn_options,\n",
" backbone_type=config['backbone_type'],\n",
" relu_type=config['relu_type'],\n",
" width_mult=config['width_mult'],\n",
" extract_feats=False,\n",
" modality = 'raw_audio'\n",
" )\n",
" self.video_model = Lipreading(\n",
" num_classes=500,\n",
" tcn_options=tcn_options,\n",
" backbone_type=config['backbone_type'],\n",
" relu_type=config['relu_type'],\n",
" width_mult=config['width_mult'],\n",
" extract_feats=False,\n",
" modality = 'video'\n",
" )\n",
" \n",
" self.video_model.load_state_dict(torch.load(Path('models/lrw_resnet18_mstcn_adamw_s3.pth.tar'), map_location='cpu')['model_state_dict'])\n",
" self.audio_model.load_state_dict(torch.load(Path('models/lrw_resnet18_mstcn_audio_adamw.pth.tar'), map_location='cpu')['model_state_dict'])\n",
"\n",
" self.wt = comb_wt\n",
"\n",
" def forward(self, audio_input, video_input):\n",
" audio_sampling_rate = 16000\n",
" video_fps = 30\n",
" A = self.audio_model.forward(audio_input, [self.margin])\n",
" V = self.video_model.forward(video_input, [self.margin*audio_sampling_rate//video_fps])\n",
" # Here's the combining step. I'm currently using weighted average\n",
" return A*self.wt+V*(1-self.wt)"
],
"metadata": {
"id": "UL2_mx9OFDx1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = AVLipreading()\n",
"model = model.eval()\n",
"audio_input = torch.randn(1,1,20*16000//30)\n",
"video_input = torch.randn(1,1,20,96,96)\n",
"final_out = model.forward(audio_input, video_input)\n",
"\n",
"# Export the final model here.\n",
"torch.onnx.export(model.eval(),(audio_input, video_input), 'AVSpeechRecog.onnx', opset_version=11, input_names=[\"audio_input\", \"video_input\"])"
],
"metadata": {
"id": "xszGSC3tG83r"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "zNL-H3FVU5oE"
},
"execution_count": null,
"outputs": []
}
]
}
# -*- coding: utf-8 -*-
"""TCN_AVSpeech_model_comp.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1awBCZ5O6uAT32cHvufNWad5m6q26TuqQ
"""
# Commented out IPython magic to ensure Python compatibility.
# Clone the original repo, install requirements, and download models & files from Google Drive
# !git clone --recursive https://github.com/mpc001/Lipreading_using_Temporal_Convolutional_Networks.git
# %cd Lipreading_using_Temporal_Convolutional_Networks/
# !git checkout 47872c9a7a357b70a4adc97e51658c1e43fde8d9
# !pip install -r requirements.txt
# !gdown --id 12mHlNQKCE2AXkFHzvRyqSbsmOMEs259i
# !gdown --id 16asCjDdGnnP3AFJZtDlYehHe7qrQ5AXq
# !unzip LRW_landmarks.zip -d ./landmarks
# !cd models && gdown --id 1tYNYOiJhVNQgf8Rt-X64uzso3Py-RSvu
# !cd models && gdown --id 1h6JVCAoLlq-StCkT_a7n_-qmViHK-iUT
import numpy as np
import torch
import torch.nn as nn
import json
from pathlib import Path
from lipreading.model import *
import torch.onnx
class AVLipreading(nn.Module):
    def __init__(self, comb_wt=0.5, margin=20):
        # comb_wt is the weight given to the audio stream; it must be between 0 and 1
        super(AVLipreading, self).__init__()
        if comb_wt < 0 or comb_wt > 1:
            raise ValueError('comb_wt must be between 0 and 1')
        self.margin = margin
        with open('configs/lrw_resnet18_mstcn.json') as fp:
            config = json.load(fp)
        tcn_options = {
            'num_layers': config['tcn_num_layers'],
            'kernel_size': config['tcn_kernel_size'],
            'dropout': config['tcn_dropout'],
            'dwpw': config['tcn_dwpw'],
            'width_mult': config['tcn_width_mult'],
        }
        self.audio_model = Lipreading(
            num_classes=500,
            tcn_options=tcn_options,
            backbone_type=config['backbone_type'],
            relu_type=config['relu_type'],
            width_mult=config['width_mult'],
            extract_feats=False,
            modality='raw_audio'
        )
        self.video_model = Lipreading(
            num_classes=500,
            tcn_options=tcn_options,
            backbone_type=config['backbone_type'],
            relu_type=config['relu_type'],
            width_mult=config['width_mult'],
            extract_feats=False,
            modality='video'
        )
        self.video_model.load_state_dict(torch.load(Path('models/lrw_resnet18_mstcn_adamw_s3.pth.tar'), map_location='cpu')['model_state_dict'])
        self.audio_model.load_state_dict(torch.load(Path('models/lrw_resnet18_mstcn_audio_adamw.pth.tar'), map_location='cpu')['model_state_dict'])
        self.wt = comb_wt

    def forward(self, audio_input, video_input):
        audio_sampling_rate = 16000
        video_fps = 30
        A = self.audio_model.forward(audio_input, [self.margin])
        V = self.video_model.forward(video_input, [self.margin*audio_sampling_rate//video_fps])
        # Combining step: a weighted average of the audio and video logits
        return A*self.wt + V*(1-self.wt)
model = AVLipreading()
model = model.eval()
audio_input = torch.randn(1,1,20*16000//30)
video_input = torch.randn(1,1,20,96,96)
final_out = model.forward(audio_input, video_input)
# Export the final model here.
torch.onnx.export(model.eval(),(audio_input, video_input), 'AVSpeechRecog.onnx', opset_version=11, input_names=["audio_input", "video_input"])
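
# Optional sanity check (not part of the original notebook): run the exported graph
# with onnxruntime and compare it against the PyTorch output computed above. This
# assumes onnxruntime is installed (it is not listed in the repo's requirements.txt).
import onnxruntime as ort

sess = ort.InferenceSession('AVSpeechRecog.onnx')
onnx_out = sess.run(None, {'audio_input': audio_input.numpy(),
                           'video_input': video_input.numpy()})[0]
print('max abs diff vs PyTorch:', np.abs(onnx_out - final_out.detach().numpy()).max())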
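
# Illustration only: how the exported model is meant to be consumed from OpenCV's dnn
# module (the target of this GSoC work). Whether every exported op loads depends on
# the OpenCV version/build, so treat this as a sketch rather than a verified pipeline.
# In real use the inputs would be a 16 kHz raw-audio window and 96x96 grayscale mouth crops.
import cv2

net = cv2.dnn.readNetFromONNX('AVSpeechRecog.onnx')
net.setInput(audio_input.numpy(), 'audio_input')
net.setInput(video_input.numpy(), 'video_input')
scores = net.forward()  # (1, 500) class scores over the LRW word vocabulary
print('predicted LRW class id:', int(scores.argmax()))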