@nukadelic
Last active January 6, 2020 12:07
Modified Barracuda converter (tensorflow-pb to unity-nn) [0.3.2]
from __future__ import print_function
import numpy as np
import struct # convert from Python values and C structs
import tensorflow as tf
import re
import barracuda
from barracuda import Struct
from google.protobuf import descriptor
from google.protobuf.json_format import MessageToJson
if __name__ == '__main__':
# Handle command line arguments
args = barracuda.parse_args(
description = 'Convert Tensorflow model to Barracuda binary',
source_extension = '.pb',
help = 'input Tensorflow serialized .pb file')
# The following code can be used as an example of the API when used from another module
# convert() is the main entry point for converter
import tensorflow_to_barracuda as tf2bc
tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)
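# For example (hypothetical file names, assuming this script is saved as
# tensorflow_to_barracuda.py next to barracuda.py), a typical command-line invocation
# might look like:
#   python tensorflow_to_barracuda.py frozen_graph.pb model.nn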
# TODO: support more than 1 LSTM layer per model - prepend scope to names and inputs
# TODO: support different activation functions in LSTM
# TODO: strip output Identity node, instead patch upstream layer names
# TODO: use ScaleBias and Pow with alpha when input is constant Tensor
# TODO: support all data format types (currently only NHWC)
# TODO: support all data types (currently only FLOAT, INT32, BOOL)
# TODO: implement FusedResizeAndPadConv2D
# Important ProtoBuf definitions:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/node_def.proto
#
# Node descriptions:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/nn_ops.cc
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/math_ops.cc
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/random_ops.cc
#
# Class doc:
# https://www.tensorflow.org/api_docs/cc/
#
known_classes = {
'Dense': Struct(
id = 1,
rank = 2,
out_shapes = lambda shapes: [
[shapes[0][0], 1, 1, shapes[0][1]] if len(shapes[0]) > 1 else [1,1,1,1], # W
[1, 1, 1, shapes[-1][-1]] # B
],
patch_data = lambda data: [
data[0],
data[1]
]),
'MatMul': Struct(
id = 1,
rank = 2,
out_shapes = lambda shapes: [
[shapes[0][0], 1, 1, shapes[0][1]], # W
[1, 1, 1, shapes[0][1]] # B
],
patch_data = lambda data: [
data[0],
np.zeros(np.shape(data[1]))
]),
'BiasAdd': Struct(
id = 51, # implemented as ScaleBias
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # ONE
[1, 1, 1, shapes[0][0]], # B
],
patch_data = lambda data: [
np.ones(np.shape(data[0])),
data[0]
]),
# TODO: NCHW
'Conv2D': Struct(
id = 20,
rank = 4,
out_shapes = lambda shapes: [
shapes[0], # K
[1, 1, 1, shapes[-1][-1]] # B
],
patch_data = lambda data: [
data[0],
data[1]
]),
'DepthwiseConv2dNative': Struct( # DepthwiseConv2D
id = 21,
rank = 4,
out_shapes = lambda s: [
[s[0][0], s[0][1], s[0][3], s[0][2]], # K TF:[H, W, in_channels, channel_multiplier] => [H, W, 1, in_channels]
[1, 1, 1, s[-1][-1]] if len(s) > 1
else [1, 1, 1, s[0][2]] # B
],
patch_data = lambda data: [
np.transpose(data[0], (0,1,3,2)),
data[1]
]),
'Conv2DBackpropInput': Struct( # Conv2DTranspose
id = 22,
rank = 4,
out_shapes = lambda s: [
[s[0][0], s[0][1], s[0][3], s[0][2]], # K TF:[H, W, in_channels, out_channels] => [H, W, out_channels, in_channels]
[1, 1, 1, s[-1][-1]] if len(s) > 1
else [1, 1, 1, s[0][2]] # B
],
patch_data = lambda data: [
np.transpose(data[0], (0,1,3,2)),
data[1]
]),
'Border2D': 29,
'Pad2DReflect': 160,
'Pad2DSymmetric': 161,
# TODO: 3D
'ResizeNearestNeighbor':
23, # implemented as Upsample2D
'ResizeBilinear': 23, # implemented as Upsample2D
'ResizeBicubic': 23, # implemented as Upsample2D
'MaxPool': 25,
'AvgPool': 26,
'GlobalAveragePool':28,
'GlobalAvgPool': 28,
'Activation': 50,
'BatchNormalization': Struct(
id = 51, # after fusion implemented as ScaleBias
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # S
[1, 1, 1, shapes[0][0]], # B
],
patch_data = lambda data:
# fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
# TODO: double-check if epsilon is the last data argument and not the 1st?
barracuda.fuse_batchnorm_weights(data[0], data[1], data[2], data[3], data[4]) if len(data) == 5 else
# fuse [ONE, beta, mean, var, epsilon] => [scale, bias]
# TODO: double-check if epsilon is the last data argument and not the 1st?
barracuda.fuse_batchnorm_weights(np.ones(np.shape(data[0])), data[0], data[1], data[2], data[3])
),
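# For reference, the fusion performed by barracuda.fuse_batchnorm_weights is assumed to
# be the standard one:
#   scale = gamma / sqrt(var + epsilon)
#   bias  = beta - scale * mean
# so that scale * x + bias reproduces gamma * (x - mean) / sqrt(var + epsilon) + beta.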
'FusedBatchNorm': Struct(
id = 51, # after fusion implemented as ScaleBias
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # S
[1, 1, 1, shapes[0][0]], # B
],
patch_data = lambda data, layer:
# fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
barracuda.fuse_batchnorm_weights(data[0], data[1], data[2], data[3], get_epsilon(layer))
),
'BatchNormalizationRuntime': Struct(
id = 52,
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # G
[1, 1, 1, shapes[0][0]], # B
],
patch_data = lambda data:
[data[0], data[1]] if len(data) == 4 else
[np.ones(np.shape(data[0])), data[0]]
),
'InstanceNormalization': Struct( # TODO: epsilon
id = 52,
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # G
[1, 1, 1, shapes[0][0]], # B
],
patch_data = lambda data:
[data[0], data[1]] if len(data) == 2 else
[np.ones(np.shape(data[0])), data[0]]
),
'LRN': 53,
'RandomStandardNormal':
64,
'RandomUniform': 65,
'Multinomial': Struct(
id=66, rank = 2),
'OneHot': Struct(id=67, rank = lambda inputs: inputs[0] + 1),
# Broadcast ops
'Add': Struct(id=100, rank = lambda inputs: np.max(inputs)),
'AddV2': Struct(id=100, rank = lambda inputs: np.max(inputs)),
'Sub': Struct(id=101, rank = lambda inputs: np.max(inputs)),
'Mul': Struct(id=102, rank = lambda inputs: np.max(inputs)),
'RealDiv':Struct(id=103, rank = lambda inputs: np.max(inputs)),
'Pow': Struct(id=104, rank = lambda inputs: np.max(inputs)),
'Minimum':Struct(id=110, rank = lambda inputs: np.max(inputs)),
'Maximum':Struct(id=111, rank = lambda inputs: np.max(inputs)),
# Comparison ops with broadcast
'Greater': Struct(id=140, rank = lambda inputs: np.max(inputs)),
'GreaterEqual': Struct(id=141, rank = lambda inputs: np.max(inputs)),
'Less': Struct(id=142, rank = lambda inputs: np.max(inputs)),
'LessEqual': Struct(id=143, rank = lambda inputs: np.max(inputs)),
'Equal': Struct(id=144, rank = lambda inputs: np.max(inputs)),
# Logical ops with broadcast
'LogicalOr': Struct(id=145, rank = lambda inputs: np.max(inputs)),
'LogicalAnd': Struct(id=146, rank = lambda inputs: np.max(inputs)),
'LogicalNot': Struct(id=147, rank = lambda inputs: np.max(inputs)),
'LogicalXor': Struct(id=148, rank = lambda inputs: np.max(inputs)),
# Reduce ops
'Max': Struct(id=124, rank = lambda inputs: inputs[0] - 1),
'Mean': Struct(id=125, rank = lambda inputs: inputs[0] - 1),
'Min': Struct(id=126, rank = lambda inputs: inputs[0] - 1),
'Prod': Struct(id=127, rank = lambda inputs: inputs[0] - 1),
'Sum': Struct(id=128, rank = lambda inputs: inputs[0] - 1),
'Flatten':Struct(id=200, rank = 2),
'Reshape': 201,
'Concat': 210,
'StridedSlice': 211,
'Nop': 0,
}
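# Each entry above is either a bare Barracuda layer id (int) or a Struct carrying the id
# plus optional helpers: out_shapes (maps tensor shapes into Barracuda's 4D layout),
# patch_data (rewrites/reorders the constant tensors) and rank (an int or a lambda over
# the input ranks); process_layer() below wraps bare ints into Struct(id=...) on the fly.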
requires_runtime_flag = {
'Dropout' : 'DropoutRuntime',
'BatchNormalization' : 'BatchNormalizationRuntime',
}
known_activations = {
'Linear' : 0,
'Relu' : 1,
'Softmax' : 2,
'Tanh' : 3,
'Sigmoid' : 4,
'Elu' : 5,
'Relu6' : 6,
'LeakyRelu' : 7,
'Selu' : 8,
'Swish' : 9,
'LogSoftmax' : 10,
'Softplus' : 11,
'Softsign' : 12,
'Abs' : 100,
'Neg' : 101,
'Ceil' : 102,
'Floor' : 104,
'Sqrt' : 111,
'Exp' : 113,
'Log' : 114,
'Acos' : 200,
'Acosh' : 201,
'Asin' : 202,
'Asinh' : 203,
'Atan' : 204,
'Atanh' : 205,
'Cos' : 206,
'Cosh' : 207,
'Sin' : 208,
'Sinh' : 209,
'Tan' : 210
}
known_paddings = {
'VALID' : [0,0,0,0],
'SAME' : [-1] # SameUpper
}
supported_data_formats = {
'NHWC'
}
known_patterns = {
# TODO: Flatten pattern using namespace regexp
repr(['Shape', 'StridedSlice', 'Pack', 'Reshape']) : "Flatten",
repr(['Shape', 'StridedSlice', 'Prod', 'Pack', 'Reshape']) : "Flatten",
repr(['Shape', 'Slice', 'Slice', 'Prod',
'ExpandDims', 'ConcatV2', 'Reshape']) : "Flatten",
repr(['Add', 'Rsqrt', 'Mul', 'Mul', 'Sub', 'Add']) : 'BatchNormalization',
repr(['Add', 'Rsqrt', 'Mul', 'Mul', 'Mul', 'Sub', 'Add']) : 'BatchNormalization',
repr(['Mean', 'StopGradient', 'SquaredDifference', 'Mean',
'Sub', 'Add', 'Pow', 'RealDiv', 'Mul', 'Add']) : 'InstanceNormalization_ByTensorOrder',
repr(['Mean', 'StopGradient', 'SquaredDifference', 'Mean',
'Squeeze', 'Squeeze',
'Add', 'Rsqrt', 'Mul', 'Mul', 'Mul', 'Sub', 'Add']) : 'InstanceNormalization_ByTensorName',
repr(['MatMul', 'BiasAdd']) : 'Dense',
repr(['Conv2D', 'BiasAdd']) : 'Conv2D',
repr(['DepthwiseConv2dNative', 'BiasAdd']) : 'DepthwiseConv2dNative',
repr(['Conv2DBackpropInput', 'BiasAdd']) : 'Conv2DBackpropInput',
repr(['Conv2DBackpropInput']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'StridedSlice', 'StridedSlice', 'Mul',
'Mul', 'Pack', 'Conv2DBackpropInput', 'BiasAdd']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'StridedSlice', 'StridedSlice', 'Mul',
'Mul', 'Pack', 'Conv2DBackpropInput']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'Mul', 'ResizeNearestNeighbor'])
: 'ResizeNearestNeighbor',
repr(['Pack', 'Reshape']) : 'Flatten$', # for now we assume that this combination is trivial Flatten
# for example it is used in ML-agents LSTM nets with sequence_length==1
repr(['StridedSlice', 'Reshape',
re.compile('^[a-zA-Z/]*lstm/'),
'Reshape', 'ConcatV2', 'Identity']) : 'BasicLSTMReshapeOut',
repr([re.compile('^[a-zA-Z/]*lstm/'),
'Reshape', 'ConcatV2', 'Identity']) : 'BasicLSTMReshapeOut',
repr(['Reshape', re.compile('^[a-zA-Z/]*lstm_[a-z]*/'),'Reshape', 'ConcatV2']) : 'BasicLSTMReshapeOut',
repr(['Reshape', re.compile('^[a-zA-Z/]*lstm_[a-z]*/'),'ConcatV2']) : 'BasicLSTMConcatOut',
repr(['Sigmoid', 'Mul']) : "Swish",
repr(['Mul', 'Abs', 'Mul', 'Add']) : "LeakyRelu",
repr(['Shape', 'Reshape']) : 'ReshapeLikeInput0', # shape comes from the 1st node as input[0]
repr(['Reshape']) : 'Reshape',
repr(['ConcatV2']) : 'ConcatV2',
repr(['Mean']) : 'Mean',
repr(['Pad']) : 'Pad',
repr(['PadV2']) : 'Pad',
repr(['MirrorPad']) : 'Pad',
repr(['Multinomial']) : 'Multinomial',
repr(['OneHot']) : 'OneHot',
repr(['Square']) : 'Square',
repr(['SquaredDifference']) : 'SquaredDifference',
repr(['StridedSlice']) : 'StridedSlice',
repr(['Squeeze']) : 'Squeeze',
repr(['ExpandDims']) : 'ExpandDims',
# TODO: FusedResizeAndPadConv2D
}
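# Each key is the repr() of a sequence of node op names (or compiled regexes matched
# against node names). process_model() scans the topologically sorted node list for
# these sequences, skipping interleaved Const/Identity nodes, and hands every match to
# the corresponding entry in transform_patterns below.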
def by_name(args, name):
for a in args:
if a.name.endswith(name):
return a
def by_op(args, op):
for a in args:
if a.op == op:
return a
def order_by(args, names):
ordered = []
arg_count = len(args)
for name in names:
ordered += [a for a in args if a.endswith(name)]
args = [a for a in args if not a.endswith(name)]
ordered += args # append what is left
assert(len(ordered) == arg_count)
return ordered
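# Illustrative example of the order_by helper above (hypothetical tensor names):
#   order_by(['bn/beta', 'bn/gamma', 'bn/mean'], ['gamma', 'beta', 'mean'])
#   => ['bn/gamma', 'bn/beta', 'bn/mean']  # reordered by suffix, leftovers appended last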
transform_patterns = {
'Flatten' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Flatten',
input = inputs
),
'Flatten$' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Flatten',
input = [inputs[-1]] # take only the last input, assume all other arguments are trivial (like sequence_length==1 always in ML-agents LSTM nets)
),
'Reshape' : lambda nodes, inputs, tensors, context:
Struct(
op = 'Reshape',
rank = len(tensors[0].data) if len(tensors) > 0 # tensor data is treated as reshape coefficient, if not empty
else context.layer_ranks[inputs[1]] if len(inputs) == 2 # otherwise shape of the 2nd input tensor is used
else -1,
input = inputs,
shape = [tensors[0].data[0], tensors[0].data[1], tensors[0].data[2], tensors[0].data[3]] if len(tensors) > 0 and len(tensors[0].data) == 4
else [tensors[0].data[0], 1, tensors[0].data[1], tensors[0].data[2]] if len(tensors) > 0 and len(tensors[0].data) == 3
else [tensors[0].data[0], 1, 1, tensors[0].data[1]] if len(tensors) > 0 and len(tensors[0].data) == 2
else [1, 1, 1, tensors[0].data[0]] if len(tensors) > 0 and len(tensors[0].data) == 1
else []
),
'ReshapeLikeInput0' : lambda nodes, inputs, tensors, context:
Struct(
op = 'Reshape',
rank = context.layer_ranks[inputs[0]] if len(inputs) == 2 # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
else -1,
input = [inputs[1], inputs[0]] if len(inputs) == 2 # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
else inputs,
),
'Pad' : lambda nodes, inputs, tensors, _:
Struct(
op = 'BarracudaUnsupportedPad' if (len(tensors) == 0 or np.shape(tensors[0]) != [4,2]) else
'Pad2DReflect' if (get_attr(nodes[-1], 'mode', default='constant').lower() == 'reflect') else
'Pad2DSymmetric' if (get_attr(nodes[-1], 'mode', default='constant').lower() == 'symmetric') else
'Border2D' if (get_attr(nodes[-1], 'mode', default='constant').lower() == 'constant') else
'BarracudaUnsupportedPad',
input = inputs,
pads = [tensors[0].data[2,0], tensors[0].data[1,0], tensors[0].data[2,1], tensors[0].data[1,1]] if (len(tensors) > 0 and np.shape(tensors[0]) == [4,2])
else [0,0,0,0],
beta = tensors[1].data[0] if len(tensors) > 1 and np.shape(tensors[1]) == (1,) else get_attr(nodes[-1], 'constant_values') or 0,
),
'Squeeze' : lambda nodes, inputs, tensors, context:
Struct(
op = 'Nop', # Squeeze is no-operation in Barracuda
input = inputs,
rank = context.layer_ranks[inputs[0]] - len(get_attr(nodes[-1], 'squeeze_dims')) if len(get_attr(nodes[-1], 'squeeze_dims')) > 0
else -1 # if list of squeeze axis is not specified, it is unknown what would be the rank of result
),
'ExpandDims' : lambda nodes, inputs, tensors, context:
Struct(
op = 'Nop', # ExpandDims is no-operation in Barracuda
input = [inputs[0]],
rank = context.layer_ranks[inputs[0]] + 1
),
'Multinomial' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Multinomial',
input = inputs,
shape = [int(by_name(tensors, '/num_samples').data[0])],
#seed = get_attr(nodes[0], 'seed'),
),
'OneHot' : lambda nodes, inputs, tensors, _:
Struct(
op = 'OneHot',
input = inputs,
shape = [int(by_name(tensors, '/depth').data[0])],
alpha = by_name(tensors, '/on_value').data[0],
beta = by_name(tensors, '/off_value').data[0],
),
'Square' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Mul',
input = [inputs[0], inputs[0]], # input * input
),
'ConcatV2' : lambda nodes, inputs, tensors, context:
Struct(
op = 'Concat',
input = inputs,
axis = axis_to_barracuda(
int(by_name(tensors, '/axis').data[0]),
context.layer_ranks[inputs[0]])
),
'StridedSlice' : lambda nodes, inputs, tensors, context:
strided_slice(nodes[-1].name,
inputs[0], context.layer_ranks[inputs[0]],
begin = tensors[0].data,
end = tensors[1].data,
strides = tensors[2].data,
begin_mask = get_attr(nodes[-1], 'begin_mask'),
end_mask = get_attr(nodes[-1], 'end_mask'),
ellipsis_mask = get_attr(nodes[-1], 'ellipsis_mask'),
new_axis_mask = get_attr(nodes[-1], 'new_axis_mask'),
shrink_axis_mask= get_attr(nodes[-1], 'shrink_axis_mask')
),
'BatchNormalization' : lambda nodes, inputs, tensors, _:
Struct(
op = 'BatchNormalization',
input = [i for i in inputs] +
order_by([t.name for t in tensors], ['gamma', 'beta', 'mean', 'variance']),
),
'InstanceNormalization_ByTensorName' : lambda nodes, inputs, tensors, _:
Struct(
op = 'InstanceNormalization',
input = [i for i in inputs] +
order_by([t.name for t in tensors], ['scale', 'offset']),
),
'InstanceNormalization_ByTensorOrder' : lambda nodes, inputs, tensors, _:
Struct(
op = 'InstanceNormalization',
input = [i for i in inputs] + [t.name for t in tensors][-2:],
),
'Dense' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Dense',
input = [i for i in inputs] + [t.name for t in tensors],
data_frmt = get_attr(by_op(nodes, 'Dense') or by_op(nodes, 'MatMul'), 'data_format'),
),
'Conv2D' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Conv2D',
input = [i for i in inputs] + [t.name for t in tensors],
padding = get_attr(by_op(nodes, 'Conv2D'), 'padding'),
strides = get_attr(by_op(nodes, 'Conv2D'), 'strides'),
dilations = get_attr(by_op(nodes, 'Conv2D'), 'dilations'),
data_frmt = get_attr(by_op(nodes, 'Conv2D'), 'data_format'),
),
'DepthwiseConv2dNative' : lambda nodes, inputs, tensors, _:
Struct(
op = 'DepthwiseConv2dNative',
input = [i for i in inputs] + [t.name for t in tensors],
padding = get_attr(by_op(nodes, 'DepthwiseConv2dNative'), 'padding'),
strides = get_attr(by_op(nodes, 'DepthwiseConv2dNative'), 'strides'),
dilations = get_attr(by_op(nodes, 'DepthwiseConv2dNative'), 'dilations'),
data_frmt = get_attr(by_op(nodes, 'DepthwiseConv2dNative'), 'data_format'),
),
'Conv2DBackpropInput' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Conv2DBackpropInput',
input = [i for i in inputs] + [t.name for t in tensors][1:][-2:], # [1:] - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes' (which differs from other Conv layers)
# [-2:] - take only last 2 tensors, this allows to process large patterns with the same code
padding = get_attr(by_op(nodes, 'Conv2DBackpropInput'), 'padding'),
strides = get_attr(by_op(nodes, 'Conv2DBackpropInput'), 'strides'),
dilations = get_attr(by_op(nodes, 'Conv2DBackpropInput'), 'dilations'),
data_frmt = get_attr(by_op(nodes, 'Conv2DBackpropInput'), 'data_format'),
),
'ResizeNearestNeighbor' : lambda nodes, inputs, tensors, _:
Struct(
op = 'ResizeNearestNeighbor',
input = [i for i in inputs],
ksize = [int(tensors[0].data[0]), int(tensors[0].data[1])] if len(tensors) == 1 and len(tensors[0].data) == 2
else [int(tensors[-1].data[0]), int(tensors[-1].data[1])] if len(tensors) >= 4 and len(tensors[-1].data) == 2
else [1,1]
),
'Mean' : lambda nodes, inputs, tensors, _:
# take only the last input
barracuda.mean(nodes[-1].name, inputs[-1], axis=tensors[0].data),
'SquaredDifference' : lambda nodes, inputs, tensors, _:
sqr_diff(nodes[-1].name, inputs[0], inputs[1]),
'BasicLSTMReshapeOut' : lambda nodes, inputs, tensors, context:
basic_lstm(nodes, inputs, tensors, context, find_type='Reshape'),
'BasicLSTMConcatOut' : lambda nodes, inputs, tensors, context:
basic_lstm(nodes, inputs, tensors, context, find_type='ConcatV2'),
'Swish' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Swish',
input = inputs
),
'LeakyRelu' : lambda nodes, inputs, tensors, _:
Struct(
op = 'LeakyRelu',
input = inputs
),
# TODO:'Round'
# TODO:'Rsqrt'
}
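# Every transform above receives (matched nodes, non-const inputs, const tensors,
# conversion context) and returns either a Struct describing a single replacement layer
# or a list of already-built layers (e.g. from the barracuda.Build helpers).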
# Debug
def debug(s):
print(s)
return s
# Helper
def embody(v, default=0):
return default if v is None else v
# Parse
def get_attr(node, attr_name, default=None):
if type(node) == Struct:
if hasattr(node, attr_name):
return getattr(node, attr_name)
else:
return default
# See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto
val = node.attr[attr_name]
if val.HasField("list"):
return val.list.i
# NOTE: there is no way here to identify the element type of the list, BUT it is almost always list(int)
# except list(float) in FractionalAvg/MaxPool
if val.HasField("b"):
return val.b
if val.HasField("i"):
return val.i
if val.HasField("f"):
return val.f
if val.HasField("s"):
return val.s.decode("utf-8")
if val.HasField("shape"):
return val.shape
if val.HasField("tensor"):
return val.tensor
return default
def get_epsilon(layer):
return get_attr(layer, 'epsilon', default=0.001) # default epsilon taken from tf.layers.batch_normalization
def get_layer_rank(layer):
shape = get_attr(layer, 'shape')
if not shape:
return None
if isinstance(shape, list):
return 1
shape = [dim.size for dim in shape.dim]
return len(shape)
def get_layer_shape(layer):
shape = get_attr(layer, 'shape')
if not shape:
return [-1, -1, -1, -1]
shape = [dim.size for dim in shape.dim]
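# Lower-rank placeholder shapes are mapped into Barracuda's 4D layout below,
# e.g. [batch, features] => [batch, 1, 1, features]; 4D NHWC shapes pass through as-is.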
if len(shape) == 1:
return [1, 1, 1, shape[0]]
if len(shape) == 2:
return [shape[0], 1, 1, shape[1]]
if len(shape) == 3:
return [shape[0], 1, shape[1], shape[2]]
return shape
def get_tensor_dims(tensor):
if isinstance(tensor, np.ndarray):
return np.shape(tensor)
dims = []
if tensor.tensor_shape:
dims = [v.size for v in tensor.tensor_shape.dim]
if tensor.float_val:
dims = np.shape(tensor.float_val)
if tensor.int_val:
dims = np.shape(tensor.int_val)
if tensor.bool_val:
dims = np.shape(tensor.bool_val)
return dims
def get_tensor_dtype(tensor):
if isinstance(tensor, np.ndarray):
return tensor.dtype
dataType = ''
fields = tensor.ListFields()
for field, value in fields:
if field.name == 'dtype' and field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
dataType = field.enum_type.values_by_number.get(value, None).name
return dataType
def get_tensor_data(tensor):
if isinstance(tensor, np.ndarray):
return tensor.astype(float)
dims = get_tensor_dims(tensor)
elems = np.prod(dims)
if tensor.tensor_content:
# TODO: support other types
dataType = get_tensor_dtype(tensor)
if dataType == "DT_FLOAT":
data = struct.unpack('<'+str(elems)+'f', tensor.tensor_content)
elif dataType == "DT_INT32":
data = struct.unpack('<'+str(elems)+'i', tensor.tensor_content)
elif dataType == "DT_BOOL":
data = struct.unpack('<'+str(elems)+'?', tensor.tensor_content)
else:
print('UNSUPPORTED: data type', dataType)
elif tensor.float_val:
data = tensor.float_val
elif tensor.int_val:
data = np.array(tensor.int_val, dtype=float)
elif tensor.bool_val:
data = np.array(tensor.bool_val, dtype=float)
else:
print('[x] CRITICAL ! UNSUPPORTED: data type', get_tensor_dtype(tensor))
return None
return np.array(data).reshape(dims)
def flatten(items,enter=lambda x:isinstance(x, list)):
# http://stackoverflow.com/a/40857703
# https://github.com/ctmakro/canton/blob/master/canton/misc.py
"""Yield items from any nested iterable; see REF."""
for x in items:
if enter(x):
yield from flatten(x)
else:
yield x
def replace_strings_in_list(array_of_strings, replace_with_strings):
"A value in replace_with_strings can be either a single string or a list of strings"
potentially_nested_list = [replace_with_strings.get(s) or s for s in array_of_strings]
return list(flatten(potentially_nested_list))
def remove_duplicates_from_list(array):
"Preserves the order of elements in the list"
output = []
unique = set()
for a in array:
if a not in unique:
unique.add(a)
output.append(a)
return output
#########################################################
def pool_to_HW(shape, data_frmt):
""" Convert from NHWC|NCHW => HW
"""
if len(shape) != 4:
return shape # Not NHWC|NCHW, return as is
if data_frmt == 'NCHW':
return [shape[2], shape[3]]
return [shape[1], shape[2]]
def strides_to_HW(shape, format):
return pool_to_HW(shape, format)
def axis_to_barracuda(axis, input_rank):
N = 0; H = 1; W = 2; C = 3
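# Example: for a rank-2 input (N__C layout) axis=-1 and axis=1 both map to Barracuda's
# channels axis (3), while axis=0 maps to the batch axis (0).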
if axis < 0:
axis = input_rank + axis # negative axis counts from the end
assert(axis >= 0)
assert(axis < input_rank)
if (input_rank == 4):
# [NHWC]
return [N,H,W,C][axis]
if (input_rank == 3):
# [N_WC]
return [N,W,C][axis]
elif (input_rank == 2):
# [N__C]
return [N,C][axis]
elif (input_rank == 1):
# [___C]
return [C][axis]
return -1
#########################################################
def sqr_diff(name, a, b):
nn = barracuda.Build(name)
d = nn.sub(a, b)
nn.mul(d, d, out=name)
return nn.layers
def strided_slice(name, input, input_rank, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
assert (input_rank != -1)
begin = begin.astype(np.int32).tolist()
end = end.astype(np.int32).tolist()
strides = strides.astype(np.int32).tolist()
# StridedSlice range and mask descriptions: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice
# TODO: ellipsis and new_axis masks are unlikely to work well together with the current implementation
assert len(begin) == len(end)
assert len(begin) == len(strides)
# prepare begin, end, stride arrays
output_rank = input_rank
insert_pos = 0
while (ellipsis_mask):
ellipsis_mask >>= 1
insert_pos += 1
# NOTE: begin=0, end=0, stride=1 <= full range from existing axis
# begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element
# begin=N, end=N, stride=0 <= shrink axis to single Nth element
while len(begin) < input_rank:
if insert_pos:
begin.insert(insert_pos, 0)
end.insert(insert_pos, 0)
strides.insert(insert_pos, 1)
else:
begin.append(0)
end.append(0)
strides.append(1)
assert len(begin) <= input_rank
descriptor_count = input_rank
for i in range(len(begin)):
if begin_mask & (1 << i): begin[i] = 0
if end_mask & (1 << i): end[i] = 0
if new_axis_mask & (1 << i):
begin[i] = end[i] = strides[i] = 0
output_rank += 1
if shrink_axis_mask & (1 << i):
end[i] = begin[i]
strides[i] = 0
output_rank -= 1
# convert to Barracuda layout
descriptor_count = len(begin)
assert(descriptor_count <= 4)
if (descriptor_count == 3):
begin = [begin[0], 0, begin[1], begin[2]]
end = [end[0], 0, end[1], end[2]]
strides = [strides[0], 1, strides[1], strides[2]]
elif (descriptor_count == 2):
begin = [begin[0], 0, 0, begin[1]]
end = [end[0], 0, 0, end[1]]
strides = [strides[0], 1, 1, strides[1]]
elif (descriptor_count == 1):
begin = [0, 0, 0, begin[0]]
end = [0, 0, 0, end[0]]
strides = [1, 1, 1, strides[0]]
nn = barracuda.Build(name)
nn.strided_slice(input, begin, end, strides, output_rank, out=name)
return nn.layers
# search backwards starting from index_of_actual_output_node for non-const node
def locate_actual_output_node(nodes, index_of_actual_output_node=-1, find_type='Reshape'):
while (-index_of_actual_output_node-1) < len(nodes) and nodes[index_of_actual_output_node].op != find_type:
index_of_actual_output_node -= 1
actual_output_node = nodes[index_of_actual_output_node]
assert(-index_of_actual_output_node < len(nodes))
return actual_output_node
def gru(nodes, inputs, tensors, context, index_of_actual_output_node, assert_output_node_op_type=None):
assert(len(inputs) == 2)
def find_tensor_by_name(name, default=None):
nonlocal tensors
candidates = [t for t in tensors if t.name.endswith(name)]
return candidates[0].data if candidates else default
input = inputs[-1]
state = inputs[0]
gates_kernel = find_tensor_by_name('/gates/kernel')
gates_bias = find_tensor_by_name('/gates/bias', default=np.zeros(np.shape(gates_kernel)[-1]))
candidate_kernel = find_tensor_by_name('/candidate/kernel')
candidate_bias = find_tensor_by_name('/candidate/bias', default=np.zeros(np.shape(candidate_kernel)[-1]))
new_state = nodes[-1].name + '_h'
assert(np.shape(gates_kernel)[-1] == np.shape(gates_bias)[-1])
assert(np.shape(candidate_kernel)[-1] == np.shape(candidate_bias)[-1])
num_gates = 2
seq_length = 1
hidden_size = np.shape(gates_kernel)[-1] // num_gates
gate_kernels = np.split(gates_kernel, num_gates, axis=-1)
gate_biases = np.split(gates_bias, num_gates, axis=-1)
context.model_tensors['kernel_r'] = gate_kernels[0]
context.model_tensors['kernel_u'] = gate_kernels[1]
context.model_tensors['kernel_c'] = candidate_kernel
context.model_tensors['bias_r'] = gate_biases[0]
context.model_tensors['bias_u'] = gate_biases[1]
context.model_tensors['bias_c'] = candidate_bias
context.layer_ranks[state] = 2
new_layers = barracuda.gru('gru', input, state,
'kernel_r', 'kernel_u', 'kernel_c',
'bias_r', 'bias_u', 'bias_c',
new_state)
state_shape = [1, 1, seq_length, hidden_size]
context.model_memories += [state_shape, state, new_state]
# map expected output of the replaced pattern to output from our GRU cell
actual_output_node = locate_actual_output_node(nodes, index_of_actual_output_node, assert_output_node_op_type)
context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state
return new_layers
def basic_lstm(nodes, inputs, tensors, context, find_type='Reshape'):
assert(len(inputs) == 2)
def find_tensor_by_name(name, default=None):
nonlocal tensors
candidates = [t for t in tensors if t.name.endswith(name)]
return candidates[0].data if candidates else default
def find_forget_bias():
nonlocal nodes
nonlocal tensors
# TODO: make it more fault-tolerant
# search for scalar float constant that is input to Add node
# and hope it is not a constant for some complex activation function
for t in tensors:
if np.prod(t.shape) == 1 and get_tensor_dtype(t.obj) == "DT_FLOAT":
for n in nodes:
if n.op == 'Add' and t.name in n.input:
return t.data
return np.zeros(1)
input = inputs[-1]
state_c = inputs[0] + '_c'
state_h = inputs[0] + '_h'
kernel = find_tensor_by_name('/kernel')
bias = find_tensor_by_name('/bias', default=np.zeros(np.shape(kernel)[-1]))
forget_bias = find_forget_bias()
new_state_c = nodes[-1].name + '_c'
new_state_h = nodes[-1].name + '_h'
assert(np.shape(kernel)[-1] == np.shape(bias)[-1])
num_gates = 4
seq_length = 1
hidden_size = np.shape(kernel)[-1] // num_gates
kernels = np.split(kernel, num_gates, axis=-1)
biases = np.split(bias, num_gates, axis=-1)
context.model_tensors['kernel_i'] = kernels[0]
context.model_tensors['kernel_j'] = kernels[1]
context.model_tensors['kernel_f'] = kernels[2]
context.model_tensors['kernel_o'] = kernels[3]
context.model_tensors['bias_i'] = biases[0]
context.model_tensors['bias_j'] = biases[1]
context.model_tensors['bias_f'] = biases[2] + forget_bias
context.model_tensors['bias_o'] = biases[3]
context.layer_ranks[state_c] = 2
context.layer_ranks[state_h] = 2
# lstm_value/strided_slice/stack => lstm_value
lstm_name = re.match('^([a-zA-Z/]*lstm[_a-z]*)/.*', next(i.name for i in nodes if re.match('^[a-zA-Z/]*lstm[_a-z]*/.*', i.name))).group(1)
new_layers = barracuda.lstm(lstm_name, input, state_c, state_h,
'kernel_i', 'kernel_j', 'kernel_f', 'kernel_o',
'bias_i', 'bias_j', 'bias_f', 'bias_o',
new_state_c, new_state_h)
state_shape = [1, 1, seq_length, hidden_size]
context.model_memories += [state_shape, state_c, new_state_c]
context.model_memories += [state_shape, state_h, new_state_h]
# map expected output of the replaced pattern to output from our LSTM cell
actual_output_node = locate_actual_output_node(nodes, -1, find_type)
concat_out_node = locate_actual_output_node(nodes, -1, 'ConcatV2')
context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state_h
context.map_ignored_layer_to_its_input[concat_out_node.name] = new_state_c
return new_layers
#########################################################
def process_layer(layer, context, args):
model_tensors = context.model_tensors
input_shapes = context.input_shapes
layer_ranks = context.layer_ranks
map_ignored_layer_to_its_input = context.map_ignored_layer_to_its_input
name = layer.name
class_name = layer.op
inputs = layer.input # Tensorflow inputs are always explicit, but in case of Keras we had 'inputs = layer.input or [prev_layer_name]'
inputs = replace_strings_in_list(inputs, map_ignored_layer_to_its_input)
if class_name == 'Nop':
assert(len(inputs) <= 1)
map_ignored_layer_to_its_input[name] = inputs
return
if class_name == 'Const':
model_tensors[name] = layer.attr["value"].tensor
layer_ranks[name] = get_layer_rank(layer) or 1 # we treat constants without shape as rank=1 (scalar converted to tensor)
return
if class_name == 'Placeholder':
assert(inputs == [])
map_ignored_layer_to_its_input[name] = inputs
input_shapes[name] = get_layer_shape(layer)
layer_ranks[name] = get_layer_rank(layer)
return
if class_name == 'Identity':
connected_to_const = len(inputs) == 1 and inputs[0] in model_tensors
if connected_to_const:
map_ignored_layer_to_its_input[name] = inputs
return
else:
# treat Identity layer that are connected to processing nodes
# as output from the network
class_name = 'Linear'
if args.print_layers or args.verbose:
var_tensors = [i for i in inputs if i not in model_tensors]
const_tensors = [i for i in inputs if i in model_tensors]
print("'%s' %s Vars:%s Const:%s" % (name, class_name, var_tensors, const_tensors))
if class_name in known_activations:
activation = class_name
class_name = 'Activation'
else:
activation = 'Linear'
if not class_name in known_classes:
if class_name in requires_runtime_flag:
print('SKIP:', class_name, 'layer is used only for training')
else:
print('IGNORED:', class_name, 'unknown layer')
map_ignored_layer_to_its_input[name] = inputs
return
klass = known_classes[class_name]
if type(klass) == int:
klass = Struct(id = klass)
o_l = Struct()
o_l.type = klass.id
o_l.class_name = class_name
o_l.name = name
auto_pad = get_attr(layer, 'padding') # layer.attr['padding'].s.decode("utf-8")
pads = get_attr(layer, 'pads')
strides = get_attr(layer, 'strides') # layer.attr['strides'].list.i
pool_size = get_attr(layer, 'ksize') # layer.attr['ksize'].list.i
shape = get_attr(layer, 'shape')
starts = get_attr(layer, 'starts')
ends = get_attr(layer, 'ends')
slice_strides = get_attr(layer, 'slice_strides')
rank = get_attr(layer, 'rank') or get_layer_rank(layer)
data_frmt = get_attr(layer, 'data_format') # layer.attr['data_format'].s.decode("utf-8")
axis = get_attr(layer, 'axis')
alpha = get_attr(layer, 'alpha', default=1)
beta = get_attr(layer, 'beta')
if activation and not activation in known_activations:
print('IGNORED: unknown activation', activation)
if auto_pad and not auto_pad in known_paddings:
print('IGNORED: unknown padding', auto_pad)
if data_frmt and not data_frmt in supported_data_formats:
print('UNSUPPORTED: data format', data_frmt)
o_l.activation = known_activations.get(activation) or 0
o_l.pads = known_paddings.get(auto_pad) if auto_pad else pads or starts or [0,0,0,0]
o_l.strides = strides_to_HW(strides, data_frmt) if strides else slice_strides or []
o_l.pool_size = pool_to_HW(pool_size, data_frmt) if pool_size else ends or shape or []
o_l.axis = embody(axis, default=-1)
o_l.alpha = embody(alpha, default=1)
o_l.beta = beta or 0
o_l.rank = -1 # default initialization, actual value will be set later on in this function
tensor_names = [i for i in inputs if i in model_tensors]
# temp_tensor_data = get_tensor_data(model_tensors[x])
# if temp_tensor_data is not None:
o_l.tensors = [Struct(name = x, shape = get_tensor_dims(model_tensors[x]), data = get_tensor_data(model_tensors[x]))
for x in tensor_names]
# Patch shapes & data
layer_has_model_tensors = len(o_l.tensors) > 0
if hasattr(klass, 'out_shapes') and layer_has_model_tensors:
shapes = klass.out_shapes([x.shape for x in o_l.tensors])
# if we have more shapes than actual tensors,
# then create & fill missing tensors with zeros
in_tensor_num = len(o_l.tensors)
for index, new_shape in enumerate(shapes):
if index >= in_tensor_num:
new_tensor = Struct(name = ('%s/patch:%i') % (name, index-in_tensor_num),
shape = new_shape,
data = np.zeros(new_shape))
o_l.tensors.append(new_tensor)
assert(len(shapes) <= len(o_l.tensors))
if hasattr(klass, 'patch_data'):
data = [x.data for x in o_l.tensors]
patch_data_fn = klass.patch_data
patch_data_expected_arg_count = patch_data_fn.__code__.co_argcount
patch_data_args = (data, layer) if patch_data_expected_arg_count > 1 else (data,)
tensor_data = patch_data_fn(*patch_data_args)
o_l.tensors = o_l.tensors[:len(tensor_data)] # resize tensor array to match patched data - patching might reduce number of tensors
for x, data in zip(o_l.tensors, tensor_data):
x.data = data
# after this point we should have equal amount of shapes and tensors
assert(len(o_l.tensors) == len(shapes))
for x, shape in zip(o_l.tensors, shapes):
assert x.data.size == np.prod(shape)
x.shape = shape
o_l.inputs = [i for i in inputs if i not in model_tensors]
else:
# no 'patch_data' lambda was specified, op does not require tensor args
o_l.tensors = []
o_l.inputs = inputs
# Force all tensors to float32
for x in o_l.tensors:
x.data = x.data.astype(np.float32)
input_ranks = [layer_ranks.get(i, -1) for i in o_l.inputs]
for i in o_l.inputs:
if i not in layer_ranks and 'lstm' not in i:
print("WARNING: rank unknown for tensor", i, "while processing node", name)
if hasattr(klass, 'rank'):
rank = klass.rank
if hasattr(rank, '__call__'):
assert(-1 not in input_ranks) # for rank() lambda all input ranks have to be known (not -1)
rank = rank(input_ranks)
if rank is None:
def all_elements_equal(arr): # http://stackoverflow.com/q/3844948/
return arr.count(arr[0]) == len(arr)
assert(len(input_ranks) > 0)
assert(all_elements_equal(input_ranks))
rank = input_ranks[0]
layer_ranks[name] = rank
o_l.rank = rank
# Layer is ready
context.layers.append(o_l)
class ModelBuilderContext:
def __init__(self):
self.layers = []
self.input_shapes = {}
self.model_tensors = {}
self.model_memories = []
self.layer_ranks = {}
self.map_ignored_layer_to_its_input = {}
def process_model(model, args):
o_context = ModelBuilderContext()
# Find node patterns
nodes_as_array = [node for node in model.node]
nodes_as_array = slow_but_stable_topological_sort(nodes_as_array, verbose=True)
node_index = 0
while node_index < len(nodes_as_array):
node = nodes_as_array[node_index]
match = False
for pattern_repr, pattern_name in known_patterns.items():
pattern = eval(pattern_repr)
if node_index + len(pattern) > len(nodes_as_array):
continue # pattern too long, skip
require_exact_match = (pattern[0] == 'Const' or pattern[0] == 'Identity')
pattern_end = node_index
def match_node(node, pattern):
return node.op == pattern or (hasattr(pattern, 'match') and pattern.match(node.name))
for p in pattern:
if not require_exact_match:
while pattern_end < len(nodes_as_array) and nodes_as_array[pattern_end].op != p and (
nodes_as_array[pattern_end].op == 'Const' or
nodes_as_array[pattern_end].op == 'Identity'):
pattern_end += 1
if pattern_end >= len(nodes_as_array):
break
match = False
if (hasattr(p, 'match')): # regexp
while pattern_end < len(nodes_as_array) and p.match(nodes_as_array[pattern_end].name):
match = True
pattern_end += 1
else: # exact string
match = nodes_as_array[pattern_end].op == p
pattern_end += 1
if not match:
break
def get_tensors(pattern_nodes):
nonlocal o_context
map_ignored_layer_to_its_input = o_context.map_ignored_layer_to_its_input
model_tensors = o_context.model_tensors
# tensors <= all Const nodes within this pattern
const_nodes = [n for n in pattern_nodes if n.op == 'Const']
# TODO: unify / reuse code from process_layer
identity_nodes = [n for n in pattern_nodes if n.op == 'Identity']
for i in identity_nodes:
inputs = replace_strings_in_list(i.input, map_ignored_layer_to_its_input)
map_ignored_layer_to_its_input[i.name] = inputs
# gather inputs from Op nodes (not Const, not Identity)
op_nodes = [n for n in pattern_nodes if n not in const_nodes and n not in identity_nodes]
inputs_to_op_nodes = list(flatten([list(flatten(n.input)) for n in op_nodes]))
inputs_to_op_nodes = replace_strings_in_list(inputs_to_op_nodes, map_ignored_layer_to_its_input)
inputs_to_op_nodes = [i.split(':')[0] for i in inputs_to_op_nodes]
const_nodes_by_name = {n.name:n for n in const_nodes}
tensors = []
for i in inputs_to_op_nodes:
if i in model_tensors:
src = model_tensors[i]
temp_tensor_data = get_tensor_data(src)
if not ( temp_tensor_data is None ):
tensors += [Struct(name = i, obj = src, shape = get_tensor_dims(src), data = temp_tensor_data)]
elif i in const_nodes_by_name:
src = const_nodes_by_name[i].attr["value"].tensor
temp_tensor_data = get_tensor_data(src)
if not ( temp_tensor_data is None ):
tensors += [Struct(name = i, obj = src, shape = get_tensor_dims(src), data = temp_tensor_data)]
tensor_names = [n.name for n in tensors]
# filter only inputs that are coming from nodes that are outside this pattern
# preserve the order
pattern_nodes = [n.name for n in pattern_nodes] + tensor_names
#inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if nodes_by_name[i] not in pattern_nodes])
inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if i not in pattern_nodes])
return inputs_from_outside_pattern, tensors
if match:
nodes = nodes_as_array[node_index:pattern_end]
name = nodes[-1].name
var_tensors, const_tensors = get_tensors(nodes)
if args.print_patterns or args.verbose:
print('PATTERN:', name, '~~', pattern_name, '<-', var_tensors, '+', [t.name for t in const_tensors])
print(' ', pattern)
for n in nodes:
if n.op == 'Const' or n.op == 'Identity':
process_layer(n, o_context, args)
new_layers = transform_patterns[pattern_name](nodes, var_tensors, const_tensors, o_context)
if not isinstance(new_layers, list):
if not hasattr(new_layers, 'name'): new_layers.name = name
new_layers = [new_layers]
for l in new_layers:
# TODO: prefix new layer names with scope, patch inputs
#l.name = name + '/' + l.name
process_layer(l, o_context, args)
node_index = pattern_end
break # pattern found & processed
if not match:
# TODO: gather tensors in the same way as patterns do
process_layer(node, o_context, args)
node_index += 1
def find_unconnected_const_nodes(nodes):
nodes_with_consts = {node.name: node for node in nodes if node.op == 'Const'}
for node in nodes:
for i in node.input:
nodes_with_consts.pop(i, None)
return list(nodes_with_consts.keys())
return o_context.layers, o_context.input_shapes, o_context.model_tensors, o_context.model_memories, \
find_unconnected_const_nodes(nodes_as_array)
# Sort nodes so that all input dependencies are satisfied beforehand
# while preserving original order of the nodes in the model whenever possible.
# NOTE: preservation of original order is important for pattern matching
def slow_but_stable_topological_sort(nodes, verbose):
nodes_with_consts = [node for node in nodes if node.op == 'Const']
nodes_for_sorting = [node for node in nodes if node.op != 'Const']
# TODO: optimize for performance
# based on http://blog.gapotchenko.com/stable-topological-sort
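# In short: each pass moves a producer node in front of the earliest node that consumes
# it, which preserves the original relative order of independent nodes; worst case this
# is O(n^2) per pass, hence the progress dots printed below for large graphs.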
def assign_ids(nodes):
ids = []
id_by_name = {}
id = 0
for node in nodes:
id_by_name[node.name] = id
ids.append(id)
id += 1
inputs_by_id = [None] * len(nodes)
for node in nodes:
id = id_by_name[node.name]
inputs_by_id[id] = {id_by_name.get(i, -1) for i in node.input}
return ids, inputs_by_id
def sort(ids, inputs_by_id, verbose_lambda):
sorted = False
n = len(ids)
while not sorted:
sorted = True
for i in range(n):
for j in range (i):
if ids[i] in inputs_by_id[ids[j]]:
tmp = ids.pop(i)
ids.insert(j, tmp)
sorted = False
verbose_lambda(sorted)
return ids
prefix_printed = False
def print_status(sorted):
nonlocal prefix_printed
if not sorted:
if not prefix_printed:
print('Sorting model, may take a while...', end="", flush=True)
prefix_printed = True
else:
print('.', end="", flush=True)
else:
if prefix_printed:
print(' Done!')
ids, inputs_by_id = assign_ids(nodes_for_sorting)
ids = sort(ids, inputs_by_id, lambda sorted: print_status(sorted) if verbose else None)
assert(len(ids) == len(nodes_for_sorting))
assert(len(ids) + len(nodes_with_consts) == len(nodes))
return nodes_with_consts + [nodes_for_sorting[id] for id in ids]
def very_slow_but_stable_topological_sort(nodes, verbose):
# TODO: optimize for performance
# based on http://blog.gapotchenko.com/stable-topological-sort
n = len(nodes)
sorted = False
while not sorted:
sorted = True
for i in range(n):
for j in range (i):
if nodes[i].name in nodes[j].input:
tmp = nodes.pop(i)
nodes.insert(j, tmp)
sorted = False
assert(len(nodes) == n)
return nodes
#########################################################
def convert(source_file, target_file, trim_unused_by_output="", verbose=False, compress_f16=False):
"""
Converts a TensorFlow model into a Barracuda model.
:param source_file: The TensorFlow Model
:param target_file: The name of the file the converted model will be saved to
:param trim_unused_by_output: The regexp matching the output nodes that should remain in the model. All other unconnected nodes will be removed.
:param verbose: If True, will display debug messages
:param compress_f16: If True, the float values will be converted to f16
:return:
"""
if (type(verbose)==bool):
args = Struct()
args.verbose = verbose
args.print_layers = verbose
args.print_source_json = verbose
args.print_barracuda_json = verbose
args.print_layer_links = verbose
args.print_patterns = verbose
args.print_tensors = verbose
args.print_supported_ops = verbose
else:
args = verbose
if args.print_supported_ops:
barracuda.print_known_operations(known_classes, known_activations)
# Load Tensorflow model
print("Converting %s to %s" % (source_file, target_file))
f = open(source_file, 'rb')
i_model = tf.compat.v1.GraphDef()
i_model.ParseFromString(f.read())
if args.verbose:
print('OP_TYPES:', {layer.op for layer in i_model.node})
if args.print_source_json or args.verbose:
for layer in i_model.node:
if not layer.op == 'Const':
print('MODEL:', MessageToJson(layer) + ",")
# Convert
o_model = barracuda.Model()
o_model.layers, o_input_shapes, o_model.tensors, o_model.memories, o_model.globals = \
process_model(i_model, args)
# Cleanup unconnected Identities (they might linger after processing complex node patterns like LSTM)
def cleanup_layers(layers):
all_layers = {l.name for l in layers}
all_inputs = {i for l in layers for i in l.inputs}
def is_unconnected_identity(layer):
if layer.class_name == 'Activation' and layer.activation == 0: # Identity
assert(len(layer.inputs) == 1)
if layer.inputs[0] not in all_layers and layer.name not in all_inputs:
return True
return False
return [l for l in layers if not is_unconnected_identity(l)]
o_model.layers = cleanup_layers(o_model.layers)
# Trim
if trim_unused_by_output:
o_model.layers = barracuda.trim(o_model.layers, trim_unused_by_output, args.verbose)
# Create load layer for constants
def dims_to_barracuda_shape(dims):
shape = list(dims)
while len(shape) < 4:
shape = [1] + shape
return shape
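# e.g. dims (64,) become [1, 1, 1, 64]; 4D convolution kernel dims pass through unchanged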
# temp_tensor_data = get_tensor_data(tensor)
# if not ( temp_tensor_data is None ):
# if temp_tensor_data is not None:
barracuda.setup_constants(o_model,
lambda tensor: dims_to_barracuda_shape(get_tensor_dims(tensor)),
lambda tensor: get_tensor_data(tensor))
# Find model inputs & outputs
all_inputs = {i for l in o_model.layers for i in l.inputs}
all_layers = {l.name for l in o_model.layers}
# global inputs => are inputs that are NOT connected to any layer in the network
# global outputs => are outputs that are NOT feeding any layer in the network OR are coming from Identity layers
o_model.inputs = {i:o_input_shapes[i] for l in o_model.layers for i in l.inputs if i not in all_layers and i not in o_model.memories}
def is_output_layer(layer):
if layer.class_name == 'Const': # Constants never count as global output even when unconnected
return False
if layer.name not in all_inputs: # this layer is not inputing to any other layer
return True
if layer.class_name == 'Activation' and layer.activation == 0: # Identity marks global output
return True
return False
o_model.outputs = [l.name for l in o_model.layers if is_output_layer(l)]
# Compress
if compress_f16:
o_model = barracuda.compress(o_model)
# Sort model so that layer inputs are always ready upfront
o_model.layers = barracuda.sort(o_model.layers, o_model.inputs, o_model.memories, args.verbose)
o_model.layers = barracuda.fuse(o_model.layers, args.verbose)
# Summary
barracuda.summary(o_model,
print_layer_links = args.print_layer_links or args.verbose,
print_barracuda_json = args.print_barracuda_json or args.verbose,
print_tensors = args.print_tensors or args.verbose)
# Write to file
barracuda.write(o_model, target_file)
print('DONE: wrote', target_file, 'file.')