@ppwwyyxx
Last active March 18, 2021 23:30
#-*- coding: utf-8 -*-
# File: test-conv-flop.py
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis
from pypapi import events, papi_high as high


def main():
    model = nn.Conv2d(256, 128, 3, padding=1)
    model.cpu()
    model.double()  # run in float64 so PAPI_DP_OPS counts every FP operation
    model.eval()

    print("PAPI, theoretical")
    for bs in range(1, 13):
        input = torch.randn((bs, 256, 28, 28)).double()
        with torch.no_grad():
            for evt in ['PAPI_DP_OPS']:
                # Measure hardware double-precision FP operations around the forward pass.
                high.start_counters([getattr(events, evt)])
                _ = model(input)
                papi_flop = high.stop_counters()[0] / 1e9
                # fvcore counts one FLOP per multiply-accumulate, while PAPI counts
                # multiplies and adds separately, hence the factor of 2.
                flop = FlopCountAnalysis(model, input).total() / 1e9
                flop *= 2  # different convention
                print(papi_flop, flop)


if __name__ == '__main__':
    main()
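
For reference, the "theoretical" column in this script's output (reproduced in the comment at the bottom) can be checked by hand. The sketch below is an illustrative check added here, not part of the original gist; it counts 2 · C_out · C_in · k² · H · W double-precision operations for the 3×3 convolution above at batch size 1:

# Back-of-the-envelope count for Conv2d(256, 128, 3, padding=1) on a 28x28 input.
bs, c_in, c_out, k, h, w = 1, 256, 128, 3, 28, 28    # padding=1 keeps H, W at 28
macs = bs * c_out * c_in * k * k * h * w             # multiply-accumulates
print(2 * macs / 1e9)                                # 0.462422016 GFLOPs (multiplies + adds)

This matches the 0.462422016 printed for bs=1 in the comment below, and that column grows linearly with batch size.
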
The second snippet runs the same comparison on fairseq's roberta.large.mnli and additionally prints fvcore's per-module breakdown:

#-*- coding: utf-8 -*-
# File:
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis, flop_count_table, flop_count_str
from pypapi import events, papi_high as high


def main():
    roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
    roberta.cpu()
    roberta.double()  # float64 so PAPI_DP_OPS counts every FP operation
    roberta.eval()

    tgt_len = 50
    tokens = roberta.encode('hi' * (tgt_len - 3)).unsqueeze(0).to(device=roberta.device)
    assert tokens.numel() == tgt_len

    # Wrapper module so FlopCountAnalysis traces extract_features() rather than forward().
    class A(nn.Module):
        def forward(self, tokens):
            return self.m.extract_features(tokens)

    with torch.no_grad():
        for evt in ['PAPI_DP_OPS']:
            # Hardware counter measurement around the actual forward pass.
            high.start_counters([getattr(events, evt)])
            features, _ = roberta.model.extract_features(tokens)
            papi_flops = high.stop_counters()
            print('total flops (papi, {})'.format(evt), papi_flops[0] / 1e9)

        # Static FLOP analysis with fvcore.
        model = A()
        model.m = roberta.model
        flop = FlopCountAnalysis(model, tokens)
        print(flop_count_table(flop, max_depth=5, show_param_shapes=False))
        print(flop_count_str(flop))
        print("Total", flop.total() / 1e9)


if __name__ == '__main__':
    main()

This prints:
total flops (papi, PAPI_DP_OPS) 9.003876419
| module | #parameters | #flops |
|:---------------------------------------------------------------|:--------------|:------------|
| model.m | 0.356G | 17.9G |
| m.encoder | 0.355G | 17.9G |
| m.encoder.sentence_encoder | 0.354G | 15.2G |
| m.encoder.sentence_encoder.embed_tokens | 51.5M | 0 |
| m.encoder.sentence_encoder.embed_positions | 0.526M | 0 |
| m.encoder.sentence_encoder.layernorm_embedding | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers | 0.302G | |
| m.encoder.sentence_encoder.layers.0 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.0.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.0.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.0.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.0.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.0.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.1 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.1.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.1.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.1.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.1.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.1.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.2 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.2.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.2.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.2.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.2.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.2.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.3 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.3.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.3.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.3.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.3.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.3.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.4 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.4.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.4.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.4.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.4.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.4.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.5 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.5.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.5.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.5.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.5.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.5.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.6 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.6.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.6.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.6.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.6.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.6.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.7 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.7.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.7.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.7.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.7.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.7.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.8 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.8.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.8.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.8.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.8.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.8.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.9 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.9.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.9.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.9.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.9.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.9.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.10 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.10.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.10.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.10.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.10.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.10.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.11 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.11.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.11.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.11.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.11.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.11.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.12 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.12.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.12.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.12.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.12.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.12.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.13 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.13.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.13.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.13.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.13.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.13.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.14 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.14.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.14.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.14.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.14.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.14.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.15 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.15.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.15.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.15.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.15.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.15.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.16 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.16.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.16.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.16.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.16.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.16.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.17 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.17.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.17.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.17.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.17.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.17.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.18 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.18.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.18.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.18.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.18.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.18.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.19 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.19.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.19.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.19.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.19.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.19.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.20 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.20.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.20.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.20.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.20.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.20.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.21 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.21.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.21.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.21.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.21.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.21.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.22 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.22.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.22.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.22.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.22.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.22.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.23 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.23.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.23.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.23.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.23.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.23.final_layer_norm | 2.05K | 0.256M |
| m.encoder.lm_head | 1.1M | 2.63G |
| m.encoder.lm_head.dense | 1.05M | 52.4M |
| m.encoder.lm_head.layer_norm | 2.05K | 0.256M |
| m.classification_heads.mnli | 1.05M | |
| m.classification_heads.mnli.dense | 1.05M | |
| m.classification_heads.mnli.out_proj | 3.08K | |
Input sizes (torch.Tensor only): [[50]]
N/A indicates a possibly missing statistic due to how the module was called. Missing values are still included in the parent's total.
A(
n_params: 0.356G, n_flops: 17.9G
(m): RobertaModel(
n_params: 0.356G, n_flops: 17.9G
(encoder): RobertaEncoder(
n_params: 0.355G, n_flops: 17.9G
(sentence_encoder): TransformerEncoder(
n_params: 0.354G, n_flops: 15.2G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(embed_tokens): Embedding(
50265, 1024, padding_idx=1
n_params: 51.5M, n_flops: 0
)
(embed_positions): LearnedPositionalEmbedding(
514, 1024, padding_idx=1
n_params: 0.526M, n_flops: 0
)
(layernorm_embedding): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(layers): ModuleList(
n_params: 0.302G, n_flops: N/A
(0): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(1): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(2): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(3): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(4): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(5): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(6): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(7): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(8): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(9): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(10): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(11): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(12): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(13): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(14): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(15): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(16): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(17): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(18): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(19): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(20): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(21): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(22): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
(23): TransformerEncoderLayer(
n_params: 12.6M, n_flops: 0.635G
(self_attn): MultiheadAttention(
n_params: 4.2M, n_flops: 0.215G
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(k_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(v_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(q_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
)
(self_attn_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
(dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A)
(fc1): Linear(
in_features=1024, out_features=4096, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(fc2): Linear(
in_features=4096, out_features=1024, bias=True
n_params: 4.2M, n_flops: 0.21G
)
(final_layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
)
)
(lm_head): RobertaLMHead(
n_params: 1.1M, n_flops: 2.63G
(dense): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: 52.4M
)
(layer_norm): LayerNorm(
(1024,), eps=1e-05, elementwise_affine=True
n_params: 2.05K, n_flops: 0.256M
)
)
)
(classification_heads): ModuleDict(
n_params: 1.05M, n_flops: N/A
(mnli): RobertaClassificationHead(
n_params: 1.05M, n_flops: N/A
(dense): Linear(
in_features=1024, out_features=1024, bias=True
n_params: 1.05M, n_flops: N/A
)
(dropout): Dropout(
p=0.3, inplace=False
n_params: 0, n_flops: N/A
)
(out_proj): Linear(
in_features=1024, out_features=3, bias=True
n_params: 3.08K, n_flops: N/A
)
)
)
)
)
Total 17.8611712
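
The fvcore totals above can be sanity-checked against the standard Transformer FLOP estimate. The sketch below is a hand calculation added for reference, not part of the original gist; it counts one FLOP per multiply-accumulate, as fvcore does, for a 50-token input to a 24-layer model with d_model=1024 and FFN width 4096:

# Rough cross-check of the fvcore numbers for roberta.large on a 50-token input,
# counting 1 FLOP per multiply-accumulate (fvcore's convention).
seq, d, ffn, layers, vocab = 50, 1024, 4096, 24, 50265

attn_proj = 4 * seq * d * d                 # q/k/v/out projections   ~0.21G
attn_mat  = 2 * seq * seq * d               # QK^T and attn @ V       ~5.1M
fc        = 2 * seq * d * ffn               # fc1 + fc2               ~0.42G
per_layer = attn_proj + attn_mat + fc       # ~0.634G; two LayerNorms bring it to the 0.635G in the table
encoder   = layers * per_layer              # ~15.2G for the sentence encoder
lm_head   = seq * d * d + seq * d * vocab   # dense + vocab projection ~2.63G
print(per_layer / 1e9, encoder / 1e9, (encoder + lm_head) / 1e9)   # ~0.634 15.2 17.8

These figures agree with the per-layer (0.635G), sentence-encoder (15.2G), LM-head (2.63G), and overall (17.9G) numbers reported by fvcore above.
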
ppwwyyxx commented Mar 18, 2021

test-conv-flop.py prints:

PAPI, theoretical
0.463632384 0.462422016
0.463632384 0.924844032
0.463632384 1.387266048
0.463632384 1.849688064
0.927264768 2.31211008
0.927264768 2.774532096
0.927264768 3.236954112
0.927264768 3.699376128
1.390897152 4.161798144
1.390897152 4.62422016
1.390897152 5.086642176
1.390897152 5.549064192
