@echan00
Created November 11, 2019 09:27
encoder.version torch.Size([1])
encoder.embed_tokens.weight torch.Size([41104, 512])
encoder.embed_positions._float_tensor torch.Size([1])
encoder.layers.0.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.0.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.0.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.0.self_attn.out_proj.bias torch.Size([512])
encoder.layers.0.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.0.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.0.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.0.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.0.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.0.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.0.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.0.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.0.fc1.weight torch.Size([1024, 512])
encoder.layers.0.fc1.bias torch.Size([1024])
encoder.layers.0.fc2.weight torch.Size([512, 1024])
encoder.layers.0.fc2.bias torch.Size([512])
encoder.layers.0.final_layer_norm.weight torch.Size([512])
encoder.layers.0.final_layer_norm.bias torch.Size([512])
encoder.layers.1.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.1.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.1.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.1.self_attn.out_proj.bias torch.Size([512])
encoder.layers.1.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.1.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.1.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.1.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.1.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.1.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.1.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.1.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.1.fc1.weight torch.Size([1024, 512])
encoder.layers.1.fc1.bias torch.Size([1024])
encoder.layers.1.fc2.weight torch.Size([512, 1024])
encoder.layers.1.fc2.bias torch.Size([512])
encoder.layers.1.final_layer_norm.weight torch.Size([512])
encoder.layers.1.final_layer_norm.bias torch.Size([512])
encoder.layers.2.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.2.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.2.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.2.self_attn.out_proj.bias torch.Size([512])
encoder.layers.2.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.2.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.2.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.2.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.2.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.2.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.2.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.2.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.2.fc1.weight torch.Size([1024, 512])
encoder.layers.2.fc1.bias torch.Size([1024])
encoder.layers.2.fc2.weight torch.Size([512, 1024])
encoder.layers.2.fc2.bias torch.Size([512])
encoder.layers.2.final_layer_norm.weight torch.Size([512])
encoder.layers.2.final_layer_norm.bias torch.Size([512])
encoder.layers.3.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.3.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.3.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.3.self_attn.out_proj.bias torch.Size([512])
encoder.layers.3.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.3.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.3.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.3.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.3.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.3.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.3.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.3.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.3.fc1.weight torch.Size([1024, 512])
encoder.layers.3.fc1.bias torch.Size([1024])
encoder.layers.3.fc2.weight torch.Size([512, 1024])
encoder.layers.3.fc2.bias torch.Size([512])
encoder.layers.3.final_layer_norm.weight torch.Size([512])
encoder.layers.3.final_layer_norm.bias torch.Size([512])
encoder.layers.4.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.4.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.4.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.4.self_attn.out_proj.bias torch.Size([512])
encoder.layers.4.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.4.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.4.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.4.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.4.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.4.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.4.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.4.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.4.fc1.weight torch.Size([1024, 512])
encoder.layers.4.fc1.bias torch.Size([1024])
encoder.layers.4.fc2.weight torch.Size([512, 1024])
encoder.layers.4.fc2.bias torch.Size([512])
encoder.layers.4.final_layer_norm.weight torch.Size([512])
encoder.layers.4.final_layer_norm.bias torch.Size([512])
encoder.layers.5.self_attn.in_proj_weight torch.Size([1536, 512])
encoder.layers.5.self_attn.in_proj_bias torch.Size([1536])
encoder.layers.5.self_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.5.self_attn.out_proj.bias torch.Size([512])
encoder.layers.5.bert_attn.k_proj_weight torch.Size([512, 768])
encoder.layers.5.bert_attn.v_proj_weight torch.Size([512, 768])
encoder.layers.5.bert_attn.q_proj_weight torch.Size([512, 512])
encoder.layers.5.bert_attn.in_proj_bias torch.Size([1536])
encoder.layers.5.bert_attn.out_proj.weight torch.Size([512, 512])
encoder.layers.5.bert_attn.out_proj.bias torch.Size([512])
encoder.layers.5.self_attn_layer_norm.weight torch.Size([512])
encoder.layers.5.self_attn_layer_norm.bias torch.Size([512])
encoder.layers.5.fc1.weight torch.Size([1024, 512])
encoder.layers.5.fc1.bias torch.Size([1024])
encoder.layers.5.fc2.weight torch.Size([512, 1024])
encoder.layers.5.fc2.bias torch.Size([512])
encoder.layers.5.final_layer_norm.weight torch.Size([512])
encoder.layers.5.final_layer_norm.bias torch.Size([512])
decoder.version torch.Size([1])
decoder.embed_tokens.weight torch.Size([41104, 512])
decoder.embed_positions._float_tensor torch.Size([1])
decoder.layers.0.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.0.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.0.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.0.self_attn.out_proj.bias torch.Size([512])
decoder.layers.0.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.0.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.0.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.0.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.0.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.0.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.0.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.0.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.0.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.0.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.0.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.0.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.0.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.0.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.0.fc1.weight torch.Size([1024, 512])
decoder.layers.0.fc1.bias torch.Size([1024])
decoder.layers.0.fc2.weight torch.Size([512, 1024])
decoder.layers.0.fc2.bias torch.Size([512])
decoder.layers.0.final_layer_norm.weight torch.Size([512])
decoder.layers.0.final_layer_norm.bias torch.Size([512])
decoder.layers.1.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.1.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.1.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.1.self_attn.out_proj.bias torch.Size([512])
decoder.layers.1.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.1.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.1.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.1.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.1.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.1.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.1.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.1.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.1.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.1.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.1.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.1.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.1.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.1.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.1.fc1.weight torch.Size([1024, 512])
decoder.layers.1.fc1.bias torch.Size([1024])
decoder.layers.1.fc2.weight torch.Size([512, 1024])
decoder.layers.1.fc2.bias torch.Size([512])
decoder.layers.1.final_layer_norm.weight torch.Size([512])
decoder.layers.1.final_layer_norm.bias torch.Size([512])
decoder.layers.2.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.2.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.2.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.2.self_attn.out_proj.bias torch.Size([512])
decoder.layers.2.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.2.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.2.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.2.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.2.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.2.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.2.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.2.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.2.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.2.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.2.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.2.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.2.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.2.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.2.fc1.weight torch.Size([1024, 512])
decoder.layers.2.fc1.bias torch.Size([1024])
decoder.layers.2.fc2.weight torch.Size([512, 1024])
decoder.layers.2.fc2.bias torch.Size([512])
decoder.layers.2.final_layer_norm.weight torch.Size([512])
decoder.layers.2.final_layer_norm.bias torch.Size([512])
decoder.layers.3.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.3.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.3.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.3.self_attn.out_proj.bias torch.Size([512])
decoder.layers.3.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.3.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.3.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.3.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.3.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.3.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.3.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.3.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.3.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.3.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.3.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.3.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.3.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.3.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.3.fc1.weight torch.Size([1024, 512])
decoder.layers.3.fc1.bias torch.Size([1024])
decoder.layers.3.fc2.weight torch.Size([512, 1024])
decoder.layers.3.fc2.bias torch.Size([512])
decoder.layers.3.final_layer_norm.weight torch.Size([512])
decoder.layers.3.final_layer_norm.bias torch.Size([512])
decoder.layers.4.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.4.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.4.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.4.self_attn.out_proj.bias torch.Size([512])
decoder.layers.4.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.4.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.4.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.4.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.4.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.4.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.4.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.4.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.4.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.4.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.4.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.4.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.4.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.4.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.4.fc1.weight torch.Size([1024, 512])
decoder.layers.4.fc1.bias torch.Size([1024])
decoder.layers.4.fc2.weight torch.Size([512, 1024])
decoder.layers.4.fc2.bias torch.Size([512])
decoder.layers.4.final_layer_norm.weight torch.Size([512])
decoder.layers.4.final_layer_norm.bias torch.Size([512])
decoder.layers.5.self_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.5.self_attn.in_proj_bias torch.Size([1536])
decoder.layers.5.self_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.5.self_attn.out_proj.bias torch.Size([512])
decoder.layers.5.self_attn_layer_norm.weight torch.Size([512])
decoder.layers.5.self_attn_layer_norm.bias torch.Size([512])
decoder.layers.5.encoder_attn.in_proj_weight torch.Size([1536, 512])
decoder.layers.5.encoder_attn.in_proj_bias torch.Size([1536])
decoder.layers.5.encoder_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.5.encoder_attn.out_proj.bias torch.Size([512])
decoder.layers.5.bert_attn.k_proj_weight torch.Size([512, 768])
decoder.layers.5.bert_attn.v_proj_weight torch.Size([512, 768])
decoder.layers.5.bert_attn.q_proj_weight torch.Size([512, 512])
decoder.layers.5.bert_attn.in_proj_bias torch.Size([1536])
decoder.layers.5.bert_attn.out_proj.weight torch.Size([512, 512])
decoder.layers.5.bert_attn.out_proj.bias torch.Size([512])
decoder.layers.5.encoder_attn_layer_norm.weight torch.Size([512])
decoder.layers.5.encoder_attn_layer_norm.bias torch.Size([512])
decoder.layers.5.fc1.weight torch.Size([1024, 512])
decoder.layers.5.fc1.bias torch.Size([1024])
decoder.layers.5.fc2.weight torch.Size([512, 1024])
decoder.layers.5.fc2.bias torch.Size([512])
decoder.layers.5.final_layer_norm.weight torch.Size([512])
decoder.layers.5.final_layer_norm.bias torch.Size([512])
bert_encoder.embeddings.word_embeddings.weight torch.Size([119547, 768])
bert_encoder.embeddings.position_embeddings.weight torch.Size([512, 768])
bert_encoder.embeddings.token_type_embeddings.weight torch.Size([2, 768])
bert_encoder.embeddings.LayerNorm.weight torch.Size([768])
bert_encoder.embeddings.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.0.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.0.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.0.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.0.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.0.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.0.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.0.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.0.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.0.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.1.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.1.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.1.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.1.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.1.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.1.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.1.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.1.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.1.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.1.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.1.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.1.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.1.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.1.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.1.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.1.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.2.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.2.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.2.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.2.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.2.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.2.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.2.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.2.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.2.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.2.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.2.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.2.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.2.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.2.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.2.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.2.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.3.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.3.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.3.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.3.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.3.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.3.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.3.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.3.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.3.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.3.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.3.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.3.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.3.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.3.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.3.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.3.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.4.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.4.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.4.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.4.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.4.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.4.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.4.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.4.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.4.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.4.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.4.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.4.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.4.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.4.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.4.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.4.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.5.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.5.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.5.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.5.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.5.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.5.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.5.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.5.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.5.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.5.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.5.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.5.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.5.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.5.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.5.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.5.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.6.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.6.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.6.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.6.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.6.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.6.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.6.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.6.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.6.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.6.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.6.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.6.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.6.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.6.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.6.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.6.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.7.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.7.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.7.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.7.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.7.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.7.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.7.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.7.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.7.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.7.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.7.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.7.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.7.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.7.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.7.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.7.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.8.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.8.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.8.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.8.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.8.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.8.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.8.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.8.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.8.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.8.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.8.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.8.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.8.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.8.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.8.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.8.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.9.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.9.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.9.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.9.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.9.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.9.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.9.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.9.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.9.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.9.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.9.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.9.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.9.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.9.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.9.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.9.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.10.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.10.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.10.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.10.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.10.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.10.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.10.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.10.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.10.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.10.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.10.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.10.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.10.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.10.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.10.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.10.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.11.attention.self.query.weight torch.Size([768, 768])
bert_encoder.encoder.layer.11.attention.self.query.bias torch.Size([768])
bert_encoder.encoder.layer.11.attention.self.key.weight torch.Size([768, 768])
bert_encoder.encoder.layer.11.attention.self.key.bias torch.Size([768])
bert_encoder.encoder.layer.11.attention.self.value.weight torch.Size([768, 768])
bert_encoder.encoder.layer.11.attention.self.value.bias torch.Size([768])
bert_encoder.encoder.layer.11.attention.output.dense.weight torch.Size([768, 768])
bert_encoder.encoder.layer.11.attention.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.11.attention.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.11.attention.output.LayerNorm.bias torch.Size([768])
bert_encoder.encoder.layer.11.intermediate.dense.weight torch.Size([3072, 768])
bert_encoder.encoder.layer.11.intermediate.dense.bias torch.Size([3072])
bert_encoder.encoder.layer.11.output.dense.weight torch.Size([768, 3072])
bert_encoder.encoder.layer.11.output.dense.bias torch.Size([768])
bert_encoder.encoder.layer.11.output.LayerNorm.weight torch.Size([768])
bert_encoder.encoder.layer.11.output.LayerNorm.bias torch.Size([768])
bert_encoder.pooler.dense.weight torch.Size([768, 768])
bert_encoder.pooler.dense.bias torch.Size([768])
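The listing above appears to be the parameter names and shapes of a fairseq-style Transformer (6 encoder and 6 decoder layers at width 512, each with extra bert_attn projections) together with a 12-layer, 768-wide bert_encoder whose 119547-entry vocabulary is consistent with bert-base-multilingual-cased. A minimal sketch of how such a dump can be reproduced is shown below; the checkpoint filename and the assumption that the weights sit under a "model" key (as in fairseq checkpoints) are assumptions, not stated in the gist.

from collections import defaultdict

import torch

# Assumed checkpoint path; replace with the actual file.
ckpt = torch.load("checkpoint_best.pt", map_location="cpu")
state_dict = ckpt.get("model", ckpt)  # fairseq checkpoints nest weights under "model"

# Print each parameter name with its shape, matching the listing above.
for name, tensor in state_dict.items():
    print(name, tensor.size())

# Optional: tally parameter counts per top-level component
# (encoder / decoder / bert_encoder).
totals = defaultdict(int)
for name, tensor in state_dict.items():
    totals[name.split(".", 1)[0]] += tensor.numel()

for component, count in totals.items():
    print(f"{component}: {count:,} parameters")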