Created
October 1, 2023 15:04
-
-
Save Birch-san/7709286ec73c6795666050a4f2786309 to your computer and use it in GitHub Desktop.
google/t5-v1_1-small t5-small weight initializations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from transformers import T5ForConditionalGeneration | |
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-small') | |
_inference_mode_context = torch._C._InferenceMode(True) | |
_inference_mode_context.__enter__() | |
model.shared.weight.std() | |
tensor(11.6375) | |
model.shared.weight.mean() | |
tensor(-0.1536) | |
model.lm_head.weight.mean() | |
tensor(0.0011) | |
model.lm_head.weight.std() | |
tensor(1.1614) | |
model.encoder.final_layer_norm.weight.std() | |
tensor(0.0685) | |
model.decoder.final_layer_norm.weight.std() | |
tensor(0.7312) | |
# encoder per-block layernorms | |
torch.stack([layer.layer_norm.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.0131, 0.0199, 0.0199, 0.0243, 0.0307, 0.0311, 0.0288, 0.0339]) | |
torch.stack([layer.layer_norm.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.0252) | |
torch.stack([layer.layer_norm.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.0157, 0.0219, 0.0292, 0.0178, 0.0174, 0.0177, 0.0213, 0.0534]) | |
torch.stack([layer.layer_norm.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.0243) | |
# decoder per-block layernorms | |
torch.stack([layer.layer_norm.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.0174, 0.0294, 0.0948, 0.1030, 0.1150, 0.1660, 0.1392, 0.1612]) | |
torch.stack([layer.layer_norm.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.1033) | |
torch.stack([layer.layer_norm.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.0186, 0.0347, 0.0435, 0.0822, 0.0868, 0.0941, 0.1088, 0.1113]) | |
torch.stack([layer.layer_norm.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.0725) | |
# encoder self-attn qkv per-head (probably), first block/layer | |
model.encoder.block[0].layer[0].SelfAttention.q.weight.unflatten(-2, (6, -1)).mean((-2, -1)) | |
tensor([ 3.0427e-04, -3.3273e-04, -5.7137e-04, 1.3270e-04, -6.0954e-04, | |
5.7666e-05]) | |
model.encoder.block[0].layer[0].SelfAttention.k.weight.unflatten(-2, (6, -1)).mean((-2, -1)) | |
tensor([ 0.0003, 0.0033, 0.0010, 0.0012, -0.0103, -0.0039]) | |
model.encoder.block[0].layer[0].SelfAttention.v.weight.unflatten(-2, (6, -1)).mean((-2, -1)) | |
tensor([-1.5389e-03, 9.3224e-04, 2.3848e-04, -1.6036e-03, -3.6677e-05, | |
1.5001e-03]) | |
model.encoder.block[0].layer[0].SelfAttention.q.weight.unflatten(-2, (6, -1)).std((-2, -1)) | |
tensor([0.1002, 0.0594, 0.0379, 0.0657, 0.0996, 0.0520]) | |
model.encoder.block[0].layer[0].SelfAttention.k.weight.unflatten(-2, (6, -1)).std((-2, -1)) | |
tensor([0.6797, 0.4613, 0.3589, 0.3852, 0.7974, 0.4413]) | |
model.encoder.block[0].layer[0].SelfAttention.v.weight.unflatten(-2, (6, -1)).std((-2, -1)) | |
tensor([0.3999, 0.3589, 0.2850, 0.2751, 0.3750, 0.3195]) | |
model.encoder.block[0].layer[0].SelfAttention.q.weight.unflatten(-2, (6, -1)).std((-2, -1)).mean() | |
tensor(0.0691) | |
model.encoder.block[0].layer[0].SelfAttention.k.weight.unflatten(-2, (6, -1)).std((-2, -1)).mean() | |
tensor(0.5206) | |
model.encoder.block[0].layer[0].SelfAttention.v.weight.unflatten(-2, (6, -1)).std((-2, -1)).mean() | |
tensor(0.3356) | |
# encoder self-attn QKVO | |
torch.stack([layer.SelfAttention.q.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.0730, 0.0559, 0.0474, 0.0421, 0.0342, 0.0352, 0.0321, 0.0250]) | |
torch.stack([layer.SelfAttention.q.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.0431) | |
torch.stack([layer.SelfAttention.k.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.5451, 0.4529, 0.3882, 0.3447, 0.2807, 0.2983, 0.2610, 0.1999]) | |
torch.stack([layer.SelfAttention.k.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.3463) | |
torch.stack([layer.SelfAttention.v.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.3387, 0.4017, 0.5156, 0.5692, 0.6797, 0.6786, 0.7803, 0.8473]) | |
torch.stack([layer.SelfAttention.v.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.6014) | |
torch.stack([layer.SelfAttention.o.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.5092, 0.5412, 0.5862, 0.6100, 0.7142, 0.7527, 0.8422, 0.9372]) | |
torch.stack([layer.SelfAttention.o.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.6866) | |
# decoder self-attn QKVO | |
torch.stack([layer.SelfAttention.q.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.0511, 0.0619, 0.0351, 0.0373, 0.0418, 0.0351, 0.0322, 0.0299]) | |
torch.stack([layer.SelfAttention.q.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.0405) | |
torch.stack([layer.SelfAttention.k.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.3900, 0.3526, 0.3069, 0.2829, 0.3208, 0.2671, 0.2520, 0.2967]) | |
torch.stack([layer.SelfAttention.k.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.3086) | |
torch.stack([layer.SelfAttention.v.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.3136, 0.3880, 0.4715, 0.8108, 0.8222, 1.0587, 1.0818, 1.3638]) | |
torch.stack([layer.SelfAttention.v.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.7888) | |
torch.stack([layer.SelfAttention.o.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']) | |
tensor([0.3299, 0.4926, 0.6292, 0.8534, 1.1908, 1.5305, 1.6492, 1.1218]) | |
torch.stack([layer.SelfAttention.o.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerSelfAttention']).mean() | |
tensor(0.9747) | |
# decoder cross-attn QKVO | |
torch.stack([layer.EncDecAttention.q.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']) | |
tensor([0.0557, 0.0597, 0.0613, 0.0586, 0.0581, 0.0491, 0.0508, 0.0610]) | |
torch.stack([layer.EncDecAttention.q.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']).mean() | |
tensor(0.0568) | |
torch.stack([layer.EncDecAttention.k.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']) | |
tensor([0.3914, 0.4123, 0.3697, 0.4201, 0.4773, 0.4357, 0.4050, 0.5185]) | |
torch.stack([layer.EncDecAttention.k.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']).mean() | |
tensor(0.4287) | |
torch.stack([layer.EncDecAttention.v.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']) | |
tensor([0.2404, 0.3402, 0.5126, 0.5974, 0.7563, 0.9074, 1.2404, 1.8536]) | |
torch.stack([layer.EncDecAttention.v.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']).mean() | |
tensor(0.8060) | |
torch.stack([layer.EncDecAttention.o.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']) | |
tensor([0.3555, 0.4144, 0.5828, 0.5629, 0.7363, 1.0384, 1.4935, 2.1371]) | |
torch.stack([layer.EncDecAttention.o.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerCrossAttention']).mean() | |
tensor(0.9151) | |
# encoder FFN | |
torch.stack([layer.DenseReluDense.wi_0.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.4937, 0.4639, 0.3912, 0.3585, 0.3292, 0.3103, 0.2805, 0.2635]) | |
torch.stack([layer.DenseReluDense.wi_0.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.3614) | |
torch.stack([layer.DenseReluDense.wi_1.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.5527, 0.5850, 0.7052, 0.8311, 0.9126, 1.0003, 1.0600, 1.1883]) | |
torch.stack([layer.DenseReluDense.wi_1.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.8544) | |
torch.stack([layer.DenseReluDense.wo.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.3889, 0.4227, 0.4912, 0.5334, 0.5577, 0.6020, 0.6247, 0.6578]) | |
torch.stack([layer.DenseReluDense.wo.weight.std() for block in model.encoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.5348) | |
# decoder FFN | |
torch.stack([layer.DenseReluDense.wi_0.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.4987, 0.4583, 0.3574, 0.3235, 0.2580, 0.2114, 0.2111, 0.2236]) | |
torch.stack([layer.DenseReluDense.wi_0.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.3178) | |
torch.stack([layer.DenseReluDense.wi_1.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.4576, 0.6423, 0.8584, 0.9272, 1.0576, 1.1572, 1.4150, 2.8697]) | |
torch.stack([layer.DenseReluDense.wi_1.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(1.1731) | |
torch.stack([layer.DenseReluDense.wo.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']) | |
tensor([0.3635, 0.5009, 0.6503, 0.7095, 0.7366, 0.7630, 0.8231, 1.3555]) | |
torch.stack([layer.DenseReluDense.wo.weight.std() for block in model.decoder.block for layer in block.layer if type(layer).__name__ == 'T5LayerFF']).mean() | |
tensor(0.7378) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment