GPT-2 model with LoRA

The module tree below is the printed repr of a GPT-2 model (12 transformer blocks, 768-dim embeddings, i.e. the 124M configuration) in which every nn.Linear layer has been wrapped in a LinearWithLoRA module. The out_head with out_features=2 indicates the model has been set up for fine-tuning on a binary classification task.
GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1)-(11): eleven further TransformerBlock modules, identical in structure to (0)
  )
  (final_norm): LayerNorm()
  (out_head): LinearWithLoRA(
    (linear): Linear(in_features=768, out_features=2, bias=True)
    (lora): LoRALayer()
  )
)
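
For reference, here is a minimal sketch of what the LoRALayer and LinearWithLoRA modules above might look like. This is an assumption based on the module names and shapes in the printout, not code taken from the gist: the rank and the scaling factor alpha are hyperparameters that do not appear in the repr. LoRA keeps the pretrained weight frozen and learns a low-rank update, so the wrapped layer computes linear(x) + alpha * x @ A @ B.

import math
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    # Low-rank adapter: learns A (in_dim x rank) and B (rank x out_dim).
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = nn.Parameter(torch.empty(in_dim, rank))
        nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))   # random init for A
        self.B = nn.Parameter(torch.zeros(rank, out_dim))  # zero init: adapter starts as a no-op
        self.alpha = alpha

    def forward(self, x):
        return self.alpha * (x @ self.A @ self.B)

class LinearWithLoRA(nn.Module):
    # Wraps a pretrained nn.Linear and adds the low-rank update to its output.
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear  # pretrained weight, kept frozen during fine-tuning
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)

Because B starts at zero, the model's behavior is unchanged right after injection; training then updates only the small A and B matrices. At rank 16, each 768x768 attention projection adds 2 x 768 x 16 = 24,576 trainable parameters alongside 589,824 frozen ones.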
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
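The (linear)/(lora) pairs throughout the tree suggest the wrapping was applied recursively to every nn.Linear in the model. Below is a sketch of that injection step, assuming the classes above and a GPTModel instance named model; the rank=16, alpha=16 values are illustrative, not taken from the gist.

def replace_linear_with_lora(module, rank, alpha):
    # Walk the module tree and swap every nn.Linear for a LinearWithLoRA
    # wrapper; this produces exactly the repr shown above.
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            setattr(module, name, LinearWithLoRA(child, rank, alpha))
        else:
            replace_linear_with_lora(child, rank, alpha)

# Freeze the pretrained weights first, then inject the adapters;
# the freshly created LoRA parameters are trainable by default.
for param in model.parameters():
    param.requires_grad = False
replace_linear_with_lora(model, rank=16, alpha=16)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable LoRA parameters: {trainable:,}")

Freezing before injection means only the A and B matrices (including those on the two-unit out_head) receive gradients, which is why LoRA fine-tuning needs a small fraction of the optimizer memory of full fine-tuning.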