GPT-2 model with LoRA: every Linear layer is wrapped in a LinearWithLoRA module that pairs the original Linear with a LoRALayer, as the printed model structure below shows.
GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (2): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (3): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (4): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (5): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (6): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (7): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (8): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (9): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (10): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (11): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_features=3072, bias=True)
            (lora): LoRALayer()
          )
          (1): GELU()
          (2): LinearWithLoRA(
            (linear): Linear(in_features=3072, out_features=768, bias=True)
            (lora): LoRALayer()
          )
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
  )
  (final_norm): LayerNorm()
  (out_head): LinearWithLoRA(
    (linear): Linear(in_features=768, out_features=2, bias=True)
    (lora): LoRALayer()
  )
)
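For reference, below is a minimal sketch of what the LinearWithLoRA and LoRALayer modules in this printout typically look like in PyTorch, along with the kind of helper that produces the structure above by recursively replacing every Linear layer. The rank, alpha, initialization, and the replace_linear_with_lora helper name are assumptions for illustration; none of these values are recorded in the printed structure itself (the printout also shows a 2-way output head, which suggests a binary classification fine-tune).

import math
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    # Low-rank update: alpha * (x @ A @ B), where A is (in_dim, rank) and B is (rank, out_dim).
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = nn.Parameter(torch.empty(in_dim, rank))
        nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))  # assumed initialization
        self.B = nn.Parameter(torch.zeros(rank, out_dim))  # zero init so the update starts at zero
        self.alpha = alpha

    def forward(self, x):
        return self.alpha * (x @ self.A @ self.B)

class LinearWithLoRA(nn.Module):
    # Wraps an existing (typically frozen) Linear layer and adds the trainable LoRA update to its output.
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)

def replace_linear_with_lora(model, rank, alpha):
    # Recursively swap every nn.Linear in the model for LinearWithLoRA (hypothetical helper).
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            replace_linear_with_lora(module, rank, alpha)

With these pieces, freezing the base weights, wrapping the model, and printing it would yield a structure like the one above (rank=16 and alpha=16 are placeholder values):

for param in model.parameters():
    param.requires_grad = False  # only the LoRA A/B matrices remain trainable
replace_linear_with_lora(model, rank=16, alpha=16)
print(model)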