Skip to content

Instantly share code, notes, and snippets.

@jramapuram
Created March 1, 2022 19:21
Show Gist options
  • Save jramapuram/d284e0f261d3fdb15c213dd929d272b9 to your computer and use it in GitHub Desktop.
ViT(
(patch_embed): PatchEmbed(
(proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
(norm): Identity()
)
(backbone): xFormer(
(encoders): ModuleList(
(0): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(1): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(2): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(3): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(4): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(5): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(6): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(7): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(8): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(9): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(10): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(11): xFormerEncoderBlock(
(mha): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
(feedforward): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(wrap_att): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MultiHeadDispatch(
(attention): ScaledDotProduct(
(attn_drop): Dropout(p=0.0, inplace=False)
)
(in_proj_container): InProjContainer()
(resid_drop): DropPath()
(proj): Linear(in_features=768, out_features=768, bias=True)
)
)
)
(wrap_ff): Residual(
(layer): PreNorm(
(norm): FusedLayerNorm()
(sublayer): MLP(
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=3072, out_features=768, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
)
)
(decoders): ModuleList()
)
(head): Sequential(
(0): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(1): Linear(in_features=768, out_features=1000, bias=True)
)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment