1414}
1515
1616
# from transformers
class Conv1D(nn.Module):
    """
    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).

    Basically works like a linear layer but the weights are transposed: the
    parameter is stored as (nx, nf) instead of Linear's (nf, nx).

    Args:
        nf (`int`): The number of output features.
        nx (`int`): The number of input features.
    """

    def __init__(self, nf, nx):
        super().__init__()
        self.nf = nf
        # Weight is (in_features, out_features); initialized like GPT-2.
        self.weight = nn.Parameter(torch.empty(nx, nf))
        self.bias = nn.Parameter(torch.zeros(nf))
        nn.init.normal_(self.weight, std=0.02)

    def forward(self, x):
        # Keep all leading dims, replace the last one with nf.
        out_shape = x.shape[:-1] + (self.nf,)
        flat = x.view(-1, x.size(-1))
        # addmm: bias + flat @ weight, done in one fused call.
        projected = torch.addmm(self.bias, flat, self.weight)
        return projected.view(out_shape)
41-
42-
4317class HeadFFN (nn .Module ): # todo rename
4418 def __init__ (self , dim ):
4519 super ().__init__ ()
46- self .c_fc = Conv1D ( dim , config ['n_embd' ])
47- self .c_proj = Conv1D ( config ['n_embd' ], dim )
20+ self .c_fc = nn . Linear ( config ['n_embd' ], dim )
21+ self .c_proj = nn . Linear ( dim , config ['n_embd' ])
4822 self .act = nn .functional .gelu
4923
5024 def forward (self , hidden_states ):
@@ -62,8 +36,8 @@ def __init__(self):
6236 self .head_dim = self .embed_dim // self .num_heads
6337 self .split_size = self .embed_dim
6438
65- self .c_att = Conv1D (config ['n_embd' ] * 3 , config ['n_embd' ])
66- self .c_proj = Conv1D (config ['n_embd' ], config ['n_embd' ])
39+ self .c_att = nn . Linear (config ['n_embd' ], config ['n_embd' ] * 3 )
40+ self .c_proj = nn . Linear (config ['n_embd' ], config ['n_embd' ])
6741
6842 def _split_heads (self , tensor , num_heads , attn_head_size ):
6943 """
0 commit comments