Commit 8382affa authored by novelailab

add lm head

parent d57cfcec
@@ -60,11 +60,11 @@ def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True
 with torch.no_grad():
     model = load_gpt_j().cuda().half()
-    x = torch.zeros(1, 2048).cuda().long()
+    x = torch.zeros(1, 1024).cuda().long()
     print(model(x).shape)
     print("PyTorch Eager")
     timeit(r=1, n=100, func=lambda: model(x), do_tqdm=False, first=False)
-    module = torch.jit.trace(model, torch.zeros((1, 2048)).long().cuda())
+    module = torch.jit.trace(model, torch.zeros((1, 1024)).long().cuda())
     torch.jit.optimize_for_inference(module)
     print("PyTorch JIT")
     timeit(r=1, n=100, func=lambda: module(x), do_tqdm=False, first=False)
\ No newline at end of file
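The hunk above benchmarks eager PyTorch against a traced-and-optimized module using the repo's timeit() helper (its signature is visible in the hunk header). As a rough illustration only, a minimal CUDA-aware timer playing a similar role could be sketched as below; the name bench and its arguments are hypothetical, not part of this codebase.

import time
import torch

def bench(func, n=100, warmup=True):
    # Hypothetical stand-in for this repo's timeit() helper: warm up once
    # (CUDA context, JIT compilation), then time n synchronized calls.
    if warmup:
        func()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(n):
        func()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    print(f"{elapsed / n * 1000:.3f} ms per call over {n} calls")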
@@ -75,6 +75,9 @@ class SplitCheckpoint(MutableMapping):
 #TODO: Might change with non einsum functions?
 def get_logits(x, embedding):
     return embedding(x)
+
+def gelu_new(x):
+    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
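gelu_new() is the tanh approximation of GELU used by GPT-2/GPT-J. On PyTorch 1.12 and later the built-in functional GELU exposes the same approximation, so a quick sanity check (a sketch, assuming that PyTorch version) looks like:

import math
import torch
import torch.nn.functional as F

def gelu_new(x):
    # Tanh-approximated GELU, as added in this commit.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

x = torch.randn(4, 8)
# Should print True: F.gelu(..., approximate="tanh") implements the same formula.
print(torch.allclose(gelu_new(x), F.gelu(x, approximate="tanh"), atol=1e-6))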
@@ -152,7 +155,7 @@ class SelfAttention(nn.Module):
         self.n_head = n_head
         self.register_buffer("scale_attn", torch.sqrt(torch.tensor(self.head_dim, requires_grad=False).float()))
         self.register_buffer("bias", bias)
-        self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False))
+        self.register_buffer("masked_bias", torch.tensor(-1e10, requires_grad=False)) #-1e10 is what mtj uses.
         attn_bias = False
         self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
         self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
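The masked_bias buffer is the fill value for masked-out positions in causal self-attention; the change here matches the -1e10 used by mesh-transformer-jax. Note that -1e10 is outside the fp16 range, so under half precision it effectively becomes -inf, which still masks correctly under softmax. The attention code itself is outside this hunk; a hedged sketch of how such a buffer is typically applied (names are illustrative):

import torch

def apply_causal_mask(attn_scores, causal_bias, masked_bias):
    # causal_bias: boolean lower-triangular mask, e.g. shape (1, 1, seq, seq)
    # masked_bias: large negative scalar so masked slots get ~0 softmax weight
    return torch.where(causal_bias, attn_scores, masked_bias.to(attn_scores.dtype))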
@@ -221,7 +224,7 @@ class GPTLayer(nn.Module):
         attn_out = self.attn(x)
         ff_out = self.ff(x)
-        x = residual + ff_out + attn_out + (hyper_out if hypernetwork is not None else 0)
+        x = residual + ff_out + attn_out# + (hyper_out if hypernetwork is not None else 0)
         return x
 
 # Can access and change every module from here, as both Layer class and ff and attn classes are passed from GPTModel.
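This layer uses the GPT-J-style parallel residual: attention and feed-forward are computed from the same input and both are added to the residual in one step, with the hypernetwork term now commented out. A minimal sketch of the pattern, with hypothetical module names and assuming a single shared pre-attention LayerNorm:

import torch.nn as nn

class ParallelBlock(nn.Module):
    # Illustrative parallel-residual block; not the GPTLayer class from this file.
    def __init__(self, hidden_dim, attn, ff, eps=1e-5):
        super().__init__()
        self.ln = nn.LayerNorm(hidden_dim, eps=eps)
        self.attn = attn
        self.ff = ff

    def forward(self, x):
        residual = x
        h = self.ln(x)
        # Both branches read the same normalized input; outputs are summed once.
        return residual + self.attn(h) + self.ff(h)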
@@ -232,6 +235,7 @@ class GPTModel(nn.Module):
         self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
         self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
         self.layers = nn.ModuleList([])
+        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True)
         for _ in range(n_layer):
             self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
         #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
@@ -244,6 +248,11 @@ class GPTModel(nn.Module):
         x = self.ln_final(x)
         return x
 
+    def get_logits(self, x):
+        x = self.forward(x)
+        x = self.lm_head(x)
+        return x.float()
+
     @classmethod
     def load(cls, config, path=None, state_dict=None):
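The new get_logits() path runs the transformer, projects hidden states through the lm_head added above (hidden_dim -> vocab_dim), and returns float32 logits. A hypothetical usage sketch, reusing load_gpt_j() from later in this file with a placeholder prompt:

import torch

with torch.no_grad():
    model = load_gpt_j().cuda().half()            # defined further down in this file
    tokens = torch.zeros(1, 16).cuda().long()     # placeholder token ids
    logits = model.get_logits(tokens)             # shape: (batch, seq, vocab_dim), float32
    next_token = logits[:, -1, :].argmax(dim=-1)  # greedy next-token pick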
@@ -271,12 +280,6 @@ class GPTModel(nn.Module):
 # TODO: Do we want to have the LM head as a seperate Class? Or just a function? I think we might be better off with a function here and maybe
 # also for the self attention, we can just write a function that gets fed in the q, k, v.
-class GPTLM(nn.Module):
-    def __init__(self):
-        return
-
-    def forward(self, x):
-        return
 
 def load_gpt_j(path="models/6b", state_dict=None):
     config = {
         "n_layer": 28,