refactor kinda done, yeet main

09e4a19b · novelailab · 840dd7f4 · 09e4a19b · 09e4a19b · 09e4a19b
Commit 09e4a19b authored Apr 06, 2022 by novelailab
Showing with 34 additions and 465 deletions

comparehf.py comparehf.py +5 -2

lm_arch/gptj.py lm_arch/gptj.py +6 -6

lm_arch/lm_class.py lm_arch/lm_class.py +20 -4

main.py main.py +0 -442

pyfra_util.py pyfra_util.py +3 -11

No files found.
--- a/comparehf.py
+++ b/comparehf.py
-from main import *
+from lm_arch import lm_class
+from lm_arch.utils import * 
 import time

+import torch
 from time import perf_counter, perf_counter_ns
 import numpy as np
 from tqdm import tqdm
@@ -65,7 +67,8 @@ def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True


 with torch.no_grad():
-    based_model = load_gpt_j().cuda().half().eval()
+    based_model = lm_class.load_gpt_j().cuda().half().eval()
+    based_model = based_model.lm
    print("Loaded based model")
    hf_model = no_init(lambda: AutoModelForCausalLM.from_pretrained('/home/xuser/models/j6b_ckpt_14001')).cuda().half().eval()
    print("Loaded hf model")

--- a/lm_arch/gptj.py
+++ b/lm_arch/gptj.py
@@ -79,7 +79,7 @@ def _attn(query, key, value, causal_mask, masked_bias,
 class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
    def __init__(self, hidden_dim, n_head, device, dtype):
-        super().__init__(self)
+        nn.Module.__init__(self)
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
            1, 1, max_positions, max_positions).bool()
@@ -143,7 +143,7 @@ class SelfAttention(nn.Module):

 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, activation, device, dtype):
-        super().__init__(self)
+        nn.Module.__init__(self)
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
        self.activation = activation
@@ -159,7 +159,7 @@ class FeedForward(nn.Module):

 class GPTJLayer(nn.Module):
    def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
-        super().__init__(self)
+        nn.Module.__init__(self)
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
@@ -204,8 +204,8 @@ class GPTJLayer(nn.Module):
        return x

 class GPTJModel(nn.Module):
-    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation, Layer, device, dtype):
-        super().__init__(self)
+    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation=gelu_new, Layer=GPTJLayer, device="cuda", dtype=torch.float16):
+        nn.Module.__init__(self)
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)

--- a/lm_arch/lm_class.py
+++ b/lm_arch/lm_class.py
@@ -2,11 +2,13 @@ from lm_arch import utils
 import math
 import torch
 from torch import nn
+from lm_arch import gptj
 import os

 #Having common BaseLM functionality in this class instead of the torch LM itself makes sense.
 class BaseLM(nn.Module):
    def __init__(self, config=None, lm=None):
+        nn.Module.__init__(self)
        self.config = config
        self.lm = lm

@@ -44,13 +46,13 @@ class BaseLM(nn.Module):
        return model

    @classmethod
-    def load(cls, config, path=None, state_dict=None, strict=False):
+    def load(cls, model_class, config, path=None, state_dict=None, strict=False):
        # I am kinda sad that we will not have a load function in lm object itself.
        # might be better to add load functions to that as well but not sure.
        if path:
            state_dict = utils.SplitCheckpoint(path, device="cuda")

-        lm = config.model_class(**config)
+        lm = model_class(**config)
        model = cls(config, lm)
        model.lm.load_state_dict(state_dict, strict=strict)
        return model
@@ -67,3 +69,17 @@ class BaseLM(nn.Module):
            checkpoint[x[0]] = f"{path}/b{i}.pt"
            torch.save(x[1], f"{path}/b{i}.pt")
        torch.save(checkpoint, f"{path}/m.pt")
+
+
+def load_gpt_j(path="models/6b", state_dict=None):
+    config = {
+        "n_layer": 28,
+        "n_head": 16,
+        "hidden_dim": 4096,
+        "vocab_dim": 50400,
+        "eps": 1e-5,
+        "activation": gptj.gelu_new,
+        "Layer": gptj.GPTJLayer
+    }
+    model = BaseLM.load(gptj.GPTJModel, config, path, state_dict)
+    return model
--- a/main.py
+++ b/main.py
--- a/pyfra_util.py
+++ b/pyfra_util.py
-# run bash: -b
-# run command: default
-# kill: -k name
-# start pod: -s name 
-# gpu: -g --gpu
-# amount: -n
-# cpu cores: -c
-# amount of ram: -r
-# image: -i
-
 from novelutils.novelfra import *
 from pyfra import *
 import argparse
 import sys
+
+
 parser = argparse.ArgumentParser(description='Novelfra utility tool for launching pods and deployments on kubernetes with pyfra.')
 parser.add_argument('name', nargs="?", type=str, help='Deployment name')
-# Make the default the last one we used.
+#TODO: Make the default the last one we used.
 parser.add_argument('--service', action="store_true", help="""Create a service with the deployment. If a service is not 
 created you won't be able to access the pod outside from the kube network.""")
 parser.add_argument('-b', '--bash', action="store_true", help='Run bash instead of python3.')