Commit 7eebf8ad authored by novelailab

add more stuff

parent 4a13de3a
from main import *
state_dict = SplitCheckpoint("/home/xuser/models/j6b_ckpt_14001", device="cpu")
# ORIGINAL
'''
transformer.ln_f.weight
transformer.ln_f.bias
lm_head.weight
lm_head.bias
transformer.h.9.ln_1.weight
transformer.h.9.ln_1.bias
transformer.h.9.mlp.c_proj.weight
transformer.h.9.mlp.c_proj.bias
transformer.h.9.mlp.c_fc.weight
transformer.h.9.mlp.c_fc.bias
transformer.h.9.attn.attention.out_proj.weight
transformer.h.9.attn.attention.k_proj.weight
transformer.h.9.attn.attention.v_proj.weight
transformer.h.9.attn.attention.q_proj.weight
transformer.wte.weight
'''
new_state_dict = {}
module_map = {
    "ln_1": "ln_preattn",
    "mlp.c_proj": "ff.ff2",
    "mlp.c_fc": "ff.ff1",
    "attn.attention.out_proj": "attn.out_proj",
    "attn.attention.k_proj": "attn.k_proj",
    "attn.attention.v_proj": "attn.v_proj",
    "attn.attention.q_proj": "attn.q_proj",
    "wte": "vocab_embed",
    "ln_f": "ln_final",
    "lm_head": "lm_head",
}
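# Illustrative renames produced by the loop below (derived directly from module_map):
#   transformer.wte.weight                        -> vocab_embed.weight
#   transformer.ln_f.bias                         -> ln_final.bias
#   transformer.h.9.mlp.c_fc.weight               -> layers.9.ff.ff1.weight
#   transformer.h.9.attn.attention.q_proj.weight  -> layers.9.attn.q_proj.weight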
print(type(state_dict))
for key in state_dict.keys():
    dotlist = key.split('.')
    if len(dotlist) > 3:
        layer = dotlist[2]
        for x in module_map:
            if x in key:
                new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
                print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
    else:
        for x in module_map:
            if x in key:
                new_state_dict[f"{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
                print(f"{key} -> {module_map[x]}.{dotlist[-1]}")
#print(new_state_dict)
def save(state_dict, path):
    # Write each tensor to its own file plus an index (m.pt) mapping parameter names
    # to the per-tensor files, mirroring the layout GPTModel.save produces.
    try:
        os.mkdir(path)
    except FileExistsError:
        pass
    checkpoint = {}
    for i, x in enumerate(state_dict.items()):
        checkpoint[x[0]] = f"{path}/b{i}.pt"
        torch.save(x[1], f"{path}/b{i}.pt")
    torch.save(checkpoint, f"{path}/m.pt")
save(new_state_dict, "models/6b")
\ No newline at end of file
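# Quick sanity check one could run after the conversion (a sketch; assumes SplitCheckpoint
# reads the m.pt / b{i}.pt layout that save() writes, the same layout GPTModel.load consumes):
#
#   reloaded = SplitCheckpoint("models/6b", device="cpu")
#   assert torch.equal(reloaded["vocab_embed.weight"], new_state_dict["vocab_embed.weight"])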
@@ -8,6 +8,14 @@ except ImportError:
from collections import MutableMapping
import os
from pathlib import Path
import math
def defaults():
    # Easily accessible defaults
    D_LAYER = GPTLayer
    D_ATTN = SelfAttention
    D_FF = FeedForward
    D_ACT = gelu_new

def no_init(loading_code):
    def dummy(self):
@@ -67,6 +75,9 @@ class SplitCheckpoint(MutableMapping):
#TODO: Might change with non einsum functions?
def gelu_new(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
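# Note: the function above is the tanh approximation of GELU; it is the same formula that
# HF's GPT-2/GPT-J code exposes as gelu_new / NewGELUActivation.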
def fixed_pos_embedding(dim=None, seq_len=None, x=None):
    if x is None:
        x = torch.empty(0)
@@ -84,19 +95,7 @@ def apply_rotary_pos_emb(x, sincos, offset=0):
    sin, cos = map(lambda t: repeat(t[offset:x.shape[1]+offset, :], "n d -> () n () (d j)", j=2), sincos)
    return (x * cos) + (rotate_every_two(x) * sin)
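# For reference, rotate_every_two() used above is defined outside this hunk; a sketch of the
# usual GPT-J-style helper it is assumed to match (pairs adjacent channels and rotates each
# pair by 90°):
#
#   def rotate_every_two(x):
#       x1 = x[:, :, :, ::2]
#       x2 = x[:, :, :, 1::2]
#       x = torch.stack((-x2, x1), dim=-1)
#       return rearrange(x, "... d j -> ... (d j)")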
class FeedForward(nn.Module):
    def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU):
        self.ff1 = nn.Linear(dim, hidden_dim)
        self.ff2 = nn.Linear(hidden_dim, dim)
        self.activation = activation()

    def forward(self, x):
        x = self.ff1(x)
        x = self.activation(x)
        x = self.ff2(x)
        return x
def _split_heads(self, tensor, num_heads, attn_head_size, rotary):
def _split_heads(tensor, num_heads, attn_head_size, rotary):
    """
    Splits hidden_size dim into attn_head_size and num_heads
    """
@@ -111,7 +110,7 @@ def _split_heads(self, tensor, num_heads, attn_head_size, rotary):
    else:
        raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
def _merge_heads(self, tensor, num_heads, attn_head_size):
def _merge_heads(tensor, num_heads, attn_head_size):
    """
    Merges attn_head_size dim and num_attn_heads dim into hidden_size
    """
@@ -143,7 +142,8 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
    def __init__(self, hidden_dim, n_head):
    def __init__(self, hidden_dim, n_head, device="cuda", dtype=torch.float16):
        super(SelfAttention, self).__init__()
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
            1, 1, max_positions, max_positions).bool()
@@ -154,10 +154,10 @@ class SelfAttention(nn.Module):
        self.register_buffer("bias", bias)
        self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False))
        attn_bias = False
        self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.rotary_dim = self.head_dim
        # TODO: handle rotary
        sin, cos = fixed_pos_embedding(dim=self.rotary_dim, seq_len=max_positions)
@@ -175,8 +175,8 @@ class SelfAttention(nn.Module):
        offset = 0
        key = self.apply_rotary_pos_emb(key, (self.sin, self.cos), offset=offset).to(key.dtype)
        query = self.apply_rotary_pos_emb(query, (self.sin, self.cos), offset=offset).to(query.dtype)
        key = apply_rotary_pos_emb(key, (self.sin, self.cos), offset=offset).to(key.dtype)
        query = apply_rotary_pos_emb(query, (self.sin, self.cos), offset=offset).to(query.dtype)
        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)
@@ -185,21 +185,34 @@ class SelfAttention(nn.Module):
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
        x = _attn(
            query, key, value, causal_mask, self.masked_bias, self.scale_attn
            query, key, value, causal_mask, self.masked_bias, None, self.scale_attn
        )
        x = _merge_heads(x, self.num_heads, self.head_dim)
        x = _merge_heads(x, self.n_head, self.head_dim)
        x = self.out_proj(x)
        return x  # a, present, (attentions)
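# Illustrative shape walk-through (assumes head_dim = hidden_dim // n_head, which is how the
# hidden part of __init__ is normally set up): an input of (batch, seq, hidden_dim) is split
# into per-head tensors for attention, rotary position embeddings are applied to q/k
# (rotary_dim == head_dim here), and _merge_heads + out_proj map the result back to
# (batch, seq, hidden_dim).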
class FeedForward(nn.Module):
    def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU, device="cuda", dtype=torch.float16):
        super(FeedForward, self).__init__()
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
        self.activation = activation

    def forward(self, x):
        x = self.ff1(x)
        x = self.activation(x)
        x = self.ff2(x)
        return x
class GPTLayer(nn.Module):
    def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU):
    def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU, device="cuda", dtype=torch.float16):
        super(GPTLayer, self).__init__()
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps)
        #self.ln_postattn = nn.LayerNorm(hidden_dim, eps=eps)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation)
        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head)
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)

    def forward(self, x, hypernetwork):
        residual = x
@@ -209,20 +222,21 @@ class GPTLayer(nn.Module):
        attn_out = self.attn(x)
        ff_out = self.ff(x)
        x = residual + attn_out + ff_out + (hyper_out if hyper_out is not None else 0)
        x = residual + ff_out + attn_out + (hyper_out if hypernetwork is not None else 0)
        return x
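    # The residual update above is the GPT-J-style "parallel" block: attention and the
    # feed-forward read the same normalized input and are added to the residual in a single
    # step, rather than being applied sequentially.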
# Every module can be accessed and changed from here, since the Layer class and the ff/attn classes are all passed in from GPTModel.
class GPTModel(nn.Module):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTLayer):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTLayer, device="cuda", dtype=torch.float16):
        super(GPTModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps)
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
        self.layers = nn.ModuleList([])
        for _ in range(n_layer):
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation))
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))

    #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
    #TODO: Do we want to pass a config object everywhere?
    #TODO: Do we want to pass a config object everywhere? I don't exactly like that but passing a lot of variables is a bit ugly too.
    def forward(self, x, hypernetwork=None):
        x = self.vocab_embed(x)
@@ -232,9 +246,19 @@ class GPTModel(nn.Module):
        x = self.ln_final(x)
        return x
    def load(self, path):
        state_dict = SplitCheckpoint(path, device="cuda")
        self.load_state_dict(state_dict)

    @classmethod
    def load(cls, config, path=None, state_dict=None):
        if path:
            state_dict = SplitCheckpoint(path, device="cuda")
        model = no_init(lambda: cls(**config))
        model.load_state_dict(state_dict, strict=False)
        return model

    @classmethod
    def init(cls, config):
        model = no_init(lambda: cls(**config))
        return model
    def save(self, path):
        try: os.mkdir(path)
@@ -243,6 +267,7 @@ class GPTModel(nn.Module):
        for i, x in enumerate(self.state_dict().items()):
            checkpoint[x[0]] = f"{path}/b{i}.pt"
            torch.save(x[1], f"{path}/b{i}.pt")
        torch.save(checkpoint, f"{path}/m.pt")
# TODO: Do we want to have the LM head as a separate class? Or just a function? I think we might be better off with a function here, and maybe
# also for the self-attention we can just write a function that gets fed the q, k, v.
@@ -253,17 +278,57 @@ class GPTLM(nn.Module):
    def forward(self, x):
        return
def load_gpt_j(path):
def load_gpt_j(state_dict=None, path=None):
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel.load(config, path=path, state_dict=state_dict)
    return model
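# Usage sketch (assumes a checkpoint already converted to this module's key naming, e.g. the
# split checkpoint written to "models/6b" by the conversion script):
#
#   model = load_gpt_j(path="models/6b")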
def init_6b():
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": nn.GELU,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel(**config)
    return model
def init_125m():
    config = {
        "n_layer": 12,
        "n_head": 12,
        "hidden_dim": 768,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel.init(config)
    return model
def init_1_3b():
    config = {
        "n_layer": 24,
        "n_head": 16,
        "hidden_dim": 2048,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = no_init(lambda: GPTModel(**config))
    model.load(path)
    return model
\ No newline at end of file
    model = GPTModel(**config)
    return model
from main import *
import time
state_dict = SplitCheckpoint("/home/xuser/models/j6b_ckpt_14001", device="cuda")
for x in state_dict:
    print(x)
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
# Replicates IPython's %timeit magic.
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
    precision = 'ns'
    r_arr = np.empty([2, r])  # [0] = mean, [1] = std
    if function:
        func.__name__ = function.__name__
    for i in tqdm(range(r)) if do_tqdm else range(r):
        n_arr = np.empty(n)
        for k in range(n):
            start = perf_counter_ns()
            func()
            n_arr[k] = perf_counter_ns() - start
        if not first:
            # drop the first (warm-up) measurement
            n_arr = np.delete(n_arr, 0)
        r_arr[0, i] = np.mean(n_arr)
        r_arr[1, i] = np.std(n_arr)
    best = r_arr[:, np.argmin(r_arr[0])]  # [0] = mean, [1] = std
    # pick a display unit based on the best mean time (measured in ns)
    if best[0] < 1e3:
        precision = 'ns'
    elif best[0] >= 1e9:
        best[0] = best[0] * 1e-9
        best[1] = best[1] * 1e-9
        precision = 's'
    elif best[0] >= 1e6:
        best[0] = best[0] * 1e-6
        best[1] = best[1] * 1e-6
        precision = 'ms'
    elif best[0] >= 1e3:
        precision = 'μs'
        best[0] = best[0] * 1e-3
        best[1] = best[1] * 1e-3
    if not quiet:
        if precision == 'ns':
            print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 'μs':
            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 'ms':
            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 's':
            print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
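# Example usage (illustrative):
#   timeit(lambda: np.linalg.inv(np.random.rand(256, 256)), r=3, n=10, first=False)
# For CUDA work, synchronize inside the timed callable (e.g. call torch.cuda.synchronize()
# at the end of the lambda); otherwise mostly kernel-launch overhead is measured.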
with torch.no_grad():
    model = init_125m().cuda().half()
    '''
    timeit(lambda: model(torch.zeros((1, 2048)).long().cuda()), n=20, first=False)
    module = torch.jit.trace(model, torch.zeros((1, 2048)).long().cuda())
    torch.jit.optimize_for_inference(module)
    timeit(lambda: module(torch.zeros((1, 2048)).long().cuda()), n=20, first=False)

    timeit(lambda: model(torch.zeros((1, 1000)).long().cuda()), n=20, first=False)
    module = torch.jit.trace(model, torch.zeros((1, 1000)).long().cuda())
    torch.jit.optimize_for_inference(module)
    timeit(lambda: module(torch.zeros((1, 1000)).long().cuda()), n=20, first=False)
    '''

    module = torch.jit.trace(model, torch.zeros((1, 2048)).long().cuda())
    # optimize_for_inference returns a new module; keep the returned object.
    module = torch.jit.optimize_for_inference(module)

    static_input = torch.zeros((1, 2048), device='cuda').long()
    static_out = torch.randn((1, 2048, 2048), device='cuda').half()
    timeit(lambda: module(static_input), n=20, first=False)

    # warm up on a side stream before graph capture
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for i in range(3):
            output = module(torch.randint(0, 50000, (1, 2048), device='cuda').long())
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g, stream=s):
        static_out = module(static_input)
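    # Graph replay re-runs the captured kernels on the same memory, so each new input is
    # copied into static_input and results are read back from static_out.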
    real_inputs = [torch.randint(0, 50000, (1, 2048), device='cuda').long() for _ in range(100)]
    t = time.perf_counter()
    for data in real_inputs:
        #print(data[0, :20])
        static_input.copy_(data)
        #timeit(lambda: g.replay(), n=100, first=True)
        g.replay()
        #print(static_out[0, 0, :20])
    torch.cuda.synchronize()
    print(f"{perf_counter() - t}s")

    #for data in real_inputs:
    #    print(model(data)[0, 0, :20])
@@ -7,14 +7,14 @@ dry = False
config_obj = KubeConfig()
config_obj.set_name(name)
config_obj.set_gpu(gpu_name=GPU.A40, amount=1)
config_obj.set_gpu(gpu_name=GPU.RTX_A5000, amount=1)
config_obj.set_ram(16)
config_obj.set_cpu(4)
#config_obj.set_cpu_only()
config_obj.dry_run(dry)
config_obj.print_information()
config_obj.create_deployment(overwrite=True)
config_obj.create_service(overwrite=True)
#config_obj.create_deployment(overwrite=True)
#config_obj.create_service(overwrite=True)
remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
@@ -25,5 +25,6 @@ models = {'6b': '/home/xuser/models/j6b_ckpt_14001', '20b': '/home/xuser/diffusi
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
with always_rerun():
    path.sh(f'python3 test.py')
\ No newline at end of file