cleanup

840dd7f4 · novelailab · 1c9d3a31 · 1c9d3a31 · 1c9d3a31 · 840dd7f4
Commit 840dd7f4 authored Apr 06, 2022 by novelailab
6 changed files
--- a/lm_arch/gpt2.py
+++ b/lm_arch/gpt2.py
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.utils.checkpoint import checkpoint as ck
-from einops import rearrange, repeat
-try:
-    from collections.abc import MutableMapping
-except ImportError:
-    from collections import MutableMapping
-import os
-from pathlib import Path
-import math
-import lm_arch.gpt_arch as gpt_arch
-from lm_arch.utils import *
-
-#TODO: Might change with non einsum functions?
-
-def gelu_new(x):
-    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
-
-def _split_heads(tensor, num_heads, attn_head_size, rotary):
-    """
-    Splits hidden_size dim into attn_head_size and num_heads
-    """
-    new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
-    tensor = tensor.view(*new_shape)
-    if rotary:
-        return tensor
-    if len(tensor.shape) == 5:
-        return tensor.permute(0, 1, 3, 2, 4)  # (batch, blocks, head, block_length, head_features)
-    elif len(tensor.shape) == 4:
-        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-    else:
-        raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
-
-def _merge_heads(tensor, num_heads, attn_head_size):
-    """
-    Merges attn_head_size dim and num_attn_heads dim into hidden_size
-    """
-    if len(tensor.shape) == 5:
-        tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
-    elif len(tensor.shape) == 4:
-        tensor = tensor.permute(0, 2, 1, 3).contiguous()
-    else:
-        raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
-    new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
-    return tensor.view(new_shape)
-
-def _attn(query, key, value, causal_mask, masked_bias,
-            attention_mask=None, scale_attn=None):
-
-    attn_weights = torch.matmul(query, key.transpose(-1, -2))
-    attn_weights = torch.where(causal_mask, attn_weights, masked_bias.to(attn_weights.dtype))
-    attn_weights = attn_weights / scale_attn
-
-    if attention_mask is not None:
-        attn_weights = attn_weights + attention_mask
-
-    attn_weights = F.softmax(attn_weights, dim=-1)
-    attn_weights = attn_weights.to(value.dtype)
-
-    attn_output = torch.matmul(attn_weights, value).to(value.dtype)
-
-    return attn_output
-
-class SelfAttention(nn.Module):
-    # Code copied from HF, might want to sanity check later.
-    def __init__(self, hidden_dim, n_head, device="cuda", dtype=torch.float16):
-        super(SelfAttention, self).__init__()
-        max_positions = 2049
-        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
-            1, 1, max_positions, max_positions).bool()
-        self.head_dim = hidden_dim // n_head
-        self.hidden_dim = hidden_dim
-        self.n_head = n_head
-        self.register_buffer("scale_attn", torch.sqrt(torch.tensor(self.head_dim, requires_grad=False).float()))
-        self.register_buffer("bias", bias)
-        self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False)) #-1e10 is what mtj uses.
-        attn_bias = True
-        self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
-        self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
-        self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
-        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
-
-    def forward(self, x):
-        query = self.q_proj(x)
-        key = self.k_proj(x)
-        value = self.v_proj(x)
-
-        query = _split_heads(query, self.n_head, self.head_dim, True)
-        key = _split_heads(key, self.n_head, self.head_dim, True)
-        value = _split_heads(value, self.n_head, self.head_dim, False)
-            
-        key = key.permute(0, 2, 1, 3)
-        query = query.permute(0, 2, 1, 3)
-
-        query_length, key_length = query.size(-2), key.size(-2)
-        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
-
-        x = _attn(
-            query, key, value, causal_mask, self.masked_bias, None, self.scale_attn
-        )
-
-        x = _merge_heads(x, self.n_head, self.head_dim)
-        x = self.out_proj(x)
-
-        return x
-
-class FeedForward(nn.Module):
-    def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU(), device="cuda", dtype=torch.float16):
-        super(FeedForward, self).__init__()
-        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
-        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
-        self.activation = activation
-
-    def forward(self, x, act_ck=False):
-        x = self.ff1(x)
-        if act_ck:
-            ck(self.activation, x)
-        else:
-            x = self.activation(x)
-        x = self.ff2(x)
-        return x
-
-class GPT2Layer(nn.Module):
-    def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU(), device="cuda", dtype=torch.float16):
-        super(GPT2Layer, self).__init__()
-        self.hidden_dim = hidden_dim
-        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
-        self.ln_postattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
-        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
-        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)
-
-    def forward(self, x, hypernetwork=None, act_ck=False):
-        parallel_residual = True
-        residual = x
-        x = self.ln_preattn(x)
-        attn_out = self.attn(x)
-        if parallel_residual:
-            ff_out = self.ff(x, act_ck)
-            x = residual + attn_out + ff_out
-        else:
-            x = residual + attn_out
-            residual = x
-            x = self.ln_postattn(x)
-            ff_out = self.ff(x, act_ck)
-            x = residual + ff_out
-
-        if hypernetwork:
-            hyper_out = hypernetwork(x)
-            x = x + hyper_out
-            
-        return x
-
-class GPT2Model(nn.Module):
-    def __init__(self, hidden_dim=512, n_layer=12, ctx_size=1024, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPT2Layer, SelfAttention=SelfAttention, FeedForward=FeedForward, device="cuda", dtype=torch.float16):
-        super(GPT2Model, self).__init__()
-        self.n_layer = n_layer
-        self.hidden_dim = hidden_dim
-        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
-        self.pos_embed = nn.Embedding(ctx_size, self.hidden_dim)
-        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
-        self.layers = nn.ModuleList([])
-        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True)
-        for _ in range(n_layer):
-            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
-            #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
-            #TODO: Do we want to pass a config object everywhere? I don't exactly like that but passing a lot of variables is a bit ugly too.
-
-    def get_embeds(self, x, hypernetwork=None, act_ck=False):
-        #print(x.shape)
-        position_ids = torch.arange(x.shape[-1], dtype=torch.long).cuda()
-        position_ids = position_ids.unsqueeze(0) 
-        x = self.vocab_embed(x) + self.pos_embed(position_ids)
-        for layer in self.layers:
-            x = layer(x, hypernetwork, act_ck)
-        x = self.ln_final(x)
-        return x
-
-    def forward(self, x, hypernetwork=None, act_ck=False):
-        x = self.get_embeds(x, hypernetwork=hypernetwork, act_ck=act_ck)
-        x = self.lm_head(x)
-        return x.float()
-    
-    @classmethod
-    def load(cls, config, path=None, state_dict=None):
-        if path:
-            state_dict = SplitCheckpoint(path, device="cuda")
-
-        model = no_init(lambda: cls(**config))
-        model.load_state_dict(state_dict, strict=False)
-        return model
-
-    @classmethod
-    def init(cls, config):
-        model = cls(**config)
-        return model
-
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-        for name, p in module.named_parameters():
-            if ("ff2" in name or "out_proj" in name) and "weight" in name:
-                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-                p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.n_layer)))
-
-    @classmethod
-    def neox_init(cls, config):
-        model = cls(**config)
-        modules = [*model.layers[:-1], model.vocab_embed, model.ln_final, model.lm_head]
-        init = small_init_method(config["hidden_dim"])
-        for module in modules:
-            for param in module.parameters():
-                init(param)
-                
-        last_layer = model.layers[-1]
-        last_layer_init = wang_init_method(config["n_layer"], config["hidden_dim"])
-        for param in last_layer.parameters():
-            last_layer_init(param)
-
-        return model
-
-    @classmethod
-    def simple_init(cls, config):
-        model = cls(**config)
-        state = model.state_dict()
-        for k in state:
-            state[k] = state[k] / math.sqrt(2 * config["n_layer"])
-        model.load_state_dict(state)
-
-        return model
-
-    @classmethod
-    def gpt2_init(cls, config):
-        model = cls(**config)
-        for module in model.modules():
-            model._init_weights(module)
-        return model
-
-    def save(self, path):
-        try: os.mkdir(path)
-        except: pass
-        checkpoint = {}
-        for i, x in enumerate(self.state_dict().items()):
-            checkpoint[x[0]] = f"{path}/b{i}.pt"
-            torch.save(x[1], f"{path}/b{i}.pt")
-        torch.save(checkpoint, f"{path}/m.pt")
-
-def wang_init_method(n_layers, dim):
-    std = 2 / n_layers / math.sqrt(dim)
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
-
-# Stolen from NeoX. For the 20B run wang_init used on the output layer and small_init on rest of the layers.
-def small_init_method(dim):
-    """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving 
-    the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution."""
-    std = math.sqrt(2 / (5 * dim))
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
-        
-
-def load_gpt_j(path="models/6b", state_dict=None):
-    config = {
-        "n_layer": 28,
-        "n_head": 16,
-        "hidden_dim": 4096,
-        "vocab_dim": 50400,
-        "eps": 1e-5,
-        "activation": gelu_new,
-        "Layer": GPTJLayer
-    }
-    model = GPTJModel.load(config, path, state_dict)
-    return model
-
-def init_6b():
-    config = {
-        "n_layer": 28,
-        "n_head": 16,
-        "hidden_dim": 4096,
-        "vocab_dim": 50400,
-        "eps": 1e-5,
-        "activation": gelu_new,
-        "Layer": GPTJLayer
-    }
-    model = GPTJModel.init(config)
-    return model
-
-def init_125m():
-    config = {
-        "n_layer": 12,
-        "n_head": 12,
-        "hidden_dim": 768,
-        "vocab_dim": 50400,
-        "eps": 1e-5,
-        "activation": gelu_new,
-        "Layer": GPTJLayer
-    }
-
-    model = GPTJModel.init(config)
-    return model
-
-def init_1_3b():
-    config = {
-        "n_layer": 24,
-        "n_head": 16,
-        "hidden_dim": 2048,
-        "vocab_dim": 50400,
-        "eps": 1e-5,
-        "activation": gelu_new,
-        "Layer": GPTJLayer
-    }
-
-    model = GPTJModel(**config)
-    return model
\ No newline at end of file
--- a/lm_arch/gpt_arch.py
+++ b/lm_arch/gpt_arch.py
-from torch import nn
-import torch
-import os
-import math
-from lm_arch.utils import *
-
-# Can access and change every module from here, as both Layer class and ff and attn classes are passed from GPTModel.
-class GPTModel(nn.Module):
-    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=None, SelfAttention=None, FeedForward=None, device="cuda", dtype=torch.float16):
-        super(GPTModel, self).__init__()
-        self.hidden_dim = hidden_dim
-        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
-        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
-        self.layers = nn.ModuleList([])
-        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True)
-        for _ in range(n_layer):
-            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
-            #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
-            #TODO: Do we want to pass a config object everywhere? I don't exactly like that but passing a lot of variables is a bit ugly too.
-
-    def forward(self, x, hypernetwork=None, act_ck=False):
-        x = self.vocab_embed(x)
-        for layer in self.layers:
-            x = layer(x, hypernetwork, act_ck)
-        x = self.ln_final(x)
-        return x
-
-    def get_logits(self, x, hypernetwork=None, act_ck=False):
-        x = self.forward(x, hypernetwork=hypernetwork, act_ck=act_ck)
-        x = self.lm_head(x)
-        return x.float()
-    
-    @classmethod
-    def load(cls, config, path=None, state_dict=None):
-        if path:
-            state_dict = SplitCheckpoint(path, device="cuda")
-
-        model = no_init(lambda: cls(**config))
-        model.load_state_dict(state_dict, strict=False)
-        return model
-
-    @classmethod
-    def init(cls, config):
-        model = cls(**config)
-        return model
-
-    @classmethod
-    def neox_init(cls, config):
-        model = cls(**config)
-        modules = [*model.layers[:-1], model.vocab_embed, model.ln_final, model.lm_head]
-        init = small_init_method(config["hidden_dim"])
-        for module in modules:
-            for param in module.parameters():
-                init(param)
-                
-        last_layer = model.layers[-1]
-        last_layer_init = wang_init_method(config["n_layer"], config["hidden_dim"])
-        for param in last_layer.parameters():
-            last_layer_init(param)
-
-        return model
-
-    @classmethod
-    def simple_init(cls, config):
-        model = cls(**config)
-        state = model.state_dict()
-        for k in state:
-            state[k] = state[k] / math.sqrt(2 * config["n_layer"])
-        model.load_state_dict(state)
-
-        return model
-
-    def save(self, path):
-        try: os.mkdir(path)
-        except: pass
-        checkpoint = {}
-        for i, x in enumerate(self.state_dict().items()):
-            checkpoint[x[0]] = f"{path}/b{i}.pt"
-            torch.save(x[1], f"{path}/b{i}.pt")
-        torch.save(checkpoint, f"{path}/m.pt")
-
-def wang_init_method(n_layers, dim):
-    std = 2 / n_layers / math.sqrt(dim)
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
-
-# Stolen from NeoX. For the 20B run wang_init used on the output layer and small_init on rest of the layers.
-def small_init_method(dim):
-    """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving 
-    the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution."""
-    std = math.sqrt(2 / (5 * dim))
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
\ No newline at end of file
--- a/lm_arch/gptj.py
+++ b/lm_arch/gptj.py
@@ -79,7 +79,7 @@ def _attn(query, key, value, causal_mask, masked_bias,
 class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
    def __init__(self, hidden_dim, n_head, device, dtype):
-        super(SelfAttention, self).__init__()
+        super().__init__()
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
            1, 1, max_positions, max_positions).bool()
@@ -143,7 +143,7 @@ class SelfAttention(nn.Module):

 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, activation, device, dtype):
-        super(FeedForward, self).__init__()
+        super().__init__()
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
        self.activation = activation
@@ -159,14 +159,16 @@ class FeedForward(nn.Module):

 class GPTJLayer(nn.Module):
    def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
-        super(GPTJLayer, self).__init__()
+        super().__init__()
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)
+        self.tick = True

-    def forward(self, x, hypernetwork=None, act_ck=False):
+    def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False, diff_hypernets=False, interleaving_layers=False, every_n=5):
        residual = x
+        
        if act_ck:
            x = ck(self.ln_preattn, x)
            attn_out = ck(self.attn, x)
@@ -175,17 +177,35 @@ class GPTJLayer(nn.Module):
            x = self.ln_preattn(x)
            attn_out = self.attn(x)

-        ff_out = self.ff(x, act_ck)
-        x = residual + attn_out + ff_out
        if hypernetwork:
-            hyper_out = hypernetwork(x)
+            if diff_hypernets:
+                if interleaving_layers and layer_id % every_n == 0:
+                    if self.tick:
+                        hyper_out = hypernetwork[0](x)
+                        self.tick = False
+                    else:
+                        hyper_out = hypernetwork[1](x)
+                        self.tick = True
+
+                elif layer_id % every_n == 0:
+                    hyper_out = hypernetwork[(layer_id // every_n) - 1](x)
+
+            else:
+                if layer_id % every_n == 0:
+                    hyper_out = hypernetwork(x)
+
+        ff_out = self.ff(x, act_ck)
+        # Order of addition matters (presumably because floating-point addition is not associative); fixed a bug here.
+        x = attn_out + ff_out + residual
+        # The previous order, x = residual + attn_out + ff_out, doesn't match.
+        if hypernetwork and layer_id % every_n == 0:
            x = x + hyper_out
            
        return x

-class GPTModel(nn.Module):
+class GPTJModel(nn.Module):
    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation, Layer, device, dtype):
-        super(GPTModel, self).__init__()
+        super().__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
@@ -194,25 +214,6 @@ class GPTModel(nn.Module):
        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True)
        for _ in range(n_layer):
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
-            #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
-            #TODO: Do we want to pass a config object everywhere? I don't exactly like that but passing a lot of variables is a bit ugly too.
-
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-        for name, p in module.named_parameters():
-            if ("ff2" in name or "out_proj" in name) and "weight" in name:
-                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-                p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.n_layer)))

    def forward(self, x, hypernetwork=None, act_ck=False):
        x = self.get_embeds(x, hypernetwork=hypernetwork, act_ck=act_ck)

--- a/lm_arch/lm_class.py
+++ b/lm_arch/lm_class.py
@@ -4,6 +4,7 @@ import torch
 from torch import nn
 import os

+# Common BaseLM functionality lives in this class rather than in the torch LM itself.
 class BaseLM(nn.Module):
    def __init__(self, config=None, lm=None):
        self.config = config
@@ -57,6 +58,8 @@ class BaseLM(nn.Module):
    def save(self, path):
        if self.lm is None:
            print("No LM object to save. Please first init a model.")
+            return
+            
        try: os.mkdir(path)
        except: pass
        checkpoint = {}

--- a/lm_arch/nn.py
+++ b/lm_arch/nn.py
--- a/lm_arch/utils.py
+++ b/lm_arch/utils.py
@@ -6,7 +6,6 @@ except ImportError:
 from pathlib import Path
 import os

-
 def no_init(loading_code):
    def dummy(self):
        return