Commit 88f0a90e authored by novelailab's avatar novelailab

fairseq works

parent 24459438
from . import gptj
from . import gpt2
from . import fairseq
MODEL_MAP = {
"gptj": gptj.GPTJModel,
"gpt2": gpt2.GPT2Model,
"gpt-fairseq": fairseq.GPTFairModel
}
def get_model(model_name: str):
......
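# The MODEL_MAP registry above maps config-file model names to their classes.
# get_model's body is truncated in this view, so the lookup below is only an
# illustrative sketch of how such a registry is typically consumed, not the
# committed implementation.
#
#   def get_model_sketch(model_name: str):
#       if model_name not in MODEL_MAP:
#           raise ValueError(f"unknown model class: {model_name}")
#       return MODEL_MAP[model_name]
#
#   get_model_sketch("gpt-fairseq")   # -> fairseq.GPTFairModel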
......@@ -17,12 +17,13 @@ class BaseModel(nn.Module):
self.ln_final = nn.LayerNorm(self.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.layers = nn.ModuleList([])
self.lm_head = nn.Linear(config.hidden_dim, config.vocab_dim, bias=True)
for _ in range(config.n_layer):
for i in range(config.n_layer):
self.layers.append(
config.Layer(
attn=config.SelfAttention,
ff=config.FeedForward,
config=config,
layer_idx=i,
)
)
......
import torch
import torch.nn as nn
import torch.nn.functional as F
from basedformer.utils import *
from torch.utils.checkpoint import checkpoint as ck
from einops import rearrange, repeat
try:
from collections.abc import MutableMapping
except ImportError:
from collections import MutableMapping
import os
from pathlib import Path
import math
from basedformer.models import base_lm
from typing import Optional, Any
def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
"""Replace non-padding symbols with their position numbers.
Position numbers begin at padding_idx+1. Padding symbols are ignored.
"""
# The series of casts and type-conversions here are carefully
# balanced to both work with ONNX export and XLA. In particular XLA
# prefers ints, cumsum defaults to output longs, and ONNX doesn't know
# how to handle the dtype kwarg in cumsum.
mask = tensor.ne(torch.tensor(50257, requires_grad=False)).int()
return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
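# Illustrative check (not part of the original file): with padding_idx=1, pad
# slots keep padding_idx and real tokens count up from padding_idx + 1. Note
# that the pad id is hard-coded to 50257 here -- the id the conversion script
# remaps fairseq's pad token to -- rather than being read from padding_idx.
#
#   tokens = torch.tensor([[50257, 11, 257, 50257, 262]])
#   make_positions(tokens, padding_idx=1)
#   # -> tensor([[1, 2, 3, 1, 4]])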
class SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length.
Padding symbols are ignored.
"""
def __init__(self, embedding_dim, padding_idx, init_size=1024):
super().__init__()
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx if padding_idx is not None else 0
self.weights = SinusoidalPositionalEmbedding.get_embedding(
init_size, embedding_dim, padding_idx
)
self.onnx_trace = False
self.register_buffer("_float_tensor",
torch.tensor(1.0, requires_grad=False).float())
self.max_positions = int(1e5)
# print(embedding_dim, padding_idx, init_size)
def prepare_for_onnx_export_(self):
self.onnx_trace = True
@staticmethod
def get_embedding(
num_embeddings: int, embedding_dim: int,
padding_idx: Optional[int] = None
):
"""Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly
from the description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(
1
) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(
num_embeddings, -1
)
if embedding_dim % 2 == 1:
# zero pad
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb
def forward(
self,
input,
incremental_state: Optional[Any] = None,
timestep: Optional[torch.Tensor] = None,
positions: Optional[Any] = None,
offset: Optional[int] = 0
):
"""Input is expected to be of size [bsz x seqlen]."""
bspair = input.shape
bsz, seq_len = bspair[0], bspair[1]
max_pos = self.padding_idx + 1 + seq_len + offset
# print("max_pos: " + str(max_pos))
if self.weights is None or max_pos > self.weights.size(0):
# print("recomputing embeddings")
# recompute/expand embeddings if needed
self.weights = SinusoidalPositionalEmbedding.get_embedding(
max_pos, self.embedding_dim, self.padding_idx + offset
)
self.weights = self.weights.to(self._float_tensor)
if incremental_state is not None:
# positions is the same for every token when decoding a single step
pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
if self.onnx_trace:
return (
self.weights.index_select(
index=self.padding_idx + pos + offset, dim=0)
.unsqueeze(1)
.repeat(bsz, 1, 1)
)
return self.weights[self.padding_idx + pos + offset, :].expand(bsz,
1,
-1)
positions = make_positions(
input, self.padding_idx + offset, onnx_trace=self.onnx_trace
)
if self.onnx_trace:
flat_embeddings = self.weights.detach().index_select(0,
positions.view(
-1))
embedding_shape = torch.cat(
(bsz.view(1), seq_len.view(1),
torch.tensor([-1], dtype=torch.long))
)
embeddings = torch.onnx.operators.reshape_from_tensor_shape(
flat_embeddings, embedding_shape
)
return embeddings
return (
self.weights.index_select(0, positions.view(-1))
.view(bsz, seq_len, -1)
.detach()
)
def PositionalEmbedding(
num_embeddings: int,
embedding_dim: int,
padding_idx: int,
):
m = SinusoidalPositionalEmbedding(
embedding_dim,
padding_idx,
init_size=num_embeddings + padding_idx + 1,
)
return m
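# Usage sketch (illustrative): PositionalEmbedding builds an
# (init_size, embedding_dim) sinusoidal table with the padding_idx row zeroed,
# and its forward pass returns one embedding per input position, shaped
# (bsz, seq_len, embedding_dim).
#
#   pos_embed = PositionalEmbedding(num_embeddings=2049, embedding_dim=768, padding_idx=1)
#   ids = torch.randint(0, 50256, (2, 16))
#   pos_embed(ids).shape                        # torch.Size([2, 16, 768])
#   pos_embed(ids[:, -1:], offset=16).shape     # torch.Size([2, 1, 768]), one cached decode step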
def _attn(query, key, value, causal_mask, masked_bias,
attention_mask=None, scale_attn=None, fp32_attn=True):
if fp32_attn:
attn_weights = torch.matmul(query.float(), key.transpose(-1, -2).float())
else:
attn_weights = torch.matmul(query, key.transpose(-1, -2))
attn_weights = torch.where(causal_mask, attn_weights, masked_bias.to(attn_weights.dtype))
attn_weights = attn_weights / scale_attn.to(attn_weights.dtype)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = F.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_output = torch.matmul(attn_weights, value).to(value.dtype)
return attn_output
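# Shape sketch (illustrative): query/key/value arrive as
# (batch, n_head, seq, head_dim), the causal mask is a bool slice of the
# lower-triangular "bias" buffer, and the output keeps the query's shape.
#
#   B, H, S, D = 1, 2, 4, 8
#   q, k, v = (torch.randn(B, H, S, D) for _ in range(3))
#   causal = torch.tril(torch.ones(S, S, dtype=torch.bool)).view(1, 1, S, S)
#   out = _attn(q, k, v, causal, torch.tensor(-1e9),
#               scale_attn=torch.sqrt(torch.tensor(float(D))))
#   out.shape                                   # torch.Size([1, 2, 4, 8])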
class SelfAttention(nn.Module):
# Code copied from HF, might want to sanity check later.
def __init__(self, config):
nn.Module.__init__(self)
self.config = config
max_positions = 2049
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
1, 1, max_positions, max_positions).bool()
self.head_dim = config.hidden_dim // config.n_head
self.rotary_dim = self.head_dim // 4
self.hidden_dim = config.hidden_dim
self.n_head = config.n_head
device = config.device
dtype = config.dtype
self.register_buffer("scale_attn", torch.sqrt(torch.tensor(self.head_dim, requires_grad=False).float()))
self.register_buffer("bias", bias)
self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False)) #-1e10 is what mtj uses.
attn_bias = True #fairseq has attn_bias
self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
def forward(self, x, kv=None, cache=False):
B, S, H = x.shape # batch, sequence, hidden_dim
# split heads into: [batch, head, sequence, head_dim]
query = self.q_proj(x).view(B, S, self.n_head, self.head_dim).transpose(1, 2)
key = self.k_proj(x).view(B, S, self.n_head, self.head_dim).transpose(1, 2)
value = self.v_proj(x).view(B, S, self.n_head, self.head_dim).transpose(1, 2)
if kv:
k, v = kv
# cat key and value (get the whole sequence, other than the last added token all are cached),
# so query can attend to it.
            key = torch.cat([k, key], dim=-2)      # cat cached + new key
            value = torch.cat([v, value], dim=-2)  # cat cached + new value
query_length, key_length = query.size(-2), key.size(-2) # seq_len, seq_len
causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length]
x = _attn(
query, key, value, causal_mask, self.masked_bias, None, self.scale_attn, self.config.fp32_attn
)
x = x.transpose(1, 2).contiguous().view(B, S, H)
x = self.out_proj(x)
if cache:
return x, (key, value)
else:
return x, None
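# Cache contract (sketch): with cache=True the module returns the concatenated
# (key, value) tensors of shape (batch, n_head, seq_so_far, head_dim); feeding
# them back as kv on the next call lets a single-token query attend to the
# whole cached prefix. prompt_hidden / next_token_hidden are illustrative names.
#
#   attn = SelfAttention(config)
#   out, kv = attn(prompt_hidden, cache=True)             # prime on the prompt
#   out, kv = attn(next_token_hidden, kv=kv, cache=True)  # one decode step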
class FeedForward(nn.Module):
def __init__(self, config):
nn.Module.__init__(self)
self.ff1 = nn.Linear(config.hidden_dim, config.hidden_dim * 4, device=config.device, dtype=config.dtype)
self.ff2 = nn.Linear(config.hidden_dim * 4, config.hidden_dim, device=config.device, dtype=config.dtype)
self.activation = config.activation
def forward(self, x, act_ck=False):
x = self.ff1(x)
if act_ck:
x = ck(self.activation, x)
else:
x = self.activation(x)
x = self.ff2(x)
return x
class GPTFairLayer(nn.Module):
def __init__(self, attn, ff, config):
nn.Module.__init__(self)
self.hidden_dim = config.hidden_dim
self.ln_preattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ln_postattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ff = ff(config)
self.attn = attn(config)
self.tick = True
def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False, cache=False, kv=None):
residual = x
if act_ck:
x = ck(self.ln_preattn, x)
attn_out, kv = ck(self.attn, x, kv=kv, cache=cache)
else:
x = self.ln_preattn(x)
attn_out, kv = self.attn(x, kv=kv, cache=cache)
x = residual + attn_out
residual = x
x = self.ln_postattn(x)
ff_out = self.ff(x, act_ck)
x = residual + ff_out
return x, kv
class GPTFairModel(base_lm.BaseModel):
def __init__(self, user_config, **kwargs):
self.default_config = {
'n_layer': 6,
'n_head': 8,
'n_tokens': 2049,
'hidden_dim': 512,
'vocab_dim': 50400,
'fp32_attn': True, #fairseq models are trained with fp32 attn
'eps': 1e-5,
'device': torch.device('cuda'),
'dtype': torch.float16,
'Layer': GPTFairLayer,
'activation': F.gelu,
'SelfAttention': SelfAttention,
'FeedForward': FeedForward,
}
base_lm.BaseModel.__init__(self, user_config, **kwargs)
# returns sinusoidal embeddings of shape: (1, n_tokens, 768)
self.register_buffer("embed_scale", torch.sqrt(torch.tensor(self.config.hidden_dim, requires_grad=False)))
self.pos_embed = PositionalEmbedding(self.config.n_tokens, self.config.hidden_dim, 1)
self.lm_head = nn.Linear(self.config.hidden_dim, self.config.vocab_dim, bias=False)
#bias=False for fairseq models
def get_embeds(self, x, hypernetwork=None, act_ck=False, kv=None, cache=False):
if kv is None:
kv = [None] * self.n_layer
past_length = 0
else:
past_length = kv[0][0].size(-2) #get sequence dim of key
kv_new = []
position_embeds = self.pos_embed(x, offset=past_length)
input_embeds = self.vocab_embed(x) * self.embed_scale
x = position_embeds + input_embeds
for layer_id, layer in enumerate(self.layers):
x, kvi = layer(x, layer_id=layer_id, hypernetwork=hypernetwork, act_ck=act_ck, kv=kv[layer_id], cache=cache)
kv_new.append(kvi)
x = self.ln_final(x)
if cache:
return x, kv_new
else:
return x, None
\ No newline at end of file
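# Rough end-to-end sketch (assumes base_lm.BaseModel merges user_config over
# default_config and builds vocab_embed/layers/ln_final as the base_lm hunk
# earlier in this commit suggests; dimensions are illustrative, not a real
# checkpoint):
#
#   model = GPTFairModel({'n_layer': 12, 'n_head': 12, 'hidden_dim': 768,
#                         'vocab_dim': 51200}).cuda().half().eval()
#   ids = torch.randint(0, 50256, (1, 32)).cuda()
#   hidden, kv = model.get_embeds(ids, cache=True)   # (1, 32, 768), per-layer (k, v)
#   logits = model(ids)                              # (1, 32, 51200), as in the comparison script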
......@@ -57,15 +57,18 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
# Code copied from HF, might want to sanity check later.
def __init__(self, hidden_dim, n_head, device, dtype):
def __init__(self, config):
nn.Module.__init__(self)
max_positions = 2049
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
1, 1, max_positions, max_positions).bool()
self.head_dim = hidden_dim // n_head
self.head_dim = config.hidden_dim // config.n_head
self.rotary_dim = self.head_dim // 4
self.hidden_dim = hidden_dim
self.n_head = n_head
self.hidden_dim = config.hidden_dim
self.n_head = config.n_head
device = config.device
dtype = config.dtype
self.register_buffer("scale_attn", torch.sqrt(torch.tensor(self.head_dim, requires_grad=False).float()))
self.register_buffer("bias", bias)
self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False)) #-1e10 is what mtj uses.
......@@ -101,14 +104,14 @@ class SelfAttention(nn.Module):
if cache:
return x, (key, value)
else:
return x
return x, None
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, activation, device, dtype):
def __init__(self, config):
nn.Module.__init__(self)
self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
self.activation = activation
self.ff1 = nn.Linear(config.hidden_dim, config.hidden_dim * 4, device=config.device, dtype=config.dtype)
self.ff2 = nn.Linear(config.hidden_dim * 4, config.hidden_dim, device=config.device, dtype=config.dtype)
self.activation = config.activation
def forward(self, x, act_ck=False):
x = self.ff1(x)
......@@ -120,39 +123,39 @@ class FeedForward(nn.Module):
return x
class GPT2Layer(nn.Module):
def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
def __init__(self, attn, ff, config):
nn.Module.__init__(self)
self.hidden_dim = hidden_dim
self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
self.ln_postattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)
self.hidden_dim = config.hidden_dim
self.ln_preattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ln_postattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ff = ff(config)
self.attn = attn(config)
self.tick = True
def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False):
def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False, cache=False, kv=None):
residual = x
if act_ck:
x = ck(self.ln_preattn, x)
attn_out = ck(self.attn, x)
attn_out, kv = ck(self.attn, x, kv=kv, cache=cache)
else:
x = self.ln_preattn(x)
attn_out = self.attn(x)
attn_out, kv = self.attn(x, kv=kv, cache=cache)
        x = residual + attn_out
        residual = x
        x = self.ln_postattn(x)
ff_out = self.ff(x, act_ck)
x = residual + ff_out
return x
return x, kv
class GPT2Model(base_lm.BaseModel):
def __init__(self, user_config, **kwargs):
self.default_config = {
'n_layer': 6,
'n_head': 8,
'n_tokens': 2048,
'n_tokens': 1024,
'hidden_dim': 512,
'vocab_dim': 50400,
'eps': 1e-5,
......@@ -163,4 +166,30 @@ class GPT2Model(base_lm.BaseModel):
'SelfAttention': SelfAttention,
'FeedForward': FeedForward,
}
base_lm.BaseModel.__init__(self, user_config, **kwargs)
\ No newline at end of file
base_lm.BaseModel.__init__(self, user_config, **kwargs)
self.pos_embed = nn.Embedding(self.config.n_tokens, self.config.hidden_dim)
def get_embeds(self, x, hypernetwork=None, act_ck=False, kv=None, cache=False):
if kv is None:
kv = [None] * self.n_layer
past_length = 0
else:
past_length = kv[0][0].size(-2) #get sequence dim of key
        position_ids = torch.arange(past_length, x.shape[-1] + past_length, dtype=torch.long, device=x.device)
        position_ids = position_ids.unsqueeze(0).view(-1, x.shape[-1])
kv_new = []
x = self.vocab_embed(x)
x = x + self.pos_embed(position_ids)
for layer_id, layer in enumerate(self.layers):
x, kvi = layer(x, layer_id=layer_id, hypernetwork=hypernetwork, act_ck=act_ck, kv=kv[layer_id], cache=cache)
kv_new.append(kvi)
x = self.ln_final(x)
if cache:
return x, kv_new
else:
return x, None
\ No newline at end of file
......@@ -48,13 +48,13 @@ remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
env1.sh('pip3 install dotmap')
#env1.sh('pip3 install git+https://github.com/pytorch/fairseq')
env1.sh('pip3 install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
with always_rerun():
env1.sh('pip uninstall transformers')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
with always_rerun():
if args.bash:
......
......@@ -40,7 +40,9 @@ if False:
#path.sh("pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113")
with always_rerun():
if True:
path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/gptj-6b --device 0 --tasks lambada")
env1.sh('pip3 uninstall transformers')
env1.sh('pip3 install transformers')
path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m --device 0 --tasks lambada --no_cache")
#path.sh("python3 ../lm-evaluation-harness/main.py --batch_size 8")
else:
......
from basedformer.utils import *
import basedformer.lm_utils as lmu
from fairseq.models.transformer_lm import TransformerLanguageModel
import time
import torch
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
import torch.nn.functional as F
from transformers import GPTNeoForCausalLM
from icecream import ic
#replicating timeit magic function of ipython
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
precision = 'ns'
r_arr = np.empty([2, r]) # [0] = mean, [1] = std
if function:
func.__name__ = function.__name__
for i in tqdm(range(r)) if do_tqdm else range(r):
n_arr = np.empty(n)
for k in range(n):
start = perf_counter_ns()
func()
n_arr[k] = perf_counter_ns() - start
if not first:
# delete the first element from n_arr numpy array
n_arr = np.delete(n_arr, 0)
r_arr[0, i] = np.mean(n_arr)
r_arr[1, i] = np.std(n_arr)
best = r_arr[:, np.argmin(r_arr[0])] # [0] = mean, [1] = std
#check if best[0] bigger than 1ms in numpy
if best[0] < 1e3:
precision = 'ns'
    elif best[0] >= 1e9:
best[0] = best[0] * 1e-9
best[1] = best[1] * 1e-9
precision = 's'
elif best[0] >= 1e6:
best[0] = best[0] * 1e-6
best[1] = best[1] * 1e-6
precision = 'ms'
elif best[0] >= 1e3:
precision = 'μs'
best[0] = best[0] * 1e-3
best[1] = best[1] * 1e-3
if not quiet:
if precision == 'ns':
print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
if precision == 'μs':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 'ms':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 's':
print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
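# Usage sketch for the timeit helper above (kept as a comment so the benchmark
# script's behaviour is unchanged); it assumes the models and input defined in
# the block below have already been loaded:
#
#   timeit(lambda: hf_model(x), r=3, n=10)      # prints "<lambda>: ... per loop (mean ± std. dev. ...)"
#   timeit(lambda: based_model(x), r=3, n=10, function=based_model.forward)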
with torch.no_grad():
model_dir = '/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/hf_125m/'
hf_model = no_init(lambda: GPTNeoForCausalLM.from_pretrained(model_dir)).cuda().half().eval()
print("Loaded hf model")
path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m"
based_model = lmu.load_from_path(path).cuda().half().eval()
print("Loaded based model")
x = torch.randint(0, 50256, (1, 2048)).cuda().long()
assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
hidden = hf_model.transformer.wte(x)
for layer in range(len(based_model.layers)):
ic(layer)
residual = hidden
#ln_preattn
assert torch.allclose(hf_model.transformer.h[layer].ln_1(hidden), based_model.layers[layer].ln_preattn(hidden))
hidden = hf_model.transformer.h[layer].ln_1(hidden)
#attn
ic(hf_model.transformer.h[layer].attn(hidden)[0].abs().mean())
ic(based_model.layers[layer].attn(hidden)[0].abs().mean())
ic((hf_model.transformer.h[layer].attn(hidden)[0] - based_model.layers[layer].attn(hidden)[0]).abs().mean())
#assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0], rtol=1e-6)
attn_out = hf_model.transformer.h[layer].attn(hidden)[0]
hidden = residual + attn_out
residual = hidden
assert torch.allclose(hf_model.transformer.h[layer].ln_2(hidden), based_model.layers[layer].ln_postattn(hidden))
hidden = hf_model.transformer.h[layer].ln_2(hidden)
#ffn
assert torch.allclose(hf_model.transformer.h[layer].mlp(hidden), based_model.layers[layer].ff(hidden))
ff_out = hf_model.transformer.h[layer].mlp(hidden)
hidden = residual + ff_out
assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden)[0])
ic(hf_model(x)["logits"].abs().mean())
ic(based_model(x).abs().mean())
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x)[0])
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
from basedformer import gptj
from basedformer.utils import *
import basedformer.lm_utils as lmu
import time
import torch
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
#replicating timeit magic function of ipython
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
precision = 'ns'
r_arr = np.empty([2, r]) # [0] = mean, [1] = std
if function:
func.__name__ = function.__name__
for i in tqdm(range(r)) if do_tqdm else range(r):
n_arr = np.empty(n)
for k in range(n):
start = perf_counter_ns()
func()
n_arr[k] = perf_counter_ns() - start
if not first:
# delete the first element from n_arr numpy array
n_arr = np.delete(n_arr, 0)
r_arr[0, i] = np.mean(n_arr)
r_arr[1, i] = np.std(n_arr)
best = r_arr[:, np.argmin(r_arr[0])] # [0] = mean, [1] = std
#check if best[0] bigger than 1ms in numpy
if best[0] < 1e3:
precision = 'ns'
    elif best[0] >= 1e9:
best[0] = best[0] * 1e-9
best[1] = best[1] * 1e-9
precision = 's'
elif best[0] >= 1e6:
best[0] = best[0] * 1e-6
best[1] = best[1] * 1e-6
precision = 'ms'
elif best[0] >= 1e3:
precision = 'μs'
best[0] = best[0] * 1e-3
best[1] = best[1] * 1e-3
if not quiet:
if precision == 'ns':
print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
if precision == 'μs':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 'ms':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 's':
print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
with torch.no_grad():
hf_model = no_init(lambda: AutoModelForCausalLM.from_pretrained('gpt2')).cuda().half().eval()
print("Loaded hf model")
path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/gpt2"
based_model = lmu.load_from_path(path).cuda().half().eval()
print("Loaded based model")
x = torch.randint(0, 50256, (1, 1000)).cuda().long()
assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
hidden = hf_model.transformer.wte(x)
    for layer in range(len(based_model.layers)):
assert torch.allclose(hf_model.transformer.h[layer].ln_1(hidden), based_model.layers[layer].ln_preattn(hidden))
hidden = hf_model.transformer.h[layer].ln_1(hidden)
assert torch.allclose(hf_model.transformer.h[layer].ln_2(hidden), based_model.layers[layer].ln_postattn(hidden))
hidden = hf_model.transformer.h[layer].ln_2(hidden)
hf_mlp = hf_model.transformer.h[layer].mlp(hidden)
based_mlp = based_model.layers[layer].ff(hidden)
print((hf_mlp - based_mlp).abs().sum())
hidden = hf_mlp
hf_attn = hf_model.transformer.h[layer].attn(hidden)[0]
based_attn = based_model.layers[layer].attn(hidden)[0]
print((hf_attn - based_attn).abs().sum())
assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0])
hidden = hf_model.transformer.h[layer].attn(hidden)[0]
assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden)[0])
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x)[0])
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
import json
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
import sys
import os
copy_eot_to_newline = True
copy_newline_to_eot = True
model_dir = 'pretrained/en_dense_lm_125m' # path to smol model weights to fix tokenizer shuffle
checkpoint = {}
ckmap = {}
ckid = 0
def save(params, name):
global ckid
ckmap[name] = f"b{ckid}.pt"
ckid += 1
torch.save(params, f"{sys.argv[2]}/" + ckmap[name])
torch.save(ckmap, f"{sys.argv[2]}/m.pt")
print(name + ": " + str(params.shape))
del params
def no_init(loading_code):
def dummy(self):
return
modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
original = {}
for mod in modules:
original[mod] = mod.reset_parameters
mod.reset_parameters = dummy
result = loading_code()
for mod in modules:
mod.reset_parameters = original[mod]
return result
lm = no_init(lambda: TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2').eval().cpu())
fairdict = torch.load(f"{sys.argv[1]}", map_location="cpu")
try:
os.mkdir(sys.argv[2])
except:
pass
hidden_dim = fairdict["cfg"]["model"]["decoder_embed_dim"]
num_heads = fairdict["cfg"]["model"]["decoder_attention_heads"]
num_layers = fairdict["cfg"]["model"]["decoder_layers"]
fairdict = fairdict["model"]
config = {
"activation_function": "gelu",
"architectures": ["GPTNeoForCausalLM"],
"attention_dropout": 0,
"attention_layers": ["global"] * num_layers,
"attention_types": [[["global"], num_layers]],
"bos_token_id": 50256,
"embed_dropout": 0,
"eos_token_id": 50256,
"gradient_checkpointing": False,
"hidden_size": hidden_dim,
"initializer_range": 0.02,
"intermediate_size": None,
"fair": True,
"layer_norm_epsilon": 1e-05,
"max_position_embeddings": 2048,
"model_type": "gpt_neo",
"num_heads": num_heads,
"num_layers": num_layers,
"resid_dropout": 0,
"rotary": False,
"summary_activation": None,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": True,
"summary_type": "cls_index",
"summary_use_proj": True,
"model_dtype": "fp16",
"model_device": "cuda",
"transformers_version": "4.6.0.dev0",
"use_cache": True,
"vocab_size": 51200,
"window_size": 256,
"tokenizer_class": "GPT2Tokenizer",
"task_specific_params": {"text-generation": {"do_sample": True,"temperature": 1.0,"max_length": 50}}
}
with open(f"{sys.argv[2]}/config.json", "w") as fh:
fh.write(json.dumps(config))
#print(lm)
def hack_embs(embs):
eot = embs[50256].clone()
newline = embs[198].clone()
if copy_eot_to_newline:
embs[198] = eot
if copy_newline_to_eot:
embs[50256] = newline
# gpt2 compatible input/output embedding layers
l1 = []
l2 = []
check = {}
for i in range(50256):
check[i] = True
for i, s in enumerate(lm.tgt_dict.symbols):
try:
if str(int(s)) == s and s != '50256':
l2.append(int(s))
l1.append(i)
del check[int(s)]
except:
pass
for i, s in enumerate([lm.tgt_dict.eos_word, lm.tgt_dict.pad_word, lm.tgt_dict.bos_word, lm.tgt_dict.unk_word]):
l2.append(50256 + i)
l1.append(lm.tgt_dict.indices[s])
mapping = {}
for i in range(50260):
mapping[l1[i]] = l2[i]
with torch.no_grad():
wte = fairdict["decoder.embed_tokens.weight"].clone()
for i in range(50260):
wte[mapping[i]] = fairdict["decoder.embed_tokens.weight"][i]
hack_embs(wte)
save(wte.half(), "transformer.wte.weight")
lm_head = fairdict["decoder.output_projection.weight"].clone()
for i in range(50260):
lm_head[mapping[i]] = fairdict["decoder.output_projection.weight"][i]
hack_embs(lm_head)
save(lm_head.half(), "lm_head.weight")
save(torch.FloatTensor(1), "transformer.wpe_sin._float_tensor")
new_state_dict = {}
for y in fairdict:
    trans_to = "Passed"  # default: skip keys that have no mapping below
    dotlist = y.split(".")
    if y == "decoder.version":
        pass
elif y == "decoder.embed_tokens.weight":
continue
elif len(dotlist) >= 2 and dotlist[1] == "layers":
layer_id = dotlist[2]
if dotlist[-2] in ["k_proj", "v_proj", "q_proj", "out_proj"]:
trans_to = f"transformer.h.{layer_id}.attn.attention.{dotlist[-2]}.{dotlist[-1]}"
if dotlist[-2] == "self_attn_layer_norm":
trans_to = f"transformer.h.{layer_id}.ln_1.{dotlist[-1]}"
if dotlist[3] == "fc1":
trans_to = f"transformer.h.{layer_id}.mlp.c_fc.{dotlist[-1]}"
if dotlist[3] == "fc2":
trans_to = f"transformer.h.{layer_id}.mlp.c_proj.{dotlist[-1]}"
if dotlist[3] == "final_layer_norm":
trans_to = f"transformer.h.{layer_id}.ln_2.{dotlist[-1]}"
elif len(dotlist) >= 2 and dotlist[1] == "layer_norm":
trans_to = f"transformer.ln_f.{dotlist[-1]}"
elif y == "decoder.output_projection.weight":
continue
if trans_to != "Passed":
save(fairdict[y].half(), trans_to)
print(f"{trans_to} < {y}")
\ No newline at end of file
import json
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
import sys
import os
from pathlib import Path
copy_eot_to_newline = True
copy_newline_to_eot = True
model_dir = 'pretrained/en_dense_lm_125m' # path to smol model weights to fix tokenizer shuffle
checkpoint = {}
ckmap = {}
ckid = 0
def save(params, name):
global ckid
ckmap[name] = f"b{ckid}.pt"
ckid += 1
path = Path(f"{sys.argv[2]}/lm")
path.mkdir(parents=True, exist_ok=True)
torch.save(params, path / ckmap[name])
torch.save(ckmap, path / "m.pt")
print(name + ": " + str(params.shape))
del params
def no_init(loading_code):
def dummy(self):
return
modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
original = {}
for mod in modules:
original[mod] = mod.reset_parameters
mod.reset_parameters = dummy
result = loading_code()
for mod in modules:
mod.reset_parameters = original[mod]
return result
lm = no_init(lambda: TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2').eval().cpu())
fairdict = torch.load(f"{sys.argv[1]}", map_location="cpu")
try:
os.mkdir(sys.argv[2])
except:
pass
hidden_dim = fairdict["cfg"]["model"]["decoder_embed_dim"]
num_heads = fairdict["cfg"]["model"]["decoder_attention_heads"]
num_layers = fairdict["cfg"]["model"]["decoder_layers"]
fairdict = fairdict["model"]
config = {
"model_class": "gpt-fairseq",
"model_path": ".",
"model_config": {
"n_layer": num_layers,
"n_head": num_heads,
"hidden_dim": hidden_dim,
"vocab_dim": 51200,
"eps": 1e-05,
"n_tokens": 2049
}
}
with open(f"{sys.argv[2]}/config.json", "w") as fh:
fh.write(json.dumps(config))
#print(lm)
def hack_embs(embs):
eot = embs[50256].clone()
newline = embs[198].clone()
if copy_eot_to_newline:
embs[198] = eot
if copy_newline_to_eot:
embs[50256] = newline
# gpt2 compatible input/output embedding layers
l1 = []
l2 = []
check = {}
for i in range(50256):
check[i] = True
for i, s in enumerate(lm.tgt_dict.symbols):
try:
if str(int(s)) == s and s != '50256':
l2.append(int(s))
l1.append(i)
del check[int(s)]
except:
pass
for i, s in enumerate([lm.tgt_dict.eos_word, lm.tgt_dict.pad_word, lm.tgt_dict.bos_word, lm.tgt_dict.unk_word]):
l2.append(50256 + i)
l1.append(lm.tgt_dict.indices[s])
mapping = {}
for i in range(50260):
mapping[l1[i]] = l2[i]
with torch.no_grad():
wte = fairdict["decoder.embed_tokens.weight"].clone()
for i in range(50260):
wte[mapping[i]] = fairdict["decoder.embed_tokens.weight"][i]
hack_embs(wte)
save(wte.half(), "vocab_embed.weight")
lm_head = fairdict["decoder.output_projection.weight"].clone()
for i in range(50260):
lm_head[mapping[i]] = fairdict["decoder.output_projection.weight"][i]
hack_embs(lm_head)
save(lm_head.half(), "lm_head.weight")
save(torch.FloatTensor(1), "pos_embed._float_tensor")
new_state_dict = {}
for y in fairdict:
    trans_to = "Passed"  # default: skip keys that have no mapping below
    dotlist = y.split(".")
    if y == "decoder.version":
        pass
elif y == "decoder.embed_tokens.weight":
continue
elif len(dotlist) >= 2 and dotlist[1] == "layers":
layer_id = dotlist[2]
if dotlist[-2] in ["k_proj", "v_proj", "q_proj", "out_proj"]:
trans_to = f"layers.{layer_id}.attn.{dotlist[-2]}.{dotlist[-1]}"
if dotlist[-2] == "self_attn_layer_norm":
trans_to = f"layers.{layer_id}.ln_preattn.{dotlist[-1]}"
if dotlist[3] == "fc1":
trans_to = f"layers.{layer_id}.ff.ff1.{dotlist[-1]}"
if dotlist[3] == "fc2":
trans_to = f"layers.{layer_id}.ff.ff2.{dotlist[-1]}"
if dotlist[3] == "final_layer_norm":
trans_to = f"layers.{layer_id}.ln_postattn.{dotlist[-1]}"
elif len(dotlist) >= 2 and dotlist[1] == "layer_norm":
trans_to = f"ln_final.{dotlist[-1]}"
elif y == "decoder.output_projection.weight":
continue
if trans_to != "Passed":
save(fairdict[y].half(), trans_to)
print(f"{trans_to} < {y}")
\ No newline at end of file
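# Layout note plus a hypothetical reader (the real loading path is presumably
# basedformer.lm_utils.load_from_path, as used in the comparison scripts): this
# converter writes <out_dir>/config.json and <out_dir>/lm/m.pt, where m.pt maps
# parameter names to per-tensor shard files b{i}.pt in the same directory.
#
#   def read_sharded(out_dir):
#       lm_dir = Path(out_dir) / "lm"
#       ckmap = torch.load(lm_dir / "m.pt")
#       return {name: torch.load(lm_dir / fname, map_location="cpu")
#               for name, fname in ckmap.items()}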
import torch
import transformers
import sys
from icecream import ic
import os
"""
Original:
......@@ -22,23 +24,27 @@ h.0.mlp.c_fc.bias
h.0.mlp.c_proj.weight
h.0.mlp.c_proj.bias
attn has biases unlike GPT-J. QKV Matrices are also merged instead of separate. what is the order though?
attn has biases unlike GPT-J. QKV Matrices are also merged instead of separate. what is the order though? probably just QKV.
"""
x = torch.load("models/gpt2_vanilla/pytorch_model.bin")
state_dict = x
print(x["h.0.attn.c_attn.weight"].reshape(-1, 768, 768).shape)
sys.exit(0)
ic(x["h.0.attn.c_attn.weight"].shape)
ic(x["h.0.attn.c_attn.bias"].shape)
ic(x["h.0.attn.c_proj.weight"].shape)
ic(x["h.0.attn.c_proj.bias"].shape)
new_state_dict = {}
module_map = {
"ln_1": "ln_preattn",
"ln_2": "ln_postattn",
"mlp.c_proj": "ff.ff2",
"mlp.c_fc": "ff.ff1",
"attn.attention.out_proj": "attn.out_proj",
"attn.attention.k_proj": "attn.k_proj",
"attn.attention.v_proj": "attn.v_proj",
"attn.attention.q_proj": "attn.q_proj",
"attn.c_proj": "attn.out_proj",
"attn.c_attn": "attn.k_proj",
"wte": "vocab_embed",
"wpe": "pos_embed",
'ln_f': 'ln_final',
'lm_head': 'lm_head',
}
......@@ -47,18 +53,38 @@ print(type(state_dict))
for key in state_dict.keys():
dotlist = key.split('.')
if len(dotlist) > 3:
layer = dotlist[2]
layer = dotlist[1]
for x in module_map:
if x in key:
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
                if x == "attn.c_attn":
                    if "weight" in key:
                        # HF GPT-2 stores c_attn as a Conv1D weight of shape
                        # (hidden_dim, 3 * hidden_dim): split along the last
                        # dim into Q, K, V, then transpose each block so it
                        # matches nn.Linear's (out, in) layout.
                        hidden_dim = state_dict[key].shape[0]
                        q, k, v = state_dict[key].split(hidden_dim, dim=-1)
                        new_state_dict[f"layers.{layer}.attn.q_proj.weight"] = q.transpose(-1, -2)
                        new_state_dict[f"layers.{layer}.attn.k_proj.weight"] = k.transpose(-1, -2)
                        new_state_dict[f"layers.{layer}.attn.v_proj.weight"] = v.transpose(-1, -2)
                    if "bias" in key:
                        # c_attn bias is (3 * hidden_dim,) with q, k, v in order.
                        hidden_dim = state_dict[key].shape[0] // 3
                        q, k, v = state_dict[key].split(hidden_dim, dim=-1)
                        new_state_dict[f"layers.{layer}.attn.q_proj.bias"] = q
                        new_state_dict[f"layers.{layer}.attn.k_proj.bias"] = k
                        new_state_dict[f"layers.{layer}.attn.v_proj.bias"] = v
else:
if len(state_dict[key].shape) == 2:
ic("transpose!")
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key].transpose(-1, -2)
else:
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
else:
for x in module_map:
if x in key:
new_state_dict[f"{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> {module_map[x]}.{dotlist[-1]}")
#print(new_state_dict)
for k, v in new_state_dict.items():
print(f"{k} -> {v.shape}")
def save(state_dict, path):
try: os.mkdir(path)
......@@ -69,4 +95,4 @@ def save(state_dict, path):
torch.save(x[1], f"{path}/b{i}.pt")
torch.save(checkpoint, f"{path}/m.pt")
save(new_state_dict, "models/6b_vanilla")
\ No newline at end of file
save(new_state_dict, "pretrained/gpt2/lm")
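# The converted directory is loaded through basedformer.lm_utils.load_from_path
# in the comparison scripts above; a matching config.json for gpt2 is not shown
# in this commit, so the call below assumes one exists alongside the lm/ folder.
#
#   import basedformer.lm_utils as lmu
#   based_model = lmu.load_from_path("pretrained/gpt2").cuda().half().eval()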