Commit b39e6d1b authored by novelailab

update

parent 40e90836
from typing import Callable, KeysView
-from regex import D
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -156,7 +155,8 @@ class GPTJLayer(nn.Module):
        residual = x
        if act_ck:
            x = ck(self.ln_preattn, x)
-           attn_out, kv = ck(self.attn, x, kv=kv, cache=cache)
+           attn_out, kv = ck(self.attn, x, kv, cache)
+           #attn_out, kv = self.attn(x, kv=kv, cache=cache)
        else:
            x = self.ln_preattn(x)
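Context for the change above: torch 1.10 (the version these scripts install) only forwards positional arguments through torch.utils.checkpoint.checkpoint; passing kv=kv, cache=cache raises "Unexpected keyword arguments", which is presumably why the call was rewritten positionally. A minimal sketch of the pattern, with an illustrative stand-in module:

import torch
from torch.utils.checkpoint import checkpoint as ck

class ToyAttn(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, x, kv=None, cache=False):
        # stand-in for the real attention: returns output plus the kv cache
        return self.proj(x), kv

attn = ToyAttn()
x = torch.randn(2, 16, requires_grad=True)
#out, kv = ck(attn, x, kv=None, cache=False)  # fails on torch 1.10: unexpected keyword arguments
out, kv = ck(attn, x, None, False)            # positional form works everywhere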
@@ -121,5 +121,9 @@ class BasedOptimizer:
            metadata = pickle.load(f)

        based_optimizer = cls(parameters, metadata, metadata["optimizer_name"])
-       based_optimizer.optimizer.load_state_dict(torch.load(path / "opt_states.pt"))
+       try:
+           based_optimizer.optimizer.load_state_dict(torch.load(path / "opt_states.pt"))
+       except:
+           print("Couldn't load the optimizer, initializing the optimizer states. Honk!!!")
+           pass

        return based_optimizer
\ No newline at end of file
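The change above wraps optimizer-state loading in a try/except, so a missing or incompatible opt_states.pt falls back to freshly initialized optimizer states instead of killing the run. The same fallback in isolation might look like this (a sketch; BasedOptimizer's internals aren't shown here, so a plain AdamW stands in):

import torch
from pathlib import Path

def load_opt_states(optimizer, path):
    # Best-effort restore: keep fresh states if the file is absent or incompatible.
    try:
        optimizer.load_state_dict(torch.load(Path(path) / "opt_states.pt"))
    except Exception as exc:  # narrower than a bare except: Ctrl-C still propagates
        print(f"Couldn't load the optimizer ({exc}); initializing the optimizer states.")

opt = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=2e-5)
load_opt_states(opt, "some/checkpoint/dir")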
@@ -156,13 +156,13 @@ def func_multinomial(x):
    return torch.multinomial(x, 1)

@torch.no_grad()
-def generate_greedy(forward, prompt_tokens, tokens_to_generate=50):
+def generate_greedy(forward, prompt_tokens, tokens_to_generate=50, hypernetwork=None):
    in_tokens = prompt_tokens
    padding_token = 50256
    generated = torch.zeros(prompt_tokens.shape[0], 0, dtype=torch.long).to(in_tokens.device)
    kv = None
    for i in range(tokens_to_generate):
-       logits, kv = forward(in_tokens, cache=True, kv=kv)
+       logits, kv = forward(in_tokens, cache=True, kv=kv, hypernetwork=hypernetwork)
        logits = logits[:, -1, :]  # get the last token in the seq
        # get the token before the padding_token in the seq
        logits = logits.argmax(dim=-1).unsqueeze(-1)
@@ -173,7 +173,7 @@ def generate_greedy(forward, prompt_tokens, tokens_to_generate=50):
    return generated

@torch.no_grad()
-def generate(forward, prompt_tokens, tokens_to_generate=50, ops_list=[{"temp": 0.9}]):
+def generate(forward, prompt_tokens, tokens_to_generate=50, ops_list=[{"temp": 0.9}], hypernetwork=None):
    in_tokens = prompt_tokens
    context = prompt_tokens
    generated = torch.zeros(prompt_tokens.shape[0], 0, dtype=torch.long).to(in_tokens.device)
@@ -190,7 +190,7 @@ def generate(forward, prompt_tokens, tokens_to_generate=50, ops_list=[{"temp": 0.9}]):
    }

    for _ in range(tokens_to_generate):
-       logits, kv = forward(in_tokens, cache=True, kv=kv)
+       logits, kv = forward(in_tokens, cache=True, kv=kv, hypernetwork=hypernetwork)
        logits = logits[:, -1, :]  # get the last token in the seq
        logits = torch.log_softmax(logits, dim=-1)
        # can save one softmax here by not applying softmax for the first op,
@@ -222,7 +222,7 @@ def generate(forward, prompt_tokens, tokens_to_generate=50, ops_list=[{"temp": 0.9}]):
            logits = torch.cat(logit_list, dim=0)
        else:
-           torch.manual_seed(69)
+           #torch.manual_seed(69)
            logits = torch.multinomial(logits, 1)

        generated = torch.cat([generated, logits], dim=-1)
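With the new hypernetwork parameter, both samplers just forward it on every call, so any callable with the right signature works. A quick smoke test, assuming the sampler only needs logits and a passthrough kv from the forward (the toy forward below is illustrative, not the model's real interface):

import torch
from basedformer import sampling

VOCAB = 50400

def toy_forward(in_tokens, cache=False, kv=None, hypernetwork=None):
    # fake logits; a real forward would apply the hypernetwork to hidden states
    return torch.randn(in_tokens.shape[0], in_tokens.shape[1], VOCAB), kv

prompt = torch.randint(0, VOCAB, (1, 8))
out = sampling.generate_greedy(toy_forward, prompt, tokens_to_generate=4, hypernetwork=None)
print(out.shape)  # expected: torch.Size([1, 4])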
@@ -54,6 +54,11 @@ def no_init(loading_code):
def count_parameters(model, only_trainable=False):
    return sum(p.numel() for p in model.parameters() if p.requires_grad or not only_trainable)

+def print_parameters(model, only_trainable=False):
+    params = sum(p.numel() for p in model.parameters() if p.requires_grad or not only_trainable)
+    params = params / 1e6
+    print(f"{params:.2f}M parameters")

SPLIT_WEIGHTS_NAME = "m.pt"

class SplitCheckpoint(MutableMapping):
    def __init__(self, name_or_path, device="cpu", subfolder=None):
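The new print_parameters is a thin wrapper over the same generator expression count_parameters uses, reporting in millions. Usage:

import torch.nn as nn
from basedformer import utils

mlp = nn.Linear(4096, 4096)
utils.print_parameters(mlp)                       # 16.78M parameters
utils.print_parameters(mlp, only_trainable=True)  # same here, since nothing is frozen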
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.amp as amp
import torch.optim as optim
from pathlib import Path
from torch.utils import data
from basedformer import optimizer, utils, lm_utils
import yaml
import sys
from tqdm import tqdm
import time
import wandb
import numpy as np
import os
from icecream import ic
# we need 250 batch size to train the small GPT.
train_config = {
    "data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map",
    #"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
    #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/gptj-finetune",
    "do_save": False,
    "run_name": "gptj-finetune",
    "lr": 2e-5,
    "end_lr": 6e-5,
    "warmup_steps": 100,
    "anneal_steps": 10000,
    "bs": 1,
    "gas": 4,
    "seed": 69,
    "save_every": 500,
    "amp": True,
    "loss_scale": True,
}
torch.manual_seed(train_config["seed"])
bs = train_config["bs"]
gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)
#model = GPTModel.gpt2_init(model_config).cuda().float()
model = lm_utils.load_from_path("pretrained/gptj-6b").cuda().float()
model.train()
ic("model loaded")
cp_list = sorted(os.listdir(train_config["save_path"]), key=lambda x: int(x.split("_")[-1]))
last_cp = Path(train_config["save_path"]) / cp_list[-1] if len(cp_list) > 0 else None
print(last_cp)
trainable_params = [
    model.vocab_embed,
    model.lm_head,
    model.layers[10],
]
for param in model.parameters():
    param.requires_grad = False

for name, p in model.named_parameters():
    if ("ln" in name or "vocab_embed" in name):
        p.requires_grad = True

for module in trainable_params:
    for p in module.parameters():
        p.requires_grad = True  # was `module.requires_grad = True`, which sets a dead attribute on the module instead of unfreezing its params
'''
if last_cp:
    print("Loading from step {}".format(cp_list[-1].split("_")[-1]))
    model.load(model_config, last_cp / "lm", strict=True)
    opt = optimizer.BasedOptimizer.load(model.parameters(), last_cp / "opt")
else:
    opt = optimizer.BasedOptimizer(model.parameters(), train_config, "adamw")
'''
opt = optimizer.BasedOptimizer(model.layers[10].parameters(), train_config, "adamw")
# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function.
print(opt.curr_step)
train_dataset = utils.FbDataset(2049, train_config["data_path"])
if last_cp:
    train_dataset.skip = opt.curr_step * bs * gas
train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0, )
wandb.init(project="basedformer-tests", name=train_config["run_name"], config={**train_config, **model.config})
if last_cp:
    curr_step = opt.curr_step
else:
    curr_step = 0
t = tqdm(train_loader, initial=curr_step)
scaler = torch.cuda.amp.GradScaler()
for input_ids, labels in t:
    timex = time.perf_counter()
    input_ids = input_ids.cuda()
    labels = labels.cuda()
    loss = 0
    for x in range(train_config["gas"]):
        with torch.cuda.amp.autocast(enabled=train_config["amp"], dtype=torch.float16):
            logits = model(input_ids[x*bs:(x+1)*bs, :2048].cuda(), act_ck=True)
            #print(tokenizer.decode(input_ids[x*bs:(x+1)*bs, :][0]))
            #roll down the sequence
            logits = logits.view(-1, logits.shape[-1])
            gas_labels = labels[x*bs:(x+1)*bs, :2048].contiguous()
            gas_labels = gas_labels.view(-1)
            gas_loss = F.cross_entropy(logits, gas_labels)

        if train_config["loss_scale"]:
            scaler.scale(gas_loss).backward()
        else:
            gas_loss.backward()

        loss += gas_loss.item()

    loss = loss / gas

    if train_config["loss_scale"]:
        scaler.unscale_(opt.optimizer)

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

    if train_config["loss_scale"]:
        opt.step(scaler=scaler)
    else:
        opt.step()

    if train_config["loss_scale"]:
        scaler.update()

    opt.zero_grad()

    sec_per_step = (time.perf_counter() - timex)
    step_per_sec = (1. / sec_per_step)
    tokens_per_sec = (step_per_sec * 2048) * bs * gas
    t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step, {tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
    wandb.log(
        {
            "train/loss": loss,
            "train/tokens_per_sec": tokens_per_sec,
            "train/sec_per_step": sec_per_step,
            "train/step_per_sec": step_per_sec,
            "train/lr": opt.curr_lr,
            "train/loss_scale": scaler.get_scale()
        },
        step=curr_step)

    if train_config["do_save"]:
        if curr_step % train_config["save_every"] == 0:
            save_folder = Path(train_config["save_path"]) / f"step_{curr_step}"
            save_folder.mkdir(parents=True, exist_ok=True)
            model.save(save_folder / "lm")
            opt.save(save_folder / "opt")
            print(f"Saved model at step {curr_step}")

    curr_step += 1
\ No newline at end of file
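The finetuning script above interleaves three things per optimizer step: fp16 autocast around the forward, GradScaler-scaled backward passes accumulated over gas micro-batches, and a single unscale/clip/step/update at the end. Stripped of the logging and checkpointing, the skeleton is (model and data are placeholders; assumes a CUDA device):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(128, 128).cuda()
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
scaler = torch.cuda.amp.GradScaler()
gas = 4  # micro-batches accumulated per optimizer step

for step in range(10):
    for _ in range(gas):
        x = torch.randn(8, 128).cuda()
        with torch.cuda.amp.autocast(dtype=torch.float16):
            loss = F.mse_loss(model(x), x)
        scaler.scale(loss).backward()      # grads accumulate across micro-batches
    scaler.unscale_(opt)                   # unscale once so clipping sees true grads
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(opt)                       # skips the step if grads overflowed
    scaler.update()
    opt.zero_grad()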
@@ -12,9 +12,10 @@ import wandb
import numpy as np
from torch.utils.checkpoint import checkpoint as ck
from math import log2, ceil
-from basedformer import gptj, optimizer
+from basedformer import gptj, optimizer, lm_utils
from basedformer.utils import *
import glob
+from icecream import ic

def _init_weights(module):
    if isinstance(module, nn.Linear):
@@ -146,27 +147,20 @@ class HyperNetworkSingle(nn.Module):
        x = x.mul(torch.sigmoid(x))
        return x.bfloat16()

-model_config = {
-    "n_layer": 28,
-    "n_head": 16,
-    "hidden_dim": 4096,
-    "vocab_dim": 50400,
-    "eps": 1e-5,
-}

# we need 250 batch size to train the small GPT.
train_config = {
-    "data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map",
+    #"data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map",
+    "data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v7_infilling.map",
    #"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
    #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
-    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs16-save",
+    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-infilling",
    "do_save": True,
-    "run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layersavetest",
+    "run_name": "gpt-j-6b-2e-4-infilling",
    "lr": 2e-4,
    "end_lr": 2e-4,
    "warmup_steps": 50,
-    "bs": 1,
-    "gas": 4,
+    "bs": 4,
+    "gas": 1,
    "seed": 69,
    "save_every": 300,
    "amp": False,
@@ -179,7 +173,7 @@ gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)

#model = GPTModel.gpt2_init(model_config).cuda().float()
-model = gptj.load_gpt_j().lm.cuda().bfloat16()
+model = lm_utils.load_from_path("pretrained/gptj-6b").cuda().bfloat16()

for param in model.parameters():
    param.requires_grad = False
@@ -187,7 +181,7 @@ for name, p in model.named_parameters():
    if ("ln" in name or "vocab_embed" in name):
        p.requires_grad = True

-hypernetwork = HyperNetworkSingle(model_config).cuda().float()
+hypernetwork = HyperNetworkSingle(model.config).cuda().float()
#hypernetwork = nn.ModuleList([HyperNetworkSingle(model_config).cuda().float() for _ in range(model_config["n_layer"] // 5)])
#hypernetwork = nn.ModuleList([HyperNetworkSingle(model_config).cuda().float() for _ in range(2)])

for param in hypernetwork.parameters():
@@ -212,7 +206,7 @@ if last_cp:
    train_dataset.skip = opt.curr_step * bs * gas

train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0, )

-wandb.init(project="hypernetwork-tests", name=train_config["run_name"], config={**train_config, **model_config})
+wandb.init(project="hypernetwork-tests", name=train_config["run_name"], config={**train_config, **model.config})

if last_cp:
    curr_step = opt.curr_step
@@ -279,7 +273,5 @@ for input_ids, labels in t:
            torch.save(hypernetwork.state_dict(), save_folder / "hyper.pt")
            opt.save(save_folder / "opt")
            print(f"Saved model at step {curr_step}")
-           sys.exit(0)

    curr_step += 1
\ No newline at end of file
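Only the tail of HyperNetworkSingle is visible in these hunks (a SiLU via x.mul(torch.sigmoid(x)) and a cast back to bfloat16), plus the fact that it is built from the model config and kept in fp32 while the LM runs in bf16. A hedged sketch of what such an adapter could look like; the layer widths and wiring are guesses, not the repo's actual definition:

import torch
import torch.nn as nn

class HyperNetworkSketch(nn.Module):
    # Illustrative stand-in: a small fp32 MLP applied to bf16 hidden states.
    def __init__(self, config):
        super().__init__()
        hidden = config["hidden_dim"]
        self.fc1 = nn.Linear(hidden, hidden // 4)
        self.fc2 = nn.Linear(hidden // 4, hidden)

    def forward(self, x):
        x = x.float()                # run the adapter in fp32
        x = self.fc2(self.fc1(x))
        x = x.mul(torch.sigmoid(x))  # SiLU, as in the visible hunk
        return x.bfloat16()          # hand back the LM's dtype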
@@ -51,9 +51,9 @@ path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
#env1.sh('pip3 install git+https://github.com/pytorch/fairseq')
env1.sh('pip3 install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')

-with always_rerun():
-    env1.sh('pip uninstall transformers')
-    env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
+#with always_rerun():
+    #env1.sh('pip uninstall transformers')
+    #env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')

with always_rerun():
@@ -7,14 +7,14 @@ import argparse
# run command: default
# kill

-name = 'pyfra-basedformer'
+name = 'basedformer'
dry = False
bash = False

config_obj = KubeConfig()
config_obj.set_name(name)
-config_obj.set_gpu(gpu_name=GPU.RTX_A6000, amount=1)
-config_obj.set_ram(64)
+config_obj.set_gpu(gpu_name=GPU.A100_NVLINK, amount=1)
+config_obj.set_ram(24)
config_obj.set_cpu(4)
config_obj.dry_run(dry)
config_obj.print_information()
@@ -27,7 +27,7 @@ env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')

-if False:
+if True:
    env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
    env1.sh('pip install einops numpy')
    env1.sh('pip install tqdm')
@@ -37,7 +37,7 @@ if False:
    env1.sh('pip3 install dotmap icecream')
    path.sh("pip3 install --editable .")

with always_rerun():
-    if True:
+    if False:
        #env1.sh('pip3 install transformers')
        #path.sh('pip3 install --editable ../lm-evaluation-harness/.')
        #env1.sh('pip3 install pytest')
from basedformer import lm_utils as lmu
from basedformer.models import hypernet
from basedformer import sampling
import os
from pathlib import Path
from basedformer.utils import *
from transformers import AutoTokenizer
from icecream import ic
import time
import sys
def main():
    #save_path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs4-2e-4-catchup"
    save_path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-infilling"
    cp_list = sorted(os.listdir(save_path), key=lambda x: int(x.split("_")[-1]))
    last_cp = Path(save_path) / cp_list[-1] if len(cp_list) > 0 else None
    print(last_cp)

    bsz = 1
    gen_len = 400
    #torch.manual_seed(69)
    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    mask = "████████"
    prompt = "You hated the elves enough that if you seen one of them in the forest you would just slice their throats."
    prompt = """'''Kurumuz''' is the founder of tech company [["""
    promptnomask = f"""The room was lit now by a dozen candles. The door had been locked, and the windows barred; but there were still some faint glimmers of moonlight on the floor outside. For a moment the figure stood motionless in its doorway to look about it with an air of keen and nervous expectancy. Then he came forward into the chamber and moved, where he remained standing for an instant upon his toes like one listening intently before starting to rummage among the books and papers. He selected a large volume from among them and turned back to the window,{mask} holding it between himself and the rest of the room until he could feel the warm breath of the night creeping through the curtains.{mask}"""
    prompt = f"""The room was lit now by a dozen candles. The door had been locked, and the windows barred; but there were still some faint glimmers of moonlight on the floor outside. For a moment the figure stood motionless in its doorway to look about it with an air of keen and nervous expectancy. Then he came forward into the chamber and moved{mask}, where he remained standing for an instant upon his toes like one listening intently before starting to rummage among the books and papers. He selected a large volume from among them and turned back to the window, holding it between himself and the rest of the room until he could feel the warm breath of the night creeping through the curtains.{mask}"""

    tokens = tokenizer.encode(promptnomask)
    print(tokens)
    print("Prompt:")
    for x in range(len(tokens)):
        print(tokenizer.decode([tokens[x]]), end=" | ")
    print("\n Generation:")

    tokens = torch.LongTensor(tokens).unsqueeze(0).cuda()
    tokens = [tokens] * bsz
    #tokens = torch.cat([tokens, tokens], dim=0)
    tokens = torch.cat(tokens, dim=0)

    t = time.perf_counter()
    model = lmu.load_from_path('pretrained/gptj-6b').cuda().bfloat16().eval()
    hypernetwork = hypernet.HyperNetworkSingle(model.config).cuda().float()
    print("Loading from step {}".format(cp_list[-1].split("_")[-1]))
    hypernetwork.load_state_dict(torch.load(last_cp / "hyper.pt"))
    ic(time.perf_counter() - t)

    rep_pen = {
        "penalty": 5,
    }
    ops = {
        "rep_pen": rep_pen,
        "tfs": 0.86,
        "temp": 0.8,
    }
    ops_list = [ops] * bsz
    #tokens_generated = sampling.generate(model.forward, tokens, gen_len, ops_list=ops_list, hypernetwork=hypernetwork)
    tokens_generated = sampling.generate_greedy(model.forward, tokens, gen_len, hypernetwork=hypernetwork)
    #tokens_generated_batched = generate_real_batched(model.forward, tokens, gen_len, ops=ops)
    print(tokens_generated.shape)
    tokens_generated[tokens_generated == 48585] = 35625
    ic(prompt)
    tokens_generated = tokenizer.batch_decode(tokens_generated.cpu().numpy())
    for gen in tokens_generated:
        print(str(gen.split("*****")[0]))
        print("++++++++++++")
        print(str(gen.split("*****")[1]))
        print("===========================================================")
    #ic(tokenizer.batch_decode(tokens_generated_batched.cpu().numpy()))
    #timeit(lambda: generate(model.forward, tokens, 30, ops_list=ops_list), n=30)
    #timeit(lambda: generate_real_batched(model.forward, tokens, 30, ops=ops), n=30)
if __name__ == "__main__":
    main()
\ No newline at end of file
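Both the trainers and this eval script find the newest checkpoint the same way: sort the step_<N> folder names by their numeric suffix and take the last one. Factored out, the idiom is:

import os
from pathlib import Path

def latest_checkpoint(save_path):
    # step_0, step_300, step_600, ... -> Path to the highest-numbered one, or None
    cp_list = sorted(os.listdir(save_path), key=lambda x: int(x.split("_")[-1]))
    return Path(save_path) / cp_list[-1] if cp_list else None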
from basedformer import models, utils
import torch
config = {
    "n_layer": 6,
    "n_head": 8,
    "hidden_dim": 4096,
}
#init param matched GPT
gpt = models.gptj.GPTJModel(config).cuda().float()
utils.print_parameters(gpt)
#init param matched LSTM
lstm = torch.nn.LSTM(batch_first=True, input_size=4096, hidden_size=4096, num_layers=12).cuda().float()
utils.print_parameters(lstm)
x = torch.randint(0, 50256, (1, 1)).long().cuda()
y = torch.rand(1, 1, 4096).cuda().float()
with torch.no_grad():
    print("GPT:")
    utils.timeit(func=lambda: gpt(x), r=10, n=10)
    print("LSTM:")
    utils.timeit(func=lambda: lstm(y), r=10, n=10)
\ No newline at end of file
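utils.timeit itself isn't part of this diff; judging by the call sites above it takes a thunk plus repeat/number counts, much like timeit.repeat. A plausible sketch (an assumption about the helper, not its actual source):

import time

def timeit(func, r=1, n=1):
    # r rounds of n calls each; report the best average per-call time
    best = float("inf")
    for _ in range(r):
        start = time.perf_counter()
        for _ in range(n):
            func()
        best = min(best, (time.perf_counter() - start) / n)
    print(f"{best * 1e3:.3f} ms per call (best of {r} rounds of {n} calls)")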