Commit bc280afb authored by novelailab's avatar novelailab

things fixed

parent fb25b47c
......@@ -80,7 +80,9 @@ with torch.no_grad():
hidden = hf_model.transformer.h[layer].mlp(hidden)
assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden))
hidden = hf_model.transformer.h[layer].attn(hidden)[0]
assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden))
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.lm_head(hidden), based_model.lm_head(hidden))
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x))
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
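
The block above walks the two implementations layer by layer and asserts agreement with torch.allclose before comparing end-to-end logits. Below is a minimal sketch of the same parity-checking pattern, not from this commit: module names are hypothetical stand-ins, and the tolerance is made explicit (torch.allclose defaults to rtol=1e-5, atol=1e-8, which in practice only holds when both models run in fp32 on the same device).

# Minimal parity-check sketch (not from this commit); `reference_blocks` and
# `candidate_blocks` are hypothetical stand-ins for the two implementations.
import torch

def assert_close(a, b, rtol=1e-5, atol=1e-6):
    assert torch.allclose(a, b, rtol=rtol, atol=atol), \
        f"max abs diff {(a - b).abs().max().item():.3e}"

@torch.no_grad()
def check_parity(reference_blocks, candidate_blocks, hidden):
    for ref, cand in zip(reference_blocks, candidate_blocks):
        assert_close(ref(hidden), cand(hidden))
        hidden = ref(hidden)  # feed the reference output forward to localize any divergence
    return hidden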
......@@ -40,6 +40,9 @@ class BasedOptimizer:
if optimizer == "adamw":
self.optimizer = optim.AdamW(parameters, lr=0, weight_decay=self.weight_decay, betas=(self.beta1, self.beta2), eps=self.eps)
elif optimizer == "adamw8bit":
import bitsandbytes as bnb
self.optimizer = bnb.optim.Adam8bit(parameters, lr=0, weight_decay=self.weight_decay, betas=(self.beta1, self.beta2), eps=self.eps)
def step(self, scaler=None):
if scaler:
......
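
The new "adamw8bit" branch hands the parameters to bitsandbytes' 8-bit Adam, which stores the exp_avg / exp_avg_sq state block-wise in 8 bits, cutting optimizer memory to roughly a quarter of fp32 Adam state. A minimal usage sketch, assuming bitsandbytes is installed and a CUDA device is available; the toy model and hyperparameters here are placeholders.

# Minimal sketch: 8-bit Adam from bitsandbytes (assumes `pip install bitsandbytes`).
import torch
import torch.nn as nn
import bitsandbytes as bnb

model = nn.Linear(1024, 1024).cuda()
opt = bnb.optim.Adam8bit(model.parameters(), lr=1e-4, betas=(0.9, 0.95),
                         eps=1e-8, weight_decay=0.01)

x = torch.randn(8, 1024, device="cuda")
loss = model(x).pow(2).mean()
loss.backward()
opt.step()        # optimizer state is kept in 8 bits; weights and gradients stay as-is
opt.zero_grad()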
......@@ -10,7 +10,6 @@ except ImportError:
import os
from pathlib import Path
import math
from lm_arch.gptj import GPTJModel
def no_init(loading_code):
def dummy(self):
......@@ -238,7 +237,9 @@ class GPTLayer(nn.Module):
attn_out = self.attn(x)
ff_out = self.ff(x, act_ck)
x = residual + attn_out + ff_out
# Order of addition matters (floating-point addition is not associative); fixed a parity bug here.
x = attn_out + ff_out + residual
# x = residual + attn_out + ff_out -> doesn't match the reference implementation.
if hypernetwork:
hyper_out = hypernetwork(x)
x = x + hyper_out
......
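
The reordering above is not cosmetic: floating-point addition is not associative, so attn_out + ff_out + residual and residual + attn_out + ff_out can differ in the last bits, and those per-layer differences compound when checking parity against another implementation (as in the allclose script above). An illustrative snippet:

# Illustrative only: float addition is not associative, so the summation order
# of the residual branches changes the result at the bit level.
import torch

torch.manual_seed(0)
residual = torch.randn(4, 256)          # large-ish residual stream
attn_out = torch.randn(4, 256) * 1e-3   # small branch outputs
ff_out   = torch.randn(4, 256) * 1e-3

a = (residual + attn_out) + ff_out
b = (attn_out + ff_out) + residual

print(torch.equal(a, b))          # usually False
print((a - b).abs().max())        # tiny but nonzero, and it compounds across layers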
......@@ -14,6 +14,34 @@ import time
import wandb
from lm_arch.gpt2 import GPT2Model
import numpy as np
from transformers import AutoTokenizer
class HyperNetwork(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config["hidden_dim"]
self.linear = nn.Linear(embed_dim, embed_dim, bias=True)
self.linear.weight.data.normal_(mean=0.0, std=0.02)
for param in self.linear.parameters():
param.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * config["n_layer"])))
def forward(self, hidden_states):
hidden_states = self.linear(hidden_states)
hidden_states = hidden_states.mul(torch.sigmoid(hidden_states))  # SiLU / swish: x * sigmoid(x)
return hidden_states
model_config = {
"n_layer": 28,
"n_head": 16,
"hidden_dim": 4096,
"vocab_dim": 50400,
"eps": 1e-5,
"activation": gelu_new,
"Layer": GPTLayer
}
model_config = {  # overrides the GPT-J-sized config above with the ~125M config used for this run
"n_layer": 12,
......@@ -28,16 +56,18 @@ model_config = {
# we need 250 batch size to train the small GPT.
train_config = {
"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
"save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/owt2fp16amp2",
"run_name": "owt2-125m-fp16AMP-1024ctx-120bs-1e-4lr",
#"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
"save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/fixedj",
"run_name": "gpt-j-8bitopt-owt2-125m-fp16AMP-fixedj",
"lr": 1e-4,
"end_lr": 1e-4,
"warmup_steps": 100,
"end_lr": 1e-4 * 2,
"warmup_steps": 50,
"bs": 12,
"gas": 10,
"seed": 69,
"save_every": 500,
"amp": True,
"loss_scale": True,
}
torch.manual_seed(train_config["seed"])
bs = train_config["bs"]
......@@ -46,6 +76,17 @@ gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)
model = GPTModel.gpt2_init(model_config).cuda().float()
#model = load_gpt_j().cuda().half()
#for param in model.parameters():
# param.requires_grad = False
#for name, p in model.named_parameters():
# if ("ln" in name or "vocab_embed" in name):
# p.requires_grad = True
#hypernetwork = HyperNetwork(model_config).cuda().float()
#for param in hypernetwork.parameters():
# param.requires_grad = True
opt = optimizer.BasedOptimizer(model.parameters(), train_config, "adamw")
# TODO: add loading, evals, FP16 AMP, and data parallel; also output hidden states from the get_logits function.
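
The commented-out block above sketches an alternative setup: freeze the base model (optionally leaving layer norms and the vocab embedding trainable) and train only a HyperNetwork that is added onto each layer's residual stream. A hedged sketch of how that would be wired up inside this script; it is not active in this commit and reuses the script's model, model_config, train_config and optimizer.

# Hypernetwork-only finetuning, as the commented-out lines above suggest (not active here).
for param in model.parameters():
    param.requires_grad = False          # freeze the base model
for name, p in model.named_parameters():
    if "ln" in name or "vocab_embed" in name:
        p.requires_grad = True           # optionally keep layer norms and the embedding trainable

hypernetwork = HyperNetwork(model_config).cuda().float()
for param in hypernetwork.parameters():
    param.requires_grad = True

opt = optimizer.BasedOptimizer(hypernetwork.parameters(), train_config, "adamw")
# In the training loop, pass the module instead of None so each GPTLayer adds
# its output on top of the residual stream (x = x + hypernetwork(x)):
#   logits = model(batch, hypernetwork=hypernetwork, act_ck=False)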
......@@ -65,24 +106,15 @@ for input_ids, labels in t:
labels = labels.cuda()
loss = 0
for x in range(train_config["gas"]):
if train_config["amp"]:
with torch.cuda.amp.autocast():
#with torch.jit.fuser("fuser2"):
# module = torch.jit.trace(model, torch.randint(0, 50256, (12, 1024)).long().cuda())
logits = model(input_ids[x*bs:(x+1)*bs, :1024].cuda(), hypernetwork=None, act_ck=False)
logits = logits.view(-1, logits.shape[-1])
gas_labels = labels[x*bs:(x+1)*bs, :1024].contiguous()
gas_labels = gas_labels.view(-1)
gas_loss = F.cross_entropy(logits, gas_labels)
else:
with torch.cuda.amp.autocast(enabled=train_config["amp"], dtype=torch.float16):
logits = model(input_ids[x*bs:(x+1)*bs, :1024].cuda(), hypernetwork=None, act_ck=False)
#print(tokenizer.decode(input_ids[x*bs:(x+1)*bs, :][0]))
logits = logits.view(-1, logits.shape[-1])
gas_labels = labels[x*bs:(x+1)*bs, :1024].contiguous()
gas_labels = gas_labels.view(-1)
gas_loss = F.cross_entropy(logits, gas_labels)
if train_config["amp"]:
if train_config["loss_scale"]:
scaler.scale(gas_loss).backward()
else:
gas_loss.backward()
......@@ -90,14 +122,14 @@ for input_ids, labels in t:
loss += gas_loss.item()
loss = loss / gas
if train_config["amp"]:
if train_config["loss_scale"]:
scaler.unscale_(opt.optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
if train_config["amp"]:
if train_config["loss_scale"]:
opt.step(scaler=scaler)
else:
opt.step()
if train_config["amp"]:
if train_config["loss_scale"]:
scaler.update()
#opt.step()
opt.zero_grad()
......
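
The loop above now separates two concerns that were previously both gated on "amp": autocast is always entered with enabled=train_config["amp"], while the GradScaler calls are gated on the new "loss_scale" flag. Below is a condensed sketch of the resulting pattern with gradient accumulation omitted; `loader` and the plain torch optimizer `opt` are stand-ins for the script's objects, and it leans on GradScaler(enabled=False) being a pass-through, whereas the commit branches explicitly on train_config["loss_scale"].

# Condensed sketch of the AMP + loss-scaling pattern the new loop follows (names simplified).
import torch
import torch.nn.functional as F

scaler = torch.cuda.amp.GradScaler(enabled=train_config["loss_scale"])

for input_ids, labels in loader:
    with torch.cuda.amp.autocast(enabled=train_config["amp"], dtype=torch.float16):
        logits = model(input_ids.cuda(), hypernetwork=None, act_ck=False)
        loss = F.cross_entropy(logits.view(-1, logits.shape[-1]),
                               labels.cuda().view(-1))

    # scale() / unscale_() / step() / update() are no-ops when enabled=False,
    # so one code path serves both plain fp32 and loss-scaled fp16 training.
    scaler.scale(loss).backward()
    scaler.unscale_(opt)                                  # unscale before clipping so the threshold is in real units
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    scaler.step(opt)
    scaler.update()
    opt.zero_grad()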