Commit 1ab8bac6 authored by novelailab

opt/model save/load works!

parent 2d0b32de
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -150,7 +150,7 @@ class FeedForward(nn.Module): ...@@ -150,7 +150,7 @@ class FeedForward(nn.Module):
def forward(self, x, act_ck=False): def forward(self, x, act_ck=False):
x = self.ff1(x) x = self.ff1(x)
if act_ck: if act_ck:
ck(self.activation, x) x = ck(self.activation, x)
else: else:
x = self.activation(x) x = self.activation(x)
x = self.ff2(x) x = self.ff2(x)
......
from curses import meta
from torch import optim from torch import optim
import numpy as np import numpy as np
import torch import torch
...@@ -71,15 +72,15 @@ class BasedOptimizer: ...@@ -71,15 +72,15 @@ class BasedOptimizer:
def step(self, dry_run=False, scaler=None): def step(self, dry_run=False, scaler=None):
self.curr_lr = lr_schedule(self.curr_step, self.warmup_steps, self.anneal_steps, self.lr, self.end_lr)
if not dry_run: if not dry_run:
if scaler: if scaler:
scaler.step(self.optimizer) scaler.step(self.optimizer)
else: else:
self.optimizer.step() self.optimizer.step()
self.curr_step = self.curr_step + 1 self.curr_step = self.curr_step + 1
self.curr_lr = lr_schedule(self.curr_step, self.warmup_steps, self.anneal_steps, self.lr, self.end_lr)
if not self.max_lr: if not self.max_lr:
if self.curr_lr == self.end_lr: if self.curr_lr == self.end_lr:
...@@ -102,20 +103,20 @@ class BasedOptimizer: ...@@ -102,20 +103,20 @@ class BasedOptimizer:
print(f"curr_lr: {str(self.get_current_lr())}") print(f"curr_lr: {str(self.get_current_lr())}")
def save(self, path: Path): def save(self, path: Path):
path = path / "opt" path = Path(path)
path.mkdir(parents=True, exist_ok=True) path.mkdir(parents=True, exist_ok=True)
torch.save(self.optimizer.state_dict(), path / "opt_states.pt") torch.save(self.optimizer.state_dict(), path / "opt_states.pt")
#clean the optimizer and parameters from the dict.
del self.optimizer
del self.parameters
metadata = self.__dict__ metadata = self.__dict__.copy()
#clean the optimizer and parameters from the dict.
del metadata["optimizer"]
del metadata["parameters"]
with open(path / "opt_metadata.pkl", 'wb') as f: with open(path / "opt_metadata.pkl", 'wb') as f:
pickle.dump(metadata, f) pickle.dump(metadata, f)
@classmethod @classmethod
def load(cls, parameters, path): def load(cls, parameters, path):
path = path / "opt" path = Path(path)
with open(path / "opt_metadata.pkl", 'rb') as f: with open(path / "opt_metadata.pkl", 'rb') as f:
metadata = pickle.load(f) metadata = pickle.load(f)
......
...@@ -12,8 +12,9 @@ import wandb ...@@ -12,8 +12,9 @@ import wandb
import numpy as np import numpy as np
from torch.utils.checkpoint import checkpoint as ck from torch.utils.checkpoint import checkpoint as ck
from math import log2, ceil from math import log2, ceil
from basedformer import gptj, lm_base, optimizer from basedformer import gptj, optimizer
from basedformer.utils import * from basedformer.utils import *
import glob
def _init_weights(module): def _init_weights(module):
if isinstance(module, nn.Linear): if isinstance(module, nn.Linear):
...@@ -158,16 +159,16 @@ train_config = { ...@@ -158,16 +159,16 @@ train_config = {
"data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map", "data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map",
#"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map", #"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
#"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map", #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
"save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs16", "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs16-save",
"do_save": False, "do_save": True,
"run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layer", "run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layersavetest",
"lr": 2e-4, "lr": 2e-4,
"end_lr": 2e-4, "end_lr": 2e-4,
"warmup_steps": 50, "warmup_steps": 50,
"bs": 1, "bs": 1,
"gas": 4, "gas": 4,
"seed": 69, "seed": 69,
"save_every": 100, "save_every": 300,
"amp": False, "amp": False,
"loss_scale": False, "loss_scale": False,
} }
...@@ -178,7 +179,7 @@ gas = train_config["gas"] ...@@ -178,7 +179,7 @@ gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True) Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)
#model = GPTModel.gpt2_init(model_config).cuda().float() #model = GPTModel.gpt2_init(model_config).cuda().float()
model = lm_base.().cuda().bfloat16() model = gptj.load_gpt_j().lm.cuda().bfloat16()
for param in model.parameters(): for param in model.parameters():
param.requires_grad = False param.requires_grad = False
...@@ -192,16 +193,33 @@ hypernetwork = HyperNetworkSingle(model_config).cuda().float() ...@@ -192,16 +193,33 @@ hypernetwork = HyperNetworkSingle(model_config).cuda().float()
for param in hypernetwork.parameters(): for param in hypernetwork.parameters():
param.requires_grad = True param.requires_grad = True
opt = optimizer.BasedOptimizer(hypernetwork.parameters(), train_config, "adamw") cp_list = sorted(os.listdir(train_config["save_path"]), key=lambda x: int(x.split("_")[-1]))
last_cp = Path(train_config["save_path"]) / cp_list[-1] if len(cp_list) > 0 else None
print(last_cp)
# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function. if last_cp:
print("Loading from step {}".format(cp_list[-1].split("_")[-1]))
hypernetwork.load_state_dict(torch.load(last_cp / "hyper.pt"))
opt = optimizer.BasedOptimizer.load(hypernetwork.parameters(), last_cp / "opt")
else:
opt = optimizer.BasedOptimizer(hypernetwork.parameters(), train_config, "adamw")
# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function.
print(opt.curr_step)
train_dataset = FbDataset(2049, train_config["data_path"]) train_dataset = FbDataset(2049, train_config["data_path"])
train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0) if last_cp:
train_dataset.skip = opt.curr_step * bs * gas
train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0, )
wandb.init(project="hypernetwork-tests", name=train_config["run_name"], config={**train_config, **model_config}) wandb.init(project="hypernetwork-tests", name=train_config["run_name"], config={**train_config, **model_config})
t = tqdm(train_loader) if last_cp:
curr_step = 0 curr_step = opt.curr_step
else:
curr_step = 0
t = tqdm(train_loader, initial=curr_step)
scaler = torch.cuda.amp.GradScaler() scaler = torch.cuda.amp.GradScaler()
...@@ -234,18 +252,34 @@ for input_ids, labels in t: ...@@ -234,18 +252,34 @@ for input_ids, labels in t:
opt.step(scaler=scaler) opt.step(scaler=scaler)
else: else:
opt.step() opt.step()
if train_config["loss_scale"]: if train_config["loss_scale"]:
scaler.update() scaler.update()
#opt.step()
opt.zero_grad() opt.zero_grad()
sec_per_step = (time.perf_counter() - timex) sec_per_step = (time.perf_counter() - timex)
step_per_sec = (1. / sec_per_step) step_per_sec = (1. / sec_per_step)
tokens_per_sec = (step_per_sec * 2048) * bs * gas tokens_per_sec = (step_per_sec * 2048) * bs * gas
t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step, {tokens_per_sec:.2f}tokens/s, loss={loss:.4f}") t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step, {tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
wandb.log({"train/loss": loss, "train/tokens_per_sec": tokens_per_sec, "train/sec_per_step": sec_per_step, "train/step_per_sec": step_per_sec, "train/lr": opt.curr_lr, "train/loss_scale": scaler.get_scale()}) wandb.log(
curr_step += 1 {
"train/loss": loss,
"train/tokens_per_sec": tokens_per_sec,
"train/sec_per_step": sec_per_step,
"train/step_per_sec": step_per_sec,
"train/lr": opt.curr_lr,
"train/loss_scale": scaler.get_scale()
},
step=curr_step)
if train_config["do_save"]: if train_config["do_save"]:
if curr_step % train_config["save_every"] == 0 or curr_step == 1: if curr_step % train_config["save_every"] == 0 and curr_step != 0:
torch.save(hypernetwork.state_dict(), train_config["save_path"] + f"/{curr_step}.hyper") save_folder = Path(train_config["save_path"]) / f"step_{curr_step}"
#model.save(train_config["save_path"] + f"/{curr_step}") save_folder.mkdir(parents=True, exist_ok=True)
torch.save(hypernetwork.state_dict(), save_folder / "hyper.pt")
opt.save(save_folder / "opt")
print(f"Saved model at step {curr_step}") print(f"Saved model at step {curr_step}")
sys.exit(0)
curr_step += 1
\ No newline at end of file
...@@ -13,13 +13,13 @@ bash = False ...@@ -13,13 +13,13 @@ bash = False
config_obj = KubeConfig() config_obj = KubeConfig()
config_obj.set_name(name) config_obj.set_name(name)
config_obj.set_gpu(gpu_name=GPU.A40, amount=1) config_obj.set_gpu(gpu_name=GPU.A100_PCIE_40GB, amount=1)
config_obj.set_ram(16) config_obj.set_ram(16)
config_obj.set_cpu(4) config_obj.set_cpu(4)
config_obj.dry_run(dry) config_obj.dry_run(dry)
config_obj.print_information() config_obj.print_information()
#config_obj.create_deployment(overwrite=True) config_obj.create_deployment(overwrite=True)
#config_obj.create_service(overwrite=True) config_obj.create_service(overwrite=True)
remote = config_obj.get_pyfra_remote() remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None) env1 = remote.env('noname', python_version=None)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment