Commit a2b7dffb authored by novelailab

update

parent fd387a42
from .models import gptj
MODEL_MAP = {
"gptj": gptj.GPTJModel,
}
def get_model(model_name: str):
return MODEL_MAP[model_name]
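
The new `MODEL_MAP` dict is a minimal model registry: `get_model` resolves a string name to the corresponding model class. A usage sketch (the import path is an assumption for illustration; the file this registry lives in is not named in the commit):

```python
from basedformer.lm_utils import get_model  # import path assumed for this sketch

model_class = get_model("gptj")  # returns gptj.GPTJModel, ready to instantiate
```

An unregistered name raises a bare `KeyError`; the registry does no fallback or error handling of its own.
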
from basedformer import gptj
from basedformer.models import gptj
from basedformer.utils import *
from basedformer import lm_utils
from transformers import AutoTokenizer
...
@@ -12,7 +12,7 @@ import wandb
import numpy as np
from torch.utils.checkpoint import checkpoint as ck
from math import log2, ceil
from basedformer import gptj, optimizer, lm_utils
from basedformer import optimizer, lm_utils
from basedformer.utils import *
import glob
from transformers import AutoTokenizer
@@ -194,9 +194,9 @@ train_config = {
#"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v7_infilling.map",
##"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
#"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
"save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-again",
"save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-fairseq-6b-2048-enwik9-again",
"do_save": True,
"run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layer",
"run_name": "fairseq-6b-enwik9-6b-postln-bf16-2e-4-4bsz-every5layer",
"lr": 2e-4,
"end_lr": 2e-4,
"warmup_steps": 50,
@@ -215,7 +215,7 @@ gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)
#model = GPTModel.gpt2_init(model_config).cuda().float()
model = lm_utils.load_from_path("pretrained/gptj-6b").cuda().bfloat16()
model = lm_utils.load_from_path("pretrained/fairseq_6_7b").cuda().bfloat16()
for param in model.parameters():
param.requires_grad = False
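
The freeze loop keeps the base LM (GPT-J 6B before this commit, fairseq 6.7B after) entirely out of the optimizer, so only hypernetwork parameters train. A hypothetical sanity check, not part of the commit:

```python
# Hypothetical check: after the loop above, the base model should expose zero
# trainable parameters, so gradients flow only into the hypernetwork.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
assert trainable == 0, "base model must be fully frozen for hypernetwork training"
```
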
...
@@ -28,16 +28,20 @@ env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
if True:
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
#env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
env1.sh('pip3 install dotmap icecream')
path.sh("pip3 install --editable .")
path.sh("pip3 install transformers")
path.sh("pip3 install termcolor")
#path.sh("pip3 uninstall transformers")
#env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
#env1.sh('pip install git+https://github.com/facebookresearch/fairseq')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
with always_rerun():
if False:
#env1.sh('pip3 install transformers')
...
@@ -19,7 +19,7 @@ with torch.no_grad():
path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m"
based_model = lmu.load_from_path(path).cuda().half().eval()
print("Loaded based model")
x = torch.randint(0, 51200, (1, 300)).cuda().long()
x = torch.randint(0, 50256, (1, 2048)).cuda().long()
assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
hidden = hf_model.transformer.wte(x)
@@ -33,7 +33,7 @@ with torch.no_grad():
ic(hf_model.transformer.h[layer].attn(hidden)[0].abs().mean())
ic(based_model.layers[layer].attn(hidden)[0].abs().mean())
ic((hf_model.transformer.h[layer].attn(hidden)[0] - based_model.layers[layer].attn(hidden)[0]).abs().mean())
assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0], rtol=1e-6)
assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0])
attn_out = hf_model.transformer.h[layer].attn(hidden)[0]
hidden = residual + attn_out
residual = hidden
@@ -50,4 +50,7 @@ with torch.no_grad():
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x)[0])
ic((hf_model(x)["logits"] - based_model(x)).mean())
print((hf_model(x)["logits"] - based_model(x)).abs().mean())
print((hf_model.transformer(x)["last_hidden_state"] - based_model.get_embeds(x)[0]).abs().mean())
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
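
The commit drops the explicit `rtol=1e-6` on the attention comparison, falling back to `torch.allclose` defaults (`rtol=1e-5`, `atol=1e-8`), and prints mean absolute errors before the final logits assert. Since this script runs the models in half precision, even the defaults can be tight; a small helper that reports the observed error before asserting (a sketch, not part of the commit) makes such parity checks easier to debug:

```python
import torch

# Hypothetical helper: print the observed error, then assert with tolerances
# chosen for fp16 activations rather than torch.allclose's tight defaults.
def check_close(a: torch.Tensor, b: torch.Tensor, name: str,
                rtol: float = 1e-3, atol: float = 1e-3) -> None:
    print(f"{name}: max abs err = {(a - b).abs().max().item():.3e}")
    assert torch.allclose(a, b, rtol=rtol, atol=atol), f"{name} diverged"
```

Usage would mirror the asserts above, e.g. `check_close(hf_model(x)["logits"], based_model(x), "logits")`.
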
@@ -7,7 +7,7 @@ from pathlib import Path
copy_eot_to_newline = True
copy_newline_to_eot = True
model_dir = 'pretrained/en_dense_lm_125m' # path to smol model weights to fix tokenizer shuffle
model_dir = 'pretrained/en_dense_lm_6_7b' # path to smol model weights to fix tokenizer shuffle
checkpoint = {}
ckmap = {}
@@ -40,7 +40,7 @@ def no_init(loading_code):
return result
lm = no_init(lambda: TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2').eval().cpu())
lm = no_init(lambda: TransformerLanguageModel.from_pretrained("pretrained/en_dense_lm_125m", bpe='gpt2').eval().cpu())
fairdict = torch.load(f"{sys.argv[1]}", map_location="cpu")
try:
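
The hunk context shows the conversion script wrapping `from_pretrained` in `no_init(loading_code)`: weight initialization is skipped while the model is built, since the fairseq checkpoint overwrites every tensor anyway. The repo's implementation is not shown in this diff; a common way to write the pattern looks like:

```python
import torch

# A sketch of the no_init pattern (the repo's version may differ): temporarily
# stub out reset_parameters on the usual layer types so model construction
# allocates weights without spending time initializing them.
def no_init(loading_code):
    def dummy(self):
        return
    modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
    saved = {mod: mod.reset_parameters for mod in modules}
    for mod in modules:
        mod.reset_parameters = dummy
    try:
        return loading_code()
    finally:
        for mod, fn in saved.items():
            mod.reset_parameters = fn
```
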
...
from main import *
state_dict = SplitCheckpoint("j6b_vanilla", device="cpu")
from basedformer import utils
from pathlib import Path
import torch
state_dict = utils.SplitCheckpoint("/home/xuser/diffusionstorage/models/prodbigmodels/sigurd-v4", device="cpu")
# ORIGINAL
'''
@@ -54,12 +54,12 @@ for key in state_dict.keys():
#print(new_state_dict)
def save(state_dict, path):
try: os.mkdir(path)
except: pass
path = Path(path)
path.mkdir(parents=True, exist_ok=True)
checkpoint = {}
for i, x in enumerate(state_dict.items()):
checkpoint[x[0]] = f"{path}/b{i}.pt"
torch.save(x[1], f"{path}/b{i}.pt")
torch.save(checkpoint, f"{path}/m.pt")
save(new_state_dict, "models/6b_vanilla")
\ No newline at end of file
save(new_state_dict, "pretrained/sigurdv4/lm")
\ No newline at end of file
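
`save` writes each tensor to its own `b{i}.pt` shard plus an `m.pt` index mapping state-dict keys to shard paths. The matching read side would look like the sketch below (hypothetical; the real loader is the `SplitCheckpoint` class in `basedformer.utils`, whose internals are not shown in this commit):

```python
import torch
from pathlib import Path

# Hypothetical loader for the save() layout above: read the m.pt index,
# then pull each tensor from its own shard file.
def load_split(path: str) -> dict:
    index = torch.load(Path(path) / "m.pt", map_location="cpu")
    return {key: torch.load(shard, map_location="cpu") for key, shard in index.items()}
```
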
@@ -25,7 +25,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained('gpt2')
mask = "████████"
prompt = "You hated the elves enough that if you seen one of them in the forest you would just slice their throats."
prompt = """'''Kurumuz''' is the founder of tech company [["""
#prompt = """'''Kurumuz''' is the founder of tech company [["""
#promptnomask = f"""The room was lit now by a dozen candles. The door had been locked, and the windows barred; but there were still some faint glimmers of moonlight on the floor outside. For a moment the figure stood motionless in its doorway to look about it with an air of keen and nervous expectancy. Then he came forward into the chamber and moved, where he remained standing for an instant upon his toes like one listening intently before starting to rummage among the books and papers. He selected a large volume from among them and turned back to the window,{mask} holding it between himself and the rest of the room until he could feel the warm breath of the night creeping through the curtains.{mask}"""
#prompt = f"""The room was lit now by a dozen candles. The door had been locked, and the windows barred; but there were still some faint glimmers of moonlight on the floor outside. For a moment the figure stood motionless in its doorway to look about it with an air of keen and nervous expectancy. Then he came forward into the chamber and moved{mask}, where he remained standing for an instant upon his toes like one listening intently before starting to rummage among the books and papers. He selected a large volume from among them and turned back to the window, holding it between himself and the rest of the room until he could feel the warm breath of the night creeping through the curtains.{mask}"""
tokens = tokenizer.encode(prompt)
@@ -39,10 +39,10 @@ def main():
#tokens = torch.cat([tokens, tokens], dim=0)
tokens = torch.cat(tokens, dim=0)
t = time.perf_counter()
model = lmu.load_from_path('pretrained/gptj-6b').cuda().bfloat16().eval()
hypernetwork = hypernet.HyperNetworkSingle(model.config).cuda().float()
model = lmu.load_from_path('pretrained/fairseq_125m').cuda().bfloat16().eval()
#hypernetwork = hypernet.HyperNetworkSingle(model.config).cuda().float()
#print("Loading from step {}".format(cp_list[-1].split("_")[-1]))
hypernetwork.load_state_dict(torch.load(last_cp / "hyper.pt"))
#hypernetwork.load_state_dict(torch.load(last_cp / "hyper.pt"))
ic(time.perf_counter() - t)
@@ -57,7 +57,7 @@ def main():
}
ops_list = [ops] * bsz
torch.manual_seed(69)
tokens_generated = sampling.generate(model.forward, tokens, gen_len, ops_list=ops_list, hypernetwork=hypernetwork, non_deterministic=False)
tokens_generated = sampling.generate(model.forward, tokens, gen_len, ops_list=ops_list, hypernetwork=None, non_deterministic=True)
#tokens_generated = sampling.generate_greedy(model.forward, tokens, gen_len, hypernetwork=hypernetwork)
#tokens_generated_batched = generate_real_batched(model.forward, tokens, gen_len, ops=ops)
#print(tokens_generated.shape)
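
`sampling.generate` takes one options dict per batch row (`ops_list = [ops] * bsz`), and the commit switches decoding from hypernetwork-conditioned deterministic sampling to plain non-deterministic sampling with `hypernetwork=None`. An illustration of the per-row contract (the key names are assumptions, since the real contents of `ops` are elided in this hunk):

```python
# Hypothetical illustration: key names in ops are assumed, since the real dict
# is elided above. Each batch row gets its own sampling-options dict.
ops = {"temp": 0.8, "top_p": 0.9}
bsz = 2
ops_list = [dict(ops) for _ in range(bsz)]
assert len(ops_list) == bsz  # one ops dict per sequence in the batch
```
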
...