Commit 7eebf8ad authored by novelailab

add more stuff

parent 4a13de3a
from main import *
state_dict = SplitCheckpoint("/home/xuser/models/j6b_ckpt_14001", device="cpu")
# ORIGINAL
'''
transformer.ln_f.weight
transformer.ln_f.bias
lm_head.weight
lm_head.bias
transformer.h.9.ln_1.weight
transformer.h.9.ln_1.bias
transformer.h.9.mlp.c_proj.weight
transformer.h.9.mlp.c_proj.bias
transformer.h.9.mlp.c_fc.weight
transformer.h.9.mlp.c_fc.bias
transformer.h.9.attn.attention.out_proj.weight
transformer.h.9.attn.attention.k_proj.weight
transformer.h.9.attn.attention.v_proj.weight
transformer.h.9.attn.attention.q_proj.weight
transformer.wte.weight
'''
new_state_dict = {}
module_map = {
    "ln_1": "ln_preattn",
    "mlp.c_proj": "ff.ff2",
    "mlp.c_fc": "ff.ff1",
    "attn.attention.out_proj": "attn.out_proj",
    "attn.attention.k_proj": "attn.k_proj",
    "attn.attention.v_proj": "attn.v_proj",
    "attn.attention.q_proj": "attn.q_proj",
    "wte": "vocab_embed",
    "ln_f": "ln_final",
    "lm_head": "lm_head",
}
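# Illustrative renames produced by the loop below (derived directly from module_map):
#   transformer.wte.weight                        -> vocab_embed.weight
#   transformer.ln_f.bias                         -> ln_final.bias
#   transformer.h.9.mlp.c_fc.weight               -> layers.9.ff.ff1.weight
#   transformer.h.9.attn.attention.q_proj.weight  -> layers.9.attn.q_proj.weight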
print(type(state_dict))
for key in state_dict.keys():
    dotlist = key.split('.')
    if len(dotlist) > 3:
        layer = dotlist[2]
        for x in module_map:
            if x in key:
                new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
                print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
    else:
        for x in module_map:
            if x in key:
                new_state_dict[f"{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
                print(f"{key} -> {module_map[x]}.{dotlist[-1]}")
#print(new_state_dict)
def save(state_dict, path):
    # Write each tensor to its own file plus an index (m.pt) mapping parameter names
    # to the per-tensor files, mirroring the layout GPTModel.save produces.
    try:
        os.mkdir(path)
    except FileExistsError:
        pass
    checkpoint = {}
    for i, x in enumerate(state_dict.items()):
        checkpoint[x[0]] = f"{path}/b{i}.pt"
        torch.save(x[1], f"{path}/b{i}.pt")
    torch.save(checkpoint, f"{path}/m.pt")
save(new_state_dict, "models/6b")
\ No newline at end of file
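# Quick sanity check one could run after the conversion (a sketch; assumes SplitCheckpoint
# reads the m.pt / b{i}.pt layout that save() writes, the same layout GPTModel.load consumes):
#
#   reloaded = SplitCheckpoint("models/6b", device="cpu")
#   assert torch.equal(reloaded["vocab_embed.weight"], new_state_dict["vocab_embed.weight"])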
@@ -8,6 +8,14 @@ except ImportError:
from collections import MutableMapping
import os
from pathlib import Path
import math
def defaults():
    # Easily accessible defaults
    D_LAYER = GPTLayer
    D_ATTN = SelfAttention
    D_FF = FeedForward
    D_ACT = gelu_new

def no_init(loading_code):
    def dummy(self):
@@ -67,6 +75,9 @@ class SplitCheckpoint(MutableMapping):
#TODO: Might change with non einsum functions?
def gelu_new(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
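# Note: the function above is the tanh approximation of GELU; it is the same formula that
# HF's GPT-2/GPT-J code exposes as gelu_new / NewGELUActivation.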
def fixed_pos_embedding(dim=None, seq_len=None, x=None):
    if x is None:
        x = torch.empty(0)
@@ -84,19 +95,7 @@ def apply_rotary_pos_emb(x, sincos, offset=0):
    sin, cos = map(lambda t: repeat(t[offset:x.shape[1]+offset, :], "n d -> () n () (d j)", j=2), sincos)
    return (x * cos) + (rotate_every_two(x) * sin)
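# For reference, rotate_every_two() used above is defined outside this hunk; a sketch of the
# usual GPT-J-style helper it is assumed to match (pairs adjacent channels and rotates each
# pair by 90°):
#
#   def rotate_every_two(x):
#       x1 = x[:, :, :, ::2]
#       x2 = x[:, :, :, 1::2]
#       x = torch.stack((-x2, x1), dim=-1)
#       return rearrange(x, "... d j -> ... (d j)")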
class FeedForward(nn.Module):
    def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU):
        self.ff1 = nn.Linear(dim, hidden_dim)
        self.ff2 = nn.Linear(hidden_dim, dim)
        self.activation = activation()

    def forward(self, x):
        x = self.ff1(x)
        x = self.activation(x)
        x = self.ff2(x)
        return x
def _split_heads(self, tensor, num_heads, attn_head_size, rotary):
def _split_heads(tensor, num_heads, attn_head_size, rotary):
    """
    Splits hidden_size dim into attn_head_size and num_heads
    """
@@ -111,7 +110,7 @@ def _split_heads(self, tensor, num_heads, attn_head_size, rotary):
    else:
        raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
def _merge_heads(self, tensor, num_heads, attn_head_size):
def _merge_heads(tensor, num_heads, attn_head_size):
    """
    Merges attn_head_size dim and num_attn_heads dim into hidden_size
    """
@@ -143,7 +142,8 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
    def __init__(self, hidden_dim, n_head):
    def __init__(self, hidden_dim, n_head, device="cuda", dtype=torch.float16):
        super(SelfAttention, self).__init__()
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
            1, 1, max_positions, max_positions).bool()
@@ -154,10 +154,10 @@ class SelfAttention(nn.Module):
        self.register_buffer("bias", bias)
        self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False))
        attn_bias = False
        self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias)
        self.k_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.v_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.q_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=attn_bias, device=device, dtype=dtype)
        self.rotary_dim = self.head_dim
        # TODO: handle rotary
        sin, cos = fixed_pos_embedding(dim=self.rotary_dim, seq_len=max_positions)
@@ -175,8 +175,8 @@ class SelfAttention(nn.Module):
        offset = 0
        key = self.apply_rotary_pos_emb(key, (self.sin, self.cos), offset=offset).to(key.dtype)
        query = self.apply_rotary_pos_emb(query, (self.sin, self.cos), offset=offset).to(query.dtype)
        key = apply_rotary_pos_emb(key, (self.sin, self.cos), offset=offset).to(key.dtype)
        query = apply_rotary_pos_emb(query, (self.sin, self.cos), offset=offset).to(query.dtype)
        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)
@@ -185,21 +185,34 @@ class SelfAttention(nn.Module):
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
        x = _attn(
            query, key, value, causal_mask, self.masked_bias, self.scale_attn
            query, key, value, causal_mask, self.masked_bias, None, self.scale_attn
        )
        x = _merge_heads(x, self.num_heads, self.head_dim)
        x = _merge_heads(x, self.n_head, self.head_dim)
        x = self.out_proj(x)
        return x  # a, present, (attentions)
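# Illustrative shape walk-through (assumes head_dim = hidden_dim // n_head, which is how the
# hidden part of __init__ is normally set up): an input of (batch, seq, hidden_dim) is split
# into per-head tensors for attention, rotary position embeddings are applied to q/k
# (rotary_dim == head_dim here), and _merge_heads + out_proj map the result back to
# (batch, seq, hidden_dim).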
class FeedForward(nn.Module):
    def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU, device="cuda", dtype=torch.float16):
        super(FeedForward, self).__init__()
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
        self.activation = activation

    def forward(self, x):
        x = self.ff1(x)
        x = self.activation(x)
        x = self.ff2(x)
        return x
class GPTLayer(nn.Module):
    def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU):
    def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU, device="cuda", dtype=torch.float16):
        super(GPTLayer, self).__init__()
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps)
        #self.ln_postattn = nn.LayerNorm(hidden_dim, eps=eps)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation)
        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head)
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
        self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
        self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)

    def forward(self, x, hypernetwork):
        residual = x
@@ -209,20 +222,21 @@ class GPTLayer(nn.Module):
        attn_out = self.attn(x)
        ff_out = self.ff(x)
        x = residual + attn_out + ff_out + (hyper_out if hyper_out is not None else 0)
        x = residual + ff_out + attn_out + (hyper_out if hypernetwork is not None else 0)
        return x
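    # The residual update above is the GPT-J-style "parallel" block: attention and the
    # feed-forward read the same normalized input and are added to the residual in a single
    # step, rather than being applied sequentially.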
# Every module can be accessed and changed from here, since the Layer class and the ff/attn classes are all passed in from GPTModel.
class GPTModel(nn.Module):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTLayer):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTLayer, device="cuda", dtype=torch.float16):
        super(GPTModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps)
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
        self.layers = nn.ModuleList([])
        for _ in range(n_layer):
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation))
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))

    #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
    #TODO: Do we want to pass a config object everywhere?
    #TODO: Do we want to pass a config object everywhere? I don't exactly like that but passing a lot of variables is a bit ugly too.
    def forward(self, x, hypernetwork=None):
        x = self.vocab_embed(x)
@@ -232,9 +246,19 @@ class GPTModel(nn.Module):
        x = self.ln_final(x)
        return x
    def load(self, path):
        state_dict = SplitCheckpoint(path, device="cuda")
        self.load_state_dict(state_dict)

    @classmethod
    def load(cls, config, path=None, state_dict=None):
        if path:
            state_dict = SplitCheckpoint(path, device="cuda")
        model = no_init(lambda: cls(**config))
        model.load_state_dict(state_dict, strict=False)
        return model

    @classmethod
    def init(cls, config):
        model = no_init(lambda: cls(**config))
        return model
    def save(self, path):
        try: os.mkdir(path)
@@ -243,6 +267,7 @@ class GPTModel(nn.Module):
        for i, x in enumerate(self.state_dict().items()):
            checkpoint[x[0]] = f"{path}/b{i}.pt"
            torch.save(x[1], f"{path}/b{i}.pt")
        torch.save(checkpoint, f"{path}/m.pt")
# TODO: Do we want to have the LM head as a separate class? Or just a function? I think we might be better off with a function here, and maybe
# also for the self-attention we can just write a function that gets fed the q, k, v.
@@ -253,17 +278,57 @@ class GPTLM(nn.Module):
    def forward(self, x):
        return
def load_gpt_j(path):
def load_gpt_j(state_dict=None, path=None):
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel.load(config, path=path, state_dict=state_dict)
    return model
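# Usage sketch (assumes a checkpoint already converted to this module's key naming, e.g. the
# split checkpoint written to "models/6b" by the conversion script):
#
#   model = load_gpt_j(path="models/6b")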
def init_6b():
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": nn.GELU,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel(**config)
    return model
def init_125m():
    config = {
        "n_layer": 12,
        "n_head": 12,
        "hidden_dim": 768,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = GPTModel.init(config)
    return model
def init_1_3b():
    config = {
        "n_layer": 24,
        "n_head": 16,
        "hidden_dim": 2048,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTLayer
    }
    model = no_init(lambda: GPTModel(**config))
    model.load(path)
    return model
\ No newline at end of file
    model = GPTModel(**config)
    return model
from main import *
import time
state_dict = SplitCheckpoint("/home/xuser/models/j6b_ckpt_14001", device="cuda")
for x in state_dict:
    print(x)
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
# Replicates IPython's %timeit magic.
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
    precision = 'ns'
    r_arr = np.empty([2, r])  # [0] = mean, [1] = std
    if function:
        func.__name__ = function.__name__
    for i in tqdm(range(r)) if do_tqdm else range(r):
        n_arr = np.empty(n)
        for k in range(n):
            start = perf_counter_ns()
            func()
            n_arr[k] = perf_counter_ns() - start
        if not first:
            # drop the first (warm-up) measurement
            n_arr = np.delete(n_arr, 0)
        r_arr[0, i] = np.mean(n_arr)
        r_arr[1, i] = np.std(n_arr)
    best = r_arr[:, np.argmin(r_arr[0])]  # [0] = mean, [1] = std
    # pick a display unit based on the best mean time (measured in ns)
    if best[0] < 1e3:
        precision = 'ns'
    elif best[0] >= 1e9:
        best[0] = best[0] * 1e-9
        best[1] = best[1] * 1e-9
        precision = 's'
    elif best[0] >= 1e6:
        best[0] = best[0] * 1e-6
        best[1] = best[1] * 1e-6
        precision = 'ms'
    elif best[0] >= 1e3:
        precision = 'μs'
        best[0] = best[0] * 1e-3
        best[1] = best[1] * 1e-3
    if not quiet:
        if precision == 'ns':
            print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 'μs':
            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 'ms':
            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
        elif precision == 's':
            print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
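# Example usage (illustrative):
#   timeit(lambda: np.linalg.inv(np.random.rand(256, 256)), r=3, n=10, first=False)
# For CUDA work, synchronize inside the timed callable (e.g. call torch.cuda.synchronize()
# at the end of the lambda); otherwise mostly kernel-launch overhead is measured.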
with torch.no_grad():
    model = init_125m().cuda().half()
    '''
    timeit(lambda: model(torch.zeros((1, 2048)).long().cuda()), n=20, first=False)
    module = torch.jit.trace(model, torch.zeros((1, 2048)).long().cuda())
    torch.jit.optimize_for_inference(module)
    timeit(lambda: module(torch.zeros((1, 2048)).long().cuda()), n=20, first=False)

    timeit(lambda: model(torch.zeros((1, 1000)).long().cuda()), n=20, first=False)
    module = torch.jit.trace(model, torch.zeros((1, 1000)).long().cuda())
    torch.jit.optimize_for_inference(module)
    timeit(lambda: module(torch.zeros((1, 1000)).long().cuda()), n=20, first=False)
    '''

    module = torch.jit.trace(model, torch.zeros((1, 2048)).long().cuda())
    # optimize_for_inference returns a new module; keep the returned object.
    module = torch.jit.optimize_for_inference(module)

    static_input = torch.zeros((1, 2048), device='cuda').long()
    static_out = torch.randn((1, 2048, 2048), device='cuda').half()
    timeit(lambda: module(static_input), n=20, first=False)

    # warm up on a side stream before graph capture
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for i in range(3):
            output = module(torch.randint(0, 50000, (1, 2048), device='cuda').long())
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g, stream=s):
        static_out = module(static_input)
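    # Graph replay re-runs the captured kernels on the same memory, so each new input is
    # copied into static_input and results are read back from static_out.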
    real_inputs = [torch.randint(0, 50000, (1, 2048), device='cuda').long() for _ in range(100)]
    t = time.perf_counter()
    for data in real_inputs:
        #print(data[0, :20])
        static_input.copy_(data)
        #timeit(lambda: g.replay(), n=100, first=True)
        g.replay()
        #print(static_out[0, 0, :20])
    torch.cuda.synchronize()
    print(f"{perf_counter() - t}s")

    #for data in real_inputs:
    #    print(model(data)[0, 0, :20])
@@ -7,14 +7,14 @@ dry = False
config_obj = KubeConfig()
config_obj.set_name(name)
config_obj.set_gpu(gpu_name=GPU.A40, amount=1)
config_obj.set_gpu(gpu_name=GPU.RTX_A5000, amount=1)
config_obj.set_ram(16)
config_obj.set_cpu(4)
#config_obj.set_cpu_only()
config_obj.dry_run(dry)
config_obj.print_information()
config_obj.create_deployment(overwrite=True)
config_obj.create_service(overwrite=True)
#config_obj.create_deployment(overwrite=True)
#config_obj.create_service(overwrite=True)
remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
@@ -25,5 +25,6 @@ models = {'6b': '/home/xuser/models/j6b_ckpt_14001', '20b': '/home/xuser/diffusi
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
with always_rerun():
    path.sh(f'python3 test.py')
\ No newline at end of file