Commit 88f0a90e authored by novelailab

fairseq works

parent 24459438
from . import gptj
from . import gpt2
from . import fairseq
MODEL_MAP = {
"gptj": gptj.GPTJModel,
"gpt2": gpt2.GPT2Model,
"gpt-fairseq": fairseq.GPTFairModel
}
def get_model(model_name: str):
......
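# The body of get_model is collapsed in this diff. A minimal sketch of the
# registry lookup it implies (illustrative only, not the actual implementation):
def get_model_sketch(model_name: str):
    if model_name not in MODEL_MAP:
        raise ValueError(f"unknown model class '{model_name}', expected one of {sorted(MODEL_MAP)}")
    return MODEL_MAP[model_name]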
@@ -17,12 +17,13 @@ class BaseModel(nn.Module):
self.ln_final = nn.LayerNorm(self.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.layers = nn.ModuleList([])
self.lm_head = nn.Linear(config.hidden_dim, config.vocab_dim, bias=True)
for _ in range(config.n_layer):
for i in range(config.n_layer):
self.layers.append(
config.Layer(
attn=config.SelfAttention,
ff=config.FeedForward,
config=config,
layer_idx=i,
)
)
......
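# The constructor loop above now hands each block its own index. A minimal
# sketch of a Layer that consumes layer_idx (the real signatures live in
# gpt2.py / gptj.py; this is illustrative only):
from torch import nn

class IndexedLayerSketch(nn.Module):
    def __init__(self, attn, ff, config, layer_idx):
        nn.Module.__init__(self)
        self.layer_idx = layer_idx  # handy for per-layer caching, logging or scaling
        self.attn = attn(config)
        self.ff = ff(config)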
@@ -57,15 +57,18 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
# Code copied from HF, might want to sanity check later.
def __init__(self, hidden_dim, n_head, device, dtype):
def __init__(self, config):
nn.Module.__init__(self)
max_positions = 2049
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
1, 1, max_positions, max_positions).bool()
self.head_dim = hidden_dim // n_head
self.head_dim = config.hidden_dim // config.n_head
self.rotary_dim = self.head_dim // 4
self.hidden_dim = hidden_dim
self.n_head = n_head
self.hidden_dim = config.hidden_dim
self.n_head = config.n_head
device = config.device
dtype = config.dtype
self.register_buffer("scale_attn", torch.sqrt(torch.tensor(self.head_dim, requires_grad=False).float()))
self.register_buffer("bias", bias)
self.register_buffer("masked_bias", torch.tensor(-1e9, requires_grad=False)) #-1e10 is what mtj uses.
@@ -101,14 +104,14 @@ class SelfAttention(nn.Module):
if cache:
return x, (key, value)
else:
return x
return x, None
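# For reference, a sketch of the masked-attention math that the _attn helper
# above implements, using the buffers registered in __init__ (scale_attn,
# the causal bias mask, masked_bias). Illustrative only; the mask slicing
# needed for cached decoding is omitted.
import torch

def causal_attn_sketch(query, key, value, causal_mask, masked_bias, scale_attn):
    # query/key/value: (batch, n_head, seq, head_dim); causal_mask: bool (1, 1, seq, seq)
    attn_scores = torch.matmul(query, key.transpose(-1, -2)) / scale_attn
    attn_scores = torch.where(causal_mask, attn_scores, masked_bias.to(attn_scores.dtype))
    attn_probs = torch.nn.functional.softmax(attn_scores, dim=-1)
    return torch.matmul(attn_probs, value)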
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, activation, device, dtype):
def __init__(self, config):
nn.Module.__init__(self)
self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
self.activation = activation
self.ff1 = nn.Linear(config.hidden_dim, config.hidden_dim * 4, device=config.device, dtype=config.dtype)
self.ff2 = nn.Linear(config.hidden_dim * 4, config.hidden_dim, device=config.device, dtype=config.dtype)
self.activation = config.activation
def forward(self, x, act_ck=False):
x = self.ff1(x)
@@ -120,39 +123,39 @@ class FeedForward(nn.Module):
return x
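# The tail of FeedForward.forward is collapsed above; a plausible completion,
# checkpointing only the activation when act_ck is set (assumed, not the exact code):
from torch.utils.checkpoint import checkpoint

def ff_forward_sketch(ff1, ff2, activation, x, act_ck=False):
    x = ff1(x)
    x = checkpoint(activation, x) if act_ck else activation(x)
    return ff2(x)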
class GPT2Layer(nn.Module):
def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
def __init__(self, attn, ff, config):
nn.Module.__init__(self)
self.hidden_dim = hidden_dim
self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
self.ln_postattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)
self.hidden_dim = config.hidden_dim
self.ln_preattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ln_postattn = nn.LayerNorm(config.hidden_dim, eps=config.eps, device=config.device, dtype=config.dtype)
self.ff = ff(config)
self.attn = attn(config)
self.tick = True
def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False):
def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False, cache=False, kv=None):
residual = x
if act_ck:
x = ck(self.ln_preattn, x)
attn_out = ck(self.attn, x)
attn_out, kv = ck(self.attn, x, kv=kv, cache=cache)
else:
x = self.ln_preattn(x)
attn_out = self.attn(x)
attn_out, kv = self.attn(x, kv=kv, cache=cache)
residual = residual + attn_out
x = self.ln_postattn(x)
ff_out = self.ff(x, act_ck)
x = residual + ff_out
return x
return x, kv
class GPT2Model(base_lm.BaseModel):
def __init__(self, user_config, **kwargs):
self.default_config = {
'n_layer': 6,
'n_head': 8,
'n_tokens': 2048,
'n_tokens': 1024,
'hidden_dim': 512,
'vocab_dim': 50400,
'eps': 1e-5,
@@ -163,4 +166,30 @@ class GPT2Model(base_lm.BaseModel):
'SelfAttention': SelfAttention,
'FeedForward': FeedForward,
}
base_lm.BaseModel.__init__(self, user_config, **kwargs)
\ No newline at end of file
base_lm.BaseModel.__init__(self, user_config, **kwargs)
self.pos_embed = nn.Embedding(self.config.n_tokens, self.config.hidden_dim)
def get_embeds(self, x, hypernetwork=None, act_ck=False, kv=None, cache=False):
if kv is None:
kv = [None] * self.n_layer
past_length = 0
else:
past_length = kv[0][0].size(-2) #get sequence dim of key
position_ids = torch.arange(past_length, x.shape[-1] + past_length, dtype=torch.long, device=x.device)
position_ids = position_ids.unsqueeze(0).view(-1, x.shape[-1])
kv_new = []
x = self.vocab_embed(x)
x = x + self.pos_embed(position_ids)
for layer_id, layer in enumerate(self.layers):
x, kvi = layer(x, layer_id=layer_id, hypernetwork=hypernetwork, act_ck=act_ck, kv=kv[layer_id], cache=cache)
kv_new.append(kvi)
x = self.ln_final(x)
if cache:
return x, kv_new
else:
return x, None
\ No newline at end of file
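# A minimal sketch of how the new kv-cache path in get_embeds can drive greedy,
# token-by-token decoding; entry points and shapes are assumed from the code
# above, and the attention-side cache handling is taken on faith.
import torch

@torch.no_grad()
def greedy_decode_sketch(model, input_ids, steps=16):
    hidden, kv = model.get_embeds(input_ids, cache=True)            # prime the cache with the prompt
    next_tok = model.lm_head(hidden)[:, -1].argmax(-1, keepdim=True)
    out = [next_tok]
    for _ in range(steps - 1):
        hidden, kv = model.get_embeds(next_tok, cache=True, kv=kv)  # feed only the newest token
        next_tok = model.lm_head(hidden)[:, -1].argmax(-1, keepdim=True)
        out.append(next_tok)
    return torch.cat(out, dim=-1)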
@@ -48,13 +48,13 @@ remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
env1.sh('pip3 install dotmap')
#env1.sh('pip3 install git+https://github.com/pytorch/fairseq')
env1.sh('pip3 install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
with always_rerun():
env1.sh('pip uninstall transformers')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
with always_rerun():
if args.bash:
......
@@ -40,7 +40,9 @@ if False:
#path.sh("pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113")
with always_rerun():
if True:
path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/gptj-6b --device 0 --tasks lambada")
env1.sh('pip3 uninstall transformers')
env1.sh('pip3 install transformers')
path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m --device 0 --tasks lambada --no_cache")
#path.sh("python3 ../lm-evaluation-harness/main.py --batch_size 8")
else:
......
from basedformer.utils import *
import basedformer.lm_utils as lmu
from fairseq.models.transformer_lm import TransformerLanguageModel
import time
import torch
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
import torch.nn.functional as F
from transformers import GPTNeoForCausalLM
from icecream import ic
# replicating IPython's %timeit magic function
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
precision = 'ns'
r_arr = np.empty([2, r]) # [0] = mean, [1] = std
if function:
func.__name__ = function.__name__
for i in tqdm(range(r)) if do_tqdm else range(r):
n_arr = np.empty(n)
for k in range(n):
start = perf_counter_ns()
func()
n_arr[k] = perf_counter_ns() - start
if not first:
# delete the first element from n_arr numpy array
n_arr = np.delete(n_arr, 0)
r_arr[0, i] = np.mean(n_arr)
r_arr[1, i] = np.std(n_arr)
best = r_arr[:, np.argmin(r_arr[0])] # [0] = mean, [1] = std
# pick a display precision based on the best mean (measurements are in ns)
if best[0] < 1e3:
precision = 'ns'
elif best[0] >= 1e9:
best[0] = best[0] * 1e-9
best[1] = best[1] * 1e-9
precision = 's'
elif best[0] >= 1e6:
best[0] = best[0] * 1e-6
best[1] = best[1] * 1e-6
precision = 'ms'
elif best[0] >= 1e3:
precision = 'μs'
best[0] = best[0] * 1e-3
best[1] = best[1] * 1e-3
if not quiet:
if precision == 'ns':
print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
if precision == 'μs':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 'ms':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 's':
print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
with torch.no_grad():
model_dir = '/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/hf_125m/'
hf_model = no_init(lambda: GPTNeoForCausalLM.from_pretrained(model_dir)).cuda().half().eval()
print("Loaded hf model")
path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m"
based_model = lmu.load_from_path(path).cuda().half().eval()
print("Loaded based model")
x = torch.randint(0, 50256, (1, 2048)).cuda().long()
assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
hidden = hf_model.transformer.wte(x)
for layer in range(len(based_model.layers)):
ic(layer)
residual = hidden
#ln_preattn
assert torch.allclose(hf_model.transformer.h[layer].ln_1(hidden), based_model.layers[layer].ln_preattn(hidden))
hidden = hf_model.transformer.h[layer].ln_1(hidden)
#attn
ic(hf_model.transformer.h[layer].attn(hidden)[0].abs().mean())
ic(based_model.layers[layer].attn(hidden)[0].abs().mean())
ic((hf_model.transformer.h[layer].attn(hidden)[0] - based_model.layers[layer].attn(hidden)[0]).abs().mean())
#assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0], rtol=1e-6)
attn_out = hf_model.transformer.h[layer].attn(hidden)[0]
hidden = residual + attn_out
residual = hidden
assert torch.allclose(hf_model.transformer.h[layer].ln_2(hidden), based_model.layers[layer].ln_postattn(hidden))
hidden = hf_model.transformer.h[layer].ln_2(hidden)
#ffn
assert torch.allclose(hf_model.transformer.h[layer].mlp(hidden), based_model.layers[layer].ff(hidden))
ff_out = hf_model.transformer.h[layer].mlp(hidden)
hidden = residual + ff_out
assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden)[0])
ic(hf_model(x)["logits"].abs().mean())
ic(based_model(x).abs().mean())
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x)[0])
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
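# The asserts above use torch.allclose with default tolerances, which is strict
# for fp16 activations; a small helper like this (illustrative, not part of the
# repo) makes the comparison threshold and the worst-case error explicit.
import torch

def close_enough(a, b, rtol=1e-3, atol=1e-5):
    max_err = (a - b).abs().max().item()
    return torch.allclose(a, b, rtol=rtol, atol=atol), max_err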
from basedformer import gptj
from basedformer.utils import *
import basedformer.lm_utils as lmu
import time
import torch
from time import perf_counter, perf_counter_ns
import numpy as np
from tqdm import tqdm
from contextlib import contextmanager
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
# replicating IPython's %timeit magic function
def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
precision = 'ns'
r_arr = np.empty([2, r]) # [0] = mean, [1] = std
if function:
func.__name__ = function.__name__
for i in tqdm(range(r)) if do_tqdm else range(r):
n_arr = np.empty(n)
for k in range(n):
start = perf_counter_ns()
func()
n_arr[k] = perf_counter_ns() - start
if not first:
# delete the first element from n_arr numpy array
n_arr = np.delete(n_arr, 0)
r_arr[0, i] = np.mean(n_arr)
r_arr[1, i] = np.std(n_arr)
best = r_arr[:, np.argmin(r_arr[0])] # [0] = mean, [1] = std
# pick a display precision based on the best mean (measurements are in ns)
if best[0] < 1e3:
precision = 'ns'
elif best[0] >= 1e9:
best[0] = best[0] * 1e-9
best[1] = best[1] * 1e-9
precision = 's'
elif best[0] >= 1e6:
best[0] = best[0] * 1e-6
best[1] = best[1] * 1e-6
precision = 'ms'
elif best[0] >= 1e3:
precision = 'μs'
best[0] = best[0] * 1e-3
best[1] = best[1] * 1e-3
if not quiet:
if precision == 'ns':
print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
if precision == 'μs':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 'ms':
print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
elif precision == 's':
print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
with torch.no_grad():
hf_model = no_init(lambda: AutoModelForCausalLM.from_pretrained('gpt2')).cuda().half().eval()
print("Loaded hf model")
path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/gpt2"
based_model = lmu.load_from_path(path).cuda().half().eval()
print("Loaded based model")
x = torch.randint(0, 50256, (1, 1000)).cuda().long()
assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
hidden = hf_model.transformer.wte(x)
for layer in range(len(based_model.layers)):
assert torch.allclose(hf_model.transformer.h[layer].ln_1(hidden), based_model.layers[layer].ln_preattn(hidden))
hidden = hf_model.transformer.h[layer].ln_1(hidden)
assert torch.allclose(hf_model.transformer.h[layer].ln_2(hidden), based_model.layers[layer].ln_postattn(hidden))
hidden = hf_model.transformer.h[layer].ln_2(hidden)
hf_mlp = hf_model.transformer.h[layer].mlp(hidden)
based_mlp = based_model.layers[layer].ff(hidden)
print((hf_mlp - based_mlp).abs().sum())
hidden = hf_mlp
hf_attn = hf_model.transformer.h[layer].attn(hidden)[0]
based_attn = based_model.layers[layer].attn(hidden)[0]
print((hf_attn - based_attn).abs().sum())
assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0])
hidden = hf_model.transformer.h[layer].attn(hidden)[0]
assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden)[0])
assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
hidden = hf_model.transformer.ln_f(hidden)
assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x)[0])
assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
import json
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
import sys
import os
copy_eot_to_newline = True
copy_newline_to_eot = True
model_dir = 'pretrained/en_dense_lm_125m' # path to smol model weights to fix tokenizer shuffle
checkpoint = {}
ckmap = {}
ckid = 0
def save(params, name):
global ckid
ckmap[name] = f"b{ckid}.pt"
ckid += 1
torch.save(params, f"{sys.argv[2]}/" + ckmap[name])
torch.save(ckmap, f"{sys.argv[2]}/m.pt")
print(name + ": " + str(params.shape))
del params
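# For reference, a sketch of how a checkpoint sharded this way is read back into
# a single state dict (the real loader lives in basedformer; illustrative only).
def load_sharded_sketch(ckpt_dir):
    name_to_file = torch.load(os.path.join(ckpt_dir, "m.pt"), map_location="cpu")
    return {name: torch.load(os.path.join(ckpt_dir, fname), map_location="cpu")
            for name, fname in name_to_file.items()}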
def no_init(loading_code):
def dummy(self):
return
modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
original = {}
for mod in modules:
original[mod] = mod.reset_parameters
mod.reset_parameters = dummy
result = loading_code()
for mod in modules:
mod.reset_parameters = original[mod]
return result
lm = no_init(lambda: TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2').eval().cpu())
fairdict = torch.load(f"{sys.argv[1]}", map_location="cpu")
try:
os.mkdir(sys.argv[2])
except FileExistsError:
pass
hidden_dim = fairdict["cfg"]["model"]["decoder_embed_dim"]
num_heads = fairdict["cfg"]["model"]["decoder_attention_heads"]
num_layers = fairdict["cfg"]["model"]["decoder_layers"]
fairdict = fairdict["model"]
config = {
"activation_function": "gelu",
"architectures": ["GPTNeoForCausalLM"],
"attention_dropout": 0,
"attention_layers": ["global"] * num_layers,
"attention_types": [[["global"], num_layers]],
"bos_token_id": 50256,
"embed_dropout": 0,
"eos_token_id": 50256,
"gradient_checkpointing": False,
"hidden_size": hidden_dim,
"initializer_range": 0.02,
"intermediate_size": None,
"fair": True,
"layer_norm_epsilon": 1e-05,
"max_position_embeddings": 2048,
"model_type": "gpt_neo",
"num_heads": num_heads,
"num_layers": num_layers,
"resid_dropout": 0,
"rotary": False,
"summary_activation": None,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": True,
"summary_type": "cls_index",
"summary_use_proj": True,
"model_dtype": "fp16",
"model_device": "cuda",
"transformers_version": "4.6.0.dev0",
"use_cache": True,
"vocab_size": 51200,
"window_size": 256,
"tokenizer_class": "GPT2Tokenizer",
"task_specific_params": {"text-generation": {"do_sample": True,"temperature": 1.0,"max_length": 50}}
}
with open(f"{sys.argv[2]}/config.json", "w") as fh:
fh.write(json.dumps(config))
#print(lm)
def hack_embs(embs):
eot = embs[50256].clone()
newline = embs[198].clone()
if copy_eot_to_newline:
embs[198] = eot
if copy_newline_to_eot:
embs[50256] = newline
# gpt2 compatible input/output embedding layers
l1 = []
l2 = []
check = {}
for i in range(50256):
check[i] = True
for i, s in enumerate(lm.tgt_dict.symbols):
try:
if str(int(s)) == s and s != '50256':
l2.append(int(s))
l1.append(i)
del check[int(s)]
except (ValueError, KeyError):
pass
for i, s in enumerate([lm.tgt_dict.eos_word, lm.tgt_dict.pad_word, lm.tgt_dict.bos_word, lm.tgt_dict.unk_word]):
l2.append(50256 + i)
l1.append(lm.tgt_dict.indices[s])
mapping = {}
for i in range(50260):
mapping[l1[i]] = l2[i]
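# Optional sanity check (illustrative): if the remap above is clean, every
# GPT-2 id 0..50259 should appear exactly once among the mapped values.
if sorted(mapping.values()) != list(range(50260)):
    print("warning: fairseq -> gpt2 id remap is not a clean bijection over 50260 ids")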
with torch.no_grad():
wte = fairdict["decoder.embed_tokens.weight"].clone()
for i in range(50260):
wte[mapping[i]] = fairdict["decoder.embed_tokens.weight"][i]
hack_embs(wte)
save(wte.half(), "transformer.wte.weight")
lm_head = fairdict["decoder.output_projection.weight"].clone()
for i in range(50260):
lm_head[mapping[i]] = fairdict["decoder.output_projection.weight"][i]
hack_embs(lm_head)
save(lm_head.half(), "lm_head.weight")
save(torch.FloatTensor(1), "transformer.wpe_sin._float_tensor")
new_state_dict = {}
for y in fairdict:
dotlist = y.split(".")
if y == "decoder.version":
trans_to = "Passed"
pass
elif y == "decoder.embed_tokens.weight":
continue
elif len(dotlist) >= 2 and dotlist[1] == "layers":
layer_id = dotlist[2]
if dotlist[-2] in ["k_proj", "v_proj", "q_proj", "out_proj"]:
trans_to = f"transformer.h.{layer_id}.attn.attention.{dotlist[-2]}.{dotlist[-1]}"
if dotlist[-2] == "self_attn_layer_norm":
trans_to = f"transformer.h.{layer_id}.ln_1.{dotlist[-1]}"
if dotlist[3] == "fc1":
trans_to = f"transformer.h.{layer_id}.mlp.c_fc.{dotlist[-1]}"
if dotlist[3] == "fc2":
trans_to = f"transformer.h.{layer_id}.mlp.c_proj.{dotlist[-1]}"
if dotlist[3] == "final_layer_norm":
trans_to = f"transformer.h.{layer_id}.ln_2.{dotlist[-1]}"
elif len(dotlist) >= 2 and dotlist[1] == "layer_norm":
trans_to = f"transformer.ln_f.{dotlist[-1]}"
elif y == "decoder.output_projection.weight":
continue
if trans_to != "Passed":
save(fairdict[y].half(), trans_to)
print(f"{trans_to} < {y}")
\ No newline at end of file
import json
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
import sys
import os
from pathlib import Path
copy_eot_to_newline = True
copy_newline_to_eot = True
model_dir = 'pretrained/en_dense_lm_125m' # path to smol model weights to fix tokenizer shuffle
checkpoint = {}
ckmap = {}
ckid = 0
def save(params, name):
global ckid
ckmap[name] = f"b{ckid}.pt"
ckid += 1
path = Path(f"{sys.argv[2]}/lm")
path.mkdir(parents=True, exist_ok=True)
torch.save(params, path / ckmap[name])
torch.save(ckmap, path / "m.pt")
print(name + ": " + str(params.shape))
del params
def no_init(loading_code):
def dummy(self):
return
modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
original = {}
for mod in modules:
original[mod] = mod.reset_parameters
mod.reset_parameters = dummy
result = loading_code()
for mod in modules:
mod.reset_parameters = original[mod]
return result
lm = no_init(lambda: TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2').eval().cpu())
fairdict = torch.load(f"{sys.argv[1]}", map_location="cpu")
try:
os.mkdir(sys.argv[2])
except FileExistsError:
pass
hidden_dim = fairdict["cfg"]["model"]["decoder_embed_dim"]
num_heads = fairdict["cfg"]["model"]["decoder_attention_heads"]
num_layers = fairdict["cfg"]["model"]["decoder_layers"]
fairdict = fairdict["model"]
config = {
"model_class": "gpt-fairseq",
"model_path": ".",
"model_config": {
"n_layer": num_layers,
"n_head": num_heads,
"hidden_dim": hidden_dim,
"vocab_dim": 51200,
"eps": 1e-05,
"n_tokens": 2049
}
}
with open(f"{sys.argv[2]}/config.json", "w") as fh:
fh.write(json.dumps(config))
#print(lm)
def hack_embs(embs):
eot = embs[50256].clone()
newline = embs[198].clone()
if copy_eot_to_newline:
embs[198] = eot
if copy_newline_to_eot:
embs[50256] = newline
# gpt2 compatible input/output embedding layers
l1 = []
l2 = []
check = {}
for i in range(50256):
check[i] = True
for i, s in enumerate(lm.tgt_dict.symbols):
try:
if str(int(s)) == s and s != '50256':
l2.append(int(s))
l1.append(i)
del check[int(s)]
except (ValueError, KeyError):
pass
for i, s in enumerate([lm.tgt_dict.eos_word, lm.tgt_dict.pad_word, lm.tgt_dict.bos_word, lm.tgt_dict.unk_word]):
l2.append(50256 + i)
l1.append(lm.tgt_dict.indices[s])
mapping = {}
for i in range(50260):
mapping[l1[i]] = l2[i]
with torch.no_grad():
wte = fairdict["decoder.embed_tokens.weight"].clone()
for i in range(50260):
wte[mapping[i]] = fairdict["decoder.embed_tokens.weight"][i]
hack_embs(wte)
save(wte.half(), "vocab_embed.weight")
lm_head = fairdict["decoder.output_projection.weight"].clone()
for i in range(50260):
lm_head[mapping[i]] = fairdict["decoder.output_projection.weight"][i]
hack_embs(lm_head)
save(lm_head.half(), "lm_head.weight")
save(torch.FloatTensor(1), "pos_embed._float_tensor")
new_state_dict = {}
for y in fairdict:
dotlist = y.split(".")
if y == "decoder.version":
trans_to = "Passed"
pass
elif y == "decoder.embed_tokens.weight":
continue
elif len(dotlist) >= 2 and dotlist[1] == "layers":
layer_id = dotlist[2]
if dotlist[-2] in ["k_proj", "v_proj", "q_proj", "out_proj"]:
trans_to = f"layers.{layer_id}.attn.{dotlist[-2]}.{dotlist[-1]}"
if dotlist[-2] == "self_attn_layer_norm":
trans_to = f"layers.{layer_id}.ln_preattn.{dotlist[-1]}"
if dotlist[3] == "fc1":
trans_to = f"layers.{layer_id}.ff.ff1.{dotlist[-1]}"
if dotlist[3] == "fc2":
trans_to = f"layers.{layer_id}.ff.ff2.{dotlist[-1]}"
if dotlist[3] == "final_layer_norm":
trans_to = f"layers.{layer_id}.ln_postattn.{dotlist[-1]}"
elif len(dotlist) >= 2 and dotlist[1] == "layer_norm":
trans_to = f"ln_final.{dotlist[-1]}"
elif y == "decoder.output_projection.weight":
continue
if trans_to != "Passed":
save(fairdict[y].half(), trans_to)
print(f"{trans_to} < {y}")
\ No newline at end of file
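# Illustrative examples of the key translation performed above (fairseq
# checkpoint name -> basedformer name); layer index 3 is arbitrary:
example_renames = {
    "decoder.layers.3.self_attn.q_proj.weight": "layers.3.attn.q_proj.weight",
    "decoder.layers.3.self_attn_layer_norm.weight": "layers.3.ln_preattn.weight",
    "decoder.layers.3.fc1.weight": "layers.3.ff.ff1.weight",
    "decoder.layers.3.final_layer_norm.bias": "layers.3.ln_postattn.bias",
    "decoder.layer_norm.weight": "ln_final.weight",
}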
import torch
import transformers
import sys
from icecream import ic
import os
"""
Original:
@@ -22,23 +24,27 @@ h.0.mlp.c_fc.bias
h.0.mlp.c_proj.weight
h.0.mlp.c_proj.bias
attn has biases unlike GPT-J. QKV Matrices are also merged instead of separate. what is the order though?
attn has biases unlike GPT-J. QKV Matrices are also merged instead of separate. what is the order though? probably just QKV.
"""
x = torch.load("models/gpt2_vanilla/pytorch_model.bin")
state_dict = x
print(x["h.0.attn.c_attn.weight"].reshape(-1, 768, 768).shape)
sys.exit(0)
ic(x["h.0.attn.c_attn.weight"].shape)
ic(x["h.0.attn.c_attn.bias"].shape)
ic(x["h.0.attn.c_proj.weight"].shape)
ic(x["h.0.attn.c_proj.bias"].shape)
new_state_dict = {}
module_map = {
"ln_1": "ln_preattn",
"ln_2": "ln_postattn",
"mlp.c_proj": "ff.ff2",
"mlp.c_fc": "ff.ff1",
"attn.attention.out_proj": "attn.out_proj",
"attn.attention.k_proj": "attn.k_proj",
"attn.attention.v_proj": "attn.v_proj",
"attn.attention.q_proj": "attn.q_proj",
"attn.c_proj": "attn.out_proj",
"attn.c_attn": "attn.k_proj",
"wte": "vocab_embed",
"wpe": "pos_embed",
'ln_f': 'ln_final',
'lm_head': 'lm_head',
}
@@ -47,18 +53,38 @@ print(type(state_dict))
for key in state_dict.keys():
dotlist = key.split('.')
if len(dotlist) > 3:
layer = dotlist[2]
layer = dotlist[1]
for x in module_map:
if x in key:
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
if x == "attn.c_attn":
if "weight" in key:
hidden_dim = state_dict[key].shape[0]
# HF's Conv1D stores the merged projection as (hidden, 3*hidden) with Q, K, V
# concatenated along the last dim, so slice columns and transpose to nn.Linear layout.
qkv = state_dict[key].split(hidden_dim, dim=-1)
new_state_dict[f"layers.{layer}.attn.q_proj.weight"] = qkv[0].transpose(-1, -2)
new_state_dict[f"layers.{layer}.attn.k_proj.weight"] = qkv[1].transpose(-1, -2)
new_state_dict[f"layers.{layer}.attn.v_proj.weight"] = qkv[2].transpose(-1, -2)
if "bias" in key:
hidden_dim = state_dict[key].shape[0] // 3
qkv = state_dict[key].reshape(-1, hidden_dim).split(1)
new_state_dict[f"layers.{layer}.attn.q_proj.bias"] = qkv[0].squeeze(0)
new_state_dict[f"layers.{layer}.attn.k_proj.bias"] = qkv[1].squeeze(0)
new_state_dict[f"layers.{layer}.attn.v_proj.bias"] = qkv[2].squeeze(0)
else:
if len(state_dict[key].shape) == 2:
ic("transpose!")
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key].transpose(-1, -2)
else:
new_state_dict[f"layers.{layer}.{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> layers.{layer}.{module_map[x]}.{dotlist[-1]}")
else:
for x in module_map:
if x in key:
new_state_dict[f"{module_map[x]}.{dotlist[-1]}"] = state_dict[key]
print(f"{key} -> {module_map[x]}.{dotlist[-1]}")
#print(new_state_dict)
for k, v in new_state_dict.items():
print(f"{k} -> {v.shape}")
def save(state_dict, path):
try: os.mkdir(path)
@@ -69,4 +95,4 @@ def save(state_dict, path):
torch.save(x[1], f"{path}/b{i}.pt")
torch.save(checkpoint, f"{path}/m.pt")
save(new_state_dict, "models/6b_vanilla")
\ No newline at end of file
save(new_state_dict, "pretrained/gpt2/lm")