novelai-storage / Basedformer

Commit bc280afb, authored Mar 26, 2022 by novelailab
things fixed
Parent: fb25b47c
Showing 4 changed files with 60 additions and 22 deletions.
comparehf.py           +3   -1
lm_train/optimizer.py  +3   -0
main.py                +3   -2
train.py               +51  -19
comparehf.py

@@ -80,7 +80,9 @@ with torch.no_grad():
        hidden = hf_model.transformer.h[layer].mlp(hidden)
        assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden))
        hidden = hf_model.transformer.h[layer].attn(hidden)[0]
        assert torch.allclose(hf_model.transformer.h[layer](hidden)[0], based_model.layers[layer](hidden))
    assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden))
    hidden = hf_model.transformer.ln_f(hidden)
    assert torch.allclose(hf_model.lm_head(hidden), based_model.lm_head(hidden))
    assert torch.allclose(hf_model.transformer(x)["last_hidden_state"], based_model.get_embeds(x))
    assert torch.allclose(hf_model(x)["logits"], based_model(x))
\ No newline at end of file
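The block above walks the hidden state through the Hugging Face GPT-J reference one sub-module at a time and asserts agreement with the local implementation. A rough, self-contained sketch of the same idea is below; the embedding entry point via wte, the loop bounds, and the explicit atol are illustrative assumptions, not code from this repository.

# Sketch only: layer-by-layer equivalence check between a Hugging Face GPT-J model
# and the local based_model. The wte lookup and atol value are assumptions.
import torch

@torch.no_grad()
def check_equivalence(hf_model, based_model, x, atol=1e-5):
    hidden = hf_model.transformer.wte(x)  # assumed entry point: token embeddings
    for layer in range(len(hf_model.transformer.h)):
        hf_block = hf_model.transformer.h[layer]
        based_block = based_model.layers[layer]
        # compare the attention sub-module on the same input...
        assert torch.allclose(hf_block.attn(hidden)[0], based_block.attn(hidden), atol=atol)
        # ...then the whole block, and only then advance the hidden state
        hf_out = hf_block(hidden)[0]
        assert torch.allclose(hf_out, based_block(hidden), atol=atol)
        hidden = hf_out
    assert torch.allclose(hf_model.transformer.ln_f(hidden), based_model.ln_final(hidden), atol=atol)
    assert torch.allclose(hf_model.lm_head(hidden), based_model.lm_head(hidden), atol=atol)

Advancing hidden only after both checks keeps the two models on an identical input at every step, so a failing assert points at exactly one sub-module.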
lm_train/optimizer.py

@@ -40,6 +40,9 @@ class BasedOptimizer:
        if optimizer == "adamw":
            self.optimizer = optim.AdamW(parameters, lr=0, weight_decay=self.weight_decay, betas=(self.beta1, self.beta2), eps=self.eps)
        elif optimizer == "adamw8bit":
            import bitsandbytes as bnb
            self.optimizer = bnb.optim.Adam8bit(parameters, lr=0, weight_decay=self.weight_decay, betas=(self.beta1, self.beta2), eps=self.eps)

    def step(self, scaler=None):
        if scaler:
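The new adamw8bit branch swaps in bitsandbytes' 8-bit Adam, which keeps the optimizer moments in 8-bit and cuts optimizer-state memory to roughly a quarter of full-precision Adam. A minimal sketch of the substitution outside the BasedOptimizer wrapper; the toy model and hyperparameter values are illustrative assumptions.

# Sketch only: bnb.optim.Adam8bit as a near drop-in replacement for torch.optim.AdamW,
# shown on an illustrative toy model.
import torch
import torch.nn as nn
import bitsandbytes as bnb

model = nn.Linear(4096, 4096).cuda()

opt_fp32 = torch.optim.AdamW(model.parameters(), lr=1e-4,
                             betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01)
opt_8bit = bnb.optim.Adam8bit(model.parameters(), lr=1e-4,
                              betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01)

The constructor in the diff passes lr=0, which suggests the wrapper sets the effective learning rate later from its own schedule rather than relying on the optimizer's default.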
main.py

@@ -10,7 +10,6 @@ except ImportError:
import os
from pathlib import Path
import math
from lm_arch.gptj import GPTJModel

def no_init(loading_code):
    def dummy(self):

@@ -238,7 +237,9 @@ class GPTLayer(nn.Module):
        attn_out = self.attn(x)
        ff_out = self.ff(x, act_ck)
        x = residual + attn_out + ff_out
        #order of addition matters, i had no idea... fixed a bug here.
        x = attn_out + ff_out + residual
        #x = residual + attn_out + ff_out -> doesn't match.
        if hypernetwork:
            hyper_out = hypernetwork(x)
            x = x + hyper_out
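The comment on the new line records that swapping the order of the residual addition is what made the outputs match the reference: floating-point addition is not associative, so (residual + attn_out) + ff_out and (attn_out + ff_out) + residual can differ in the last bits, and a strict comparison against an implementation that sums in the other order will fail. A tiny sketch of the effect; the tensors are illustrative, not model activations.

# Sketch only: summation order changes the result at the rounding level,
# which is enough to break an exact (or tight-tolerance) match against a reference.
import torch

torch.manual_seed(0)
residual = torch.randn(4, 4096)
attn_out = torch.randn(4, 4096)
ff_out = torch.randn(4, 4096)

x1 = residual + attn_out + ff_out   # (residual + attn_out) + ff_out
x2 = attn_out + ff_out + residual   # (attn_out + ff_out) + residual

print(torch.equal(x1, x2))          # typically False
print((x1 - x2).abs().max())        # tiny but nonzero rounding difference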
train.py

@@ -14,6 +14,34 @@ import time
import wandb
from lm_arch.gpt2 import GPT2Model
import numpy as np
from transformers import AutoTokenizer

class HyperNetwork(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config["hidden_dim"]
        self.linear = nn.Linear(embed_dim, embed_dim, bias=True)
        self.linear.weight.data.normal_(mean=0.0, std=0.02)
        for param in self.linear.parameters():
            param.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * config["n_layer"])))

    def forward(self, hidden_states):
        hidden_states = self.linear(hidden_states)
        hidden_states = hidden_states.mul(torch.sigmoid(hidden_states))
        return hidden_states

model_config = {
    "n_layer": 28,
    "n_head": 16,
    "hidden_dim": 4096,
    "vocab_dim": 50400,
    "eps": 1e-5,
    "activation": gelu_new,
    "Layer": GPTLayer
}

model_config = {
    "n_layer": 12,
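The new HyperNetwork is a single gated linear layer: a hidden-to-hidden projection followed by x * sigmoid(x) (SiLU-style) gating, initialized with a small standard deviation (0.02 / sqrt(2 * n_layer)) so its initial contribution to the residual stream is small. A short usage sketch, mirroring the x = x + hyper_out pattern in main.py; the config values and tensor shape are illustrative assumptions.

# Sketch only: applying the HyperNetwork defined above to a batch of hidden states
# and folding its output back in, as main.py does with `x = x + hyper_out`.
import torch

config = {"hidden_dim": 768, "n_layer": 12}       # illustrative config
hypernetwork = HyperNetwork(config)

hidden_states = torch.randn(2, 1024, config["hidden_dim"])  # (batch, seq, hidden)
hyper_out = hypernetwork(hidden_states)                      # linear + x*sigmoid(x) gate
hidden_states = hidden_states + hyper_out                    # residual-style add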
@@ -28,16 +56,18 @@ model_config = {
# we need 250 batch size to train the small GPT.
train_config = {
    "data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/owt2fp16amp2",
    "run_name": "owt2-125m-fp16AMP-1024ctx-120bs-1e-4lr",
    #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/fixedj",
    "run_name": "gpt-j-8bitopt-owt2-125m-fp16AMP-fixedj",
    "lr": 1e-4,
    "end_lr": 1e-4,
    "warmup_steps": 100,
    "end_lr": 1e-4 * 2,
    "warmup_steps": 50,
    "bs": 12,
    "gas": 10,
    "seed": 69,
    "save_every": 500,
    "amp": True,
    "loss_scale": True,
}

torch.manual_seed(train_config["seed"])
bs = train_config["bs"]
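The lr, end_lr, and warmup_steps fields suggest a warmup-then-anneal learning-rate schedule driven inside the optimizer wrapper. The wrapper's scheduling code is not part of this diff, so the following is only a sketch of the common pattern such fields usually feed; the anneal_steps name and the cosine shape are assumptions, not the repository's scheduler.

# Sketch only: linear warmup to lr, then cosine anneal toward end_lr.
import math

def lr_at(step, lr=1e-4, end_lr=2e-4, warmup_steps=50, anneal_steps=10_000):
    if step < warmup_steps:
        return lr * step / max(1, warmup_steps)              # linear warmup
    progress = min(1.0, (step - warmup_steps) / anneal_steps)
    return end_lr + (lr - end_lr) * 0.5 * (1 + math.cos(math.pi * progress))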
@@ -46,6 +76,17 @@ gas = train_config["gas"]
Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)

model = GPTModel.gpt2_init(model_config).cuda().float()
#model = load_gpt_j().cuda().half()
#for param in model.parameters():
#    param.requires_grad = False
#for name, p in model.named_parameters():
#    if ("ln" in name or "vocab_embed" in name):
#        p.requires_grad = True
#hypernetwork = HyperNetwork(model_config).cuda().float()
#for param in hypernetwork.parameters():
#    param.requires_grad = True

opt = optimizer.BasedOptimizer(model.parameters(), train_config, "adamw")

# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function.
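The commented-out block above sketches a hypernetwork-only finetuning setup: freeze the base model (optionally leaving layer norms and the vocab embedding trainable) and train just the HyperNetwork. A minimal version of that idea follows; it mirrors the commented-out lines rather than what this commit actually runs, and the plain AdamW stands in for the repository's BasedOptimizer wrapper.

# Sketch only: hypernetwork-only finetuning, following the commented-out setup.
# The base model is frozen and only HyperNetwork parameters receive gradients.
import torch

model = GPTModel.gpt2_init(model_config).cuda().float()
for param in model.parameters():
    param.requires_grad = False

hypernetwork = HyperNetwork(model_config).cuda().float()
for param in hypernetwork.parameters():
    param.requires_grad = True

# illustrative optimizer choice; hyperparameters are assumptions
opt = torch.optim.AdamW(hypernetwork.parameters(), lr=1e-4, weight_decay=0.01)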
@@ -65,24 +106,15 @@ for input_ids, labels in t:
    labels = labels.cuda()
    loss = 0
    for x in range(train_config["gas"]):
        if train_config["amp"]:
            with torch.cuda.amp.autocast():
                #with torch.jit.fuser("fuser2"):
                # module = torch.jit.trace(model, torch.randint(0, 50256, (12, 1024)).long().cuda())
                logits = model(input_ids[x*bs:(x+1)*bs, :1024].cuda(), hypernetwork=None, act_ck=False)
                logits = logits.view(-1, logits.shape[-1])
                gas_labels = labels[x*bs:(x+1)*bs, :1024].contiguous()
                gas_labels = gas_labels.view(-1)
                gas_loss = F.cross_entropy(logits, gas_labels)
        else:
        with torch.cuda.amp.autocast(enabled=train_config["amp"], dtype=torch.float16):
            logits = model(input_ids[x*bs:(x+1)*bs, :1024].cuda(), hypernetwork=None, act_ck=False)
            #print(tokenizer.decode(input_ids[x*bs:(x+1)*bs, :][0]))
            logits = logits.view(-1, logits.shape[-1])
            gas_labels = labels[x*bs:(x+1)*bs, :1024].contiguous()
            gas_labels = gas_labels.view(-1)
            gas_loss = F.cross_entropy(logits, gas_labels)
        if train_config["amp"]:
            if train_config["loss_scale"]:
                scaler.scale(gas_loss).backward()
            else:
                gas_loss.backward()
@@ -90,14 +122,14 @@ for input_ids, labels in t:
        loss += gas_loss.item()
    loss = loss / gas
    if train_config["amp"]:
        if train_config["loss_scale"]:
            scaler.unscale_(opt.optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    if train_config["amp"]:
        if train_config["loss_scale"]:
            opt.step(scaler=scaler)
        else:
            opt.step()
    if train_config["amp"]:
        if train_config["loss_scale"]:
            scaler.update()
    #opt.step()
    opt.zero_grad()
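Taken together, the last two hunks implement gradient accumulation under mixed precision: each micro-batch's loss is scaled and backpropagated, then once per optimizer step the gradients are unscaled, clipped, and applied, and the scaler is updated. A condensed sketch of that pattern with a plain torch.cuda.amp.GradScaler and a bare torch.optim optimizer; the model, data, and hyperparameters are illustrative, and the repository's BasedOptimizer wrapper is not used here.

# Sketch only: AMP + loss scaling + gradient accumulation with plain PyTorch pieces.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(512, 512).cuda()
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
scaler = torch.cuda.amp.GradScaler()
gas, bs = 4, 8                                   # accumulation steps, micro-batch size

inputs = torch.randn(gas * bs, 512).cuda()
targets = torch.randn(gas * bs, 512).cuda()

for x in range(gas):
    with torch.cuda.amp.autocast(dtype=torch.float16):
        out = model(inputs[x * bs:(x + 1) * bs])
        loss = F.mse_loss(out, targets[x * bs:(x + 1) * bs])
    scaler.scale(loss).backward()                # scaled backward per micro-batch

scaler.unscale_(opt)                             # unscale before clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(opt)                                 # skips the step if grads overflowed
scaler.update()
opt.zero_grad()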