novelai-storage / Basedformer, commit 1ab8bac6
Authored Apr 10, 2022 by novelailab
Parent: 2d0b32de

    opt/model save/load works!

Showing 5 changed files with 65 additions and 29 deletions (+65 -29):

    .tmp.86973                 +1   -0
    basedformer/gptj.py        +1   -1
    basedformer/optimizer.py   +9   -8
    hypertrain.py              +51  -17
    run_pyfra.py               +3   -3
.tmp.86973 (new file, mode 100644)

This source diff could not be displayed because it is too large. You can view the blob instead.
basedformer/gptj.py

@@ -150,7 +150,7 @@ class FeedForward(nn.Module):
     def forward(self, x, act_ck=False):
         x = self.ff1(x)
         if act_ck:
-            ck(self.activation, x)
+            x = ck(self.activation, x)
         else:
             x = self.activation(x)
         x = self.ff2(x)
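The gptj.py hunk is a one-line bug fix: torch.utils.checkpoint.checkpoint (imported as ck) does not modify its input in place. It runs the wrapped function and returns the output, so the old bare call discarded the result and x was never actually passed through the activation when act_ck was set. A minimal standalone sketch of the before/after behavior (not project code):

    import torch
    from torch.utils.checkpoint import checkpoint as ck

    activation = torch.nn.GELU()
    x = torch.randn(2, 8, requires_grad=True)

    ck(activation, x)      # old code: the checkpointed output is discarded
    x = ck(activation, x)  # fixed: assign the output back to x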
basedformer/optimizer.py

@@ -1,4 +1,5 @@
+from curses import meta
 from torch import optim
 import numpy as np
 import torch

@@ -71,7 +72,6 @@ class BasedOptimizer:
     def step(self, dry_run=False, scaler=None):
-        self.curr_lr = lr_schedule(self.curr_step, self.warmup_steps, self.anneal_steps, self.lr, self.end_lr)
         if not dry_run:
             if scaler:
                 scaler.step(self.optimizer)

@@ -80,6 +80,7 @@ class BasedOptimizer:
                 self.optimizer.step()

         self.curr_step = self.curr_step + 1
+        self.curr_lr = lr_schedule(self.curr_step, self.warmup_steps, self.anneal_steps, self.lr, self.end_lr)
         if not self.max_lr:
             if self.curr_lr == self.end_lr:

@@ -102,20 +103,20 @@ class BasedOptimizer:
             print(f"curr_lr: {str(self.get_current_lr())}")

     def save(self, path: Path):
         path = path / "opt"
         path = Path(path)
         path.mkdir(parents=True, exist_ok=True)
         torch.save(self.optimizer.state_dict(), path / "opt_states.pt")
-        #clean the optimizer and parameters from the dict.
-        del self.optimizer
-        del self.parameters
-        metadata = self.__dict__
+        metadata = self.__dict__.copy()
+        #clean the optimizer and parameters from the dict.
+        del metadata["optimizer"]
+        del metadata["parameters"]
         with open(path / "opt_metadata.pkl", 'wb') as f:
             pickle.dump(metadata, f)

     @classmethod
     def load(cls, parameters, path):
         path = path / "opt"
         path = Path(path)
         with open(path / "opt_metadata.pkl", 'rb') as f:
             metadata = pickle.load(f)
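Two things stand out in this file. The new "from curses import meta" line looks like a stray editor auto-import; nothing in the visible hunks uses it. The substantive fix is in save(): self.__dict__ is the object's live attribute dictionary, not a snapshot, so the old code's del self.optimizer / del self.parameters stripped those attributes off the running optimizer the first time it was saved, which is presumably why save/load did not work before this commit. Copying the dict first leaves the object intact. A standalone sketch of the difference, using a hypothetical Holder class for illustration:

    # __dict__ aliases the object's attributes; copy() snapshots them.
    class Holder:
        def __init__(self):
            self.optimizer = "big optimizer state"
            self.parameters = "parameter refs"
            self.curr_step = 42

    broken = Holder()
    metadata = broken.__dict__        # alias, not a copy
    del metadata["optimizer"]         # also deletes broken.optimizer!
    assert not hasattr(broken, "optimizer")

    fixed = Holder()
    metadata = fixed.__dict__.copy()  # snapshot, as in this commit
    del metadata["optimizer"]
    del metadata["parameters"]
    assert hasattr(fixed, "optimizer")  # live object untouched
    print(metadata)                     # small, picklable metadata dict

Moving the lr_schedule call to after the curr_step increment also keeps the pickled curr_lr consistent with the step counter it is saved alongside.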
hypertrain.py

@@ -12,8 +12,9 @@ import wandb
 import numpy as np
 from torch.utils.checkpoint import checkpoint as ck
 from math import log2, ceil
-from basedformer import gptj, lm_base, optimizer
+from basedformer import gptj, optimizer
 from basedformer.utils import *
+import glob

 def _init_weights(module):
     if isinstance(module, nn.Linear):

@@ -158,16 +159,16 @@ train_config = {
     "data_path": "/home/xuser/diffusionstorage/datasets/enwik9-gpt2-2049.map",
     #"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
     #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
-    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs16",
-    "do_save": False,
-    "run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layer",
+    "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/hypernetwork-gptj-2048-enwik9-bs16-save",
+    "do_save": True,
+    "run_name": "gpt-j-enwik9-6b-postln-bf16-2e-4-4bsz-every5layersavetest",
     "lr": 2e-4,
     "end_lr": 2e-4,
     "warmup_steps": 50,
     "bs": 1,
     "gas": 4,
     "seed": 69,
-    "save_every": 100,
+    "save_every": 300,
     "amp": False,
     "loss_scale": False,
 }

@@ -178,7 +179,7 @@ gas = train_config["gas"]
 Path(train_config["save_path"]).mkdir(parents=True, exist_ok=True)

 #model = GPTModel.gpt2_init(model_config).cuda().float()
-model = lm_base.().cuda().bfloat16()
+model = gptj.load_gpt_j().lm.cuda().bfloat16()
 for param in model.parameters():
     param.requires_grad = False

@@ -192,16 +193,33 @@ hypernetwork = HyperNetworkSingle(model_config).cuda().float()
 for param in hypernetwork.parameters():
     param.requires_grad = True

-opt = optimizer.BasedOptimizer(hypernetwork.parameters(), train_config, "adamw")
+cp_list = sorted(os.listdir(train_config["save_path"]), key=lambda x: int(x.split("_")[-1]))
+last_cp = Path(train_config["save_path"]) / cp_list[-1] if len(cp_list) > 0 else None
+print(last_cp)

-# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function.
+if last_cp:
+    print("Loading from step {}".format(cp_list[-1].split("_")[-1]))
+    hypernetwork.load_state_dict(torch.load(last_cp / "hyper.pt"))
+    opt = optimizer.BasedOptimizer.load(hypernetwork.parameters(), last_cp / "opt")
+else:
+    opt = optimizer.BasedOptimizer(hypernetwork.parameters(), train_config, "adamw")
+
+# TODO: Add load, add evals, add FP16 AMP, and Data Parallel, outputting hidden states from the get_logits function.
+print(opt.curr_step)
 train_dataset = FbDataset(2049, train_config["data_path"])
-train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0)
+if last_cp:
+    train_dataset.skip = opt.curr_step * bs * gas
+train_loader = data.DataLoader(train_dataset, batch_size=bs*gas, shuffle=False, num_workers=0, )
 wandb.init(project="hypernetwork-tests", name=train_config["run_name"], config={**train_config, **model_config})

-t = tqdm(train_loader)
-curr_step = 0
+if last_cp:
+    curr_step = opt.curr_step
+else:
+    curr_step = 0
+t = tqdm(train_loader, initial=curr_step)
 scaler = torch.cuda.amp.GradScaler()

@@ -234,18 +252,34 @@ for input_ids, labels in t:
             opt.step(scaler=scaler)
         else:
             opt.step()
         if train_config["loss_scale"]:
             scaler.update()
         #opt.step()
         opt.zero_grad()
         sec_per_step = (time.perf_counter() - timex)
         step_per_sec = (1. / sec_per_step)
         tokens_per_sec = (step_per_sec * 2048) * bs * gas
         t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step, {tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
-        wandb.log({"train/loss": loss, "train/tokens_per_sec": tokens_per_sec, "train/sec_per_step": sec_per_step, "train/step_per_sec": step_per_sec, "train/lr": opt.curr_lr, "train/loss_scale": scaler.get_scale()})
-        curr_step += 1
+        wandb.log(
+            {
+                "train/loss": loss,
+                "train/tokens_per_sec": tokens_per_sec,
+                "train/sec_per_step": sec_per_step,
+                "train/step_per_sec": step_per_sec,
+                "train/lr": opt.curr_lr,
+                "train/loss_scale": scaler.get_scale()
+            },
+            step=curr_step)
         if train_config["do_save"]:
-            if curr_step % train_config["save_every"] == 0 or curr_step == 1:
-                torch.save(hypernetwork.state_dict(), train_config["save_path"] + f"/{curr_step}.hyper")
-                #model.save(train_config["save_path"] + f"/{curr_step}")
+            if curr_step % train_config["save_every"] == 0 and curr_step != 0:
+                save_folder = Path(train_config["save_path"]) / f"step_{curr_step}"
+                save_folder.mkdir(parents=True, exist_ok=True)
+                torch.save(hypernetwork.state_dict(), save_folder / "hyper.pt")
+                opt.save(save_folder / "opt")
+                print(f"Saved model at step {curr_step}")
+                sys.exit(0)
+        curr_step += 1
\ No newline at end of file
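The bulk of the change is hypertrain.py learning to resume: on startup it lists the step_<N> folders under save_path, picks the newest, restores the hypernetwork weights and BasedOptimizer state, fast-forwards the dataset by curr_step * bs * gas samples, and starts the tqdm bar at the restored step. Note the sys.exit(0) right after the first successful save: the script deliberately stops there, consistent with this commit being a save/load round-trip test. A condensed sketch of the discovery logic (the step_ naming follows the diff; it assumes, as the script does, that save_path contains only step_<N> folders):

    import os
    from pathlib import Path

    def find_last_checkpoint(save_path):
        """Return the newest step_<N> folder under save_path, or None on a fresh run."""
        # Sort numerically on the trailing <N>; a plain lexical sort would put
        # "step_1000" before "step_300".
        cp_list = sorted(os.listdir(save_path), key=lambda x: int(x.split("_")[-1]))
        return Path(save_path) / cp_list[-1] if cp_list else None

One consequence of the visible hunks: opt.save(save_folder / "opt") combined with BasedOptimizer.save() appending another "opt" segment internally puts the optimizer states in step_<N>/opt/opt/. The load path is built the same way, so the round trip stays consistent, just one level more nested than the names suggest.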
run_pyfra.py

@@ -13,13 +13,13 @@ bash = False
 config_obj = KubeConfig()
 config_obj.set_name(name)
-config_obj.set_gpu(gpu_name=GPU.A40, amount=1)
+config_obj.set_gpu(gpu_name=GPU.A100_PCIE_40GB, amount=1)
 config_obj.set_ram(16)
 config_obj.set_cpu(4)
 config_obj.dry_run(dry)
 config_obj.print_information()
-# config_obj.create_deployment(overwrite=True)
-# config_obj.create_service(overwrite=True)
+config_obj.create_deployment(overwrite=True)
+config_obj.create_service(overwrite=True)
 remote = config_obj.get_pyfra_remote()
 env1 = remote.env('noname', python_version=None)