novelai-storage / Basedformer · Commit 55a00bbf

Authored Jul 17, 2022 by Wes Brown
Parent: eebb1fa8

Epoch support, and mask `<|endoftext|>`

Showing 1 changed file with 77 additions and 65 deletions:
hypertrain.py (+77, -65)
--- a/hypertrain.py
+++ b/hypertrain.py
@@ -28,7 +28,8 @@ prompts = ["<|endoftext|>",
            "The mercurial and beautiful",
            "<|endoftext|>[ Author:",
            "<|endoftext|>[ Genre:",
-           "***"]
+           "***",
+           "----"]

 def _init_weights(module):
@@ -285,6 +286,7 @@ parser.add_argument("--logs", type=str, help="log directory location",
 parser.add_argument("--masked", type=bool, help="masked softmax fusion")
 parser.add_argument("--sample_vanilla", type=bool, help="sample vanilla model")
 parser.add_argument("--shuffle", type=bool, help="shuffle dataset contexts")
+parser.add_argument("--epochs", type=int, help="number of epochs to train for")
 parser.set_defaults(loss_scale=False, amp=False, no_resume=False, masked=False,
                     sample_vanilla=False, shuffle=False)
 args = parser.parse_args()
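A side note on the flags above: `argparse` with `type=bool` is a well-known foot-gun. `bool("False")` is `True`, so any non-empty value passed on the command line turns the flag on; the `set_defaults(...)` call only covers the unset case, and `--epochs` gets no default at all, so `args.epochs` is `None` when the flag is omitted. A minimal sketch of the more robust form (an illustration only, not what the commit does):

    import argparse

    parser = argparse.ArgumentParser()
    # store_true avoids the bool("False") == True trap entirely.
    parser.add_argument("--masked", action="store_true",
                        help="masked softmax fusion")
    parser.add_argument("--shuffle", action="store_true",
                        help="shuffle dataset contexts")
    # A default here guards total_steps = len(train_loader) * args.epochs
    # against multiplying by None when --epochs is omitted.
    parser.add_argument("--epochs", type=int, default=1,
                        help="number of epochs to train for")

    args = parser.parse_args(["--shuffle", "--epochs", "3"])
    assert args.shuffle and not args.masked and args.epochs == 3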
@@ -312,6 +314,7 @@ train_config = {
     "context_size": args.context_size,
     "sample_vanilla": args.sample_vanilla,
     "shuffle": args.shuffle,
+    "epochs": args.epochs,
 }
 torch.manual_seed(train_config["seed"])
 bs = train_config["bs"]
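With `epochs` now in `train_config`, the rewritten loop below sizes its progress bar as `len(train_loader) * epochs` and logs a fractional epoch counter to wandb. A small worked example of that accounting, with hypothetical sizes:

    epoch_steps = 1000                  # len(train_loader)
    epochs = 3                          # train_config["epochs"]
    total_steps = epoch_steps * epochs  # 3000 steps drive the tqdm bar

    curr_step = 2500
    print(float(curr_step) / float(epoch_steps))  # 2.5: halfway through epoch 3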
@@ -368,70 +371,79 @@ if last_cp:
 else:
     curr_step = 0

-t = tqdm(train_loader, initial=curr_step)
-for input_ids, labels in t:
-    timex = time.perf_counter()
-    input_ids = input_ids.to(gpu)
-    labels = labels.to(gpu)
-    loss = 0
-    for x in range(train_config["gas"]):
-        with amp.autocast(enabled=train_config["amp"], dtype=torch.float16):
-            logits, _ = model(input_ids[x * bs:(x + 1) * bs, :].to(gpu),
-                              hypernetwork=hypernetwork, act_ck=True)
-            logits = logits.view(-1, logits.shape[-1])
-            gas_labels = labels[x * bs:(x + 1) * bs, :].contiguous()
-            gas_labels = gas_labels.view(-1)
-            gas_loss = F.cross_entropy(logits, gas_labels)
-        if train_config["loss_scale"]:
-            scaler.scale(gas_loss).backward()
-        else:
-            gas_loss.backward()
-        loss += gas_loss.item()
-    loss = loss / gas
-    if train_config["loss_scale"]:
-        scaler.unscale_(opt.optimizer)
-    torch.nn.utils.clip_grad_norm_(hypernetwork.parameters(), 1)
-    if train_config["loss_scale"]:
-        opt.step(scaler=scaler)
-    else:
-        opt.step()
-    if train_config["loss_scale"]:
-        scaler.update()
-    opt.zero_grad()
-    sec_per_step = (time.perf_counter() - timex)
-    step_per_sec = (1. / sec_per_step)
-    tokens_per_sec = (step_per_sec * train_config["context_size"]) * bs * gas
-    t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step," +
-                      f"{tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
-    wandb.log({
-        "train/loss": loss,
-        "train/tokens_per_sec": tokens_per_sec,
-        "train/sec_per_step": sec_per_step,
-        "train/step_per_sec": step_per_sec,
-        "train/lr": opt.curr_lr,
-        "train/loss_scale": scaler.get_scale()
-    }, step=curr_step)
-    if train_config["do_save"] and \
-            curr_step % train_config["save_every"] == 0 and \
-            curr_step != 0:
-        hypernetwork_saver(f"step_{curr_step}")
-        print(f"\nSaved model at step {curr_step}")
-    if curr_step % train_config["eval_every"] == 0:
-        eval_fn(curr_step)
-    curr_step += 1
+epoch_steps = len(train_loader)
+total_steps = epoch_steps * train_config['epochs']
+with tqdm(total=total_steps, initial=curr_step) as t:
+    for epoch in range(train_config['epochs']):
+        for input_ids, labels in train_loader:
+            timex = time.perf_counter()
+            input_ids = input_ids.to(gpu)
+            labels = labels.to(gpu)
+            loss = 0
+            for x in range(train_config["gas"]):
+                with amp.autocast(enabled=train_config["amp"],
+                                  dtype=torch.float16):
+                    logits, _ = model(input_ids[x * bs:(x + 1) * bs, :].to(gpu),
+                                      hypernetwork=hypernetwork, act_ck=True)
+                    logits = logits.view(-1, logits.shape[-1])
+                    gas_labels = labels[x * bs:(x + 1) * bs, :].contiguous()
+                    gas_labels = gas_labels.view(-1)
+                    gas_labels[gas_labels == 50256] = -100
+                    gas_loss = F.cross_entropy(logits, gas_labels)
+                if train_config["loss_scale"]:
+                    scaler.scale(gas_loss).backward()
+                else:
+                    gas_loss.backward()
+                loss += gas_loss.item()
+            loss = loss / gas
+            if train_config["loss_scale"]:
+                scaler.unscale_(opt.optimizer)
+            torch.nn.utils.clip_grad_norm_(hypernetwork.parameters(), 1)
+            if train_config["loss_scale"]:
+                opt.step(scaler=scaler)
+            else:
+                opt.step()
+            if train_config["loss_scale"]:
+                scaler.update()
+            opt.zero_grad()
+            sec_per_step = (time.perf_counter() - timex)
+            step_per_sec = (1. / sec_per_step)
+            tokens_per_sec = (step_per_sec * train_config["context_size"]) * \
+                bs * gas
+            t.set_description(f"{step_per_sec:.2f} steps/s, "
+                              f"{sec_per_step:.2f}s/step, "
+                              f"{tokens_per_sec:.2f}tokens/s, "
+                              f"loss={loss:.4f}")
+            wandb.log({
+                "train/epoch": float(curr_step) / float(epoch_steps),
+                "train/loss": loss,
+                "train/tokens_per_sec": tokens_per_sec,
+                "train/sec_per_step": sec_per_step,
+                "train/step_per_sec": step_per_sec,
+                "train/lr": opt.curr_lr,
+                "train/loss_scale": scaler.get_scale()
+            }, step=curr_step)
+            if train_config["do_save"] and \
+                    curr_step % train_config["save_every"] == 0 and \
+                    curr_step != 0:
+                hypernetwork_saver(f"step_{curr_step}")
+                print(f"\nSaved model at step {curr_step}")
+            if curr_step % train_config["eval_every"] == 0:
+                eval_fn(curr_step)
+            curr_step += 1
+            t.update(1)

 eval_fn(curr_step)
 hypernetwork_saver("final")
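The one-line behavioural change buried in the rewrite is `gas_labels[gas_labels == 50256] = -100`: token id 50256 is `<|endoftext|>` in the GPT-2/GPT-J vocabulary, and -100 is the default `ignore_index` of `F.cross_entropy`, so positions holding the document separator now contribute nothing to the loss or gradients. A self-contained sketch of the mechanism with toy logits (sizes are illustrative, not the model's):

    import torch
    import torch.nn.functional as F

    EOT = 50256  # <|endoftext|> in the GPT-2/GPT-J BPE vocabulary

    logits = torch.randn(4, 50257)            # 4 target positions, full vocab
    labels = torch.tensor([11, EOT, 42, EOT])

    masked = labels.clone()
    masked[masked == EOT] = -100              # cross_entropy's default ignore_index

    # The masked loss averages over only the two non-EOT positions.
    print(F.cross_entropy(logits, labels).item())
    print(F.cross_entropy(logits, masked).item())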