novelai-storage / Basedformer

Commit 4596b61e, authored Mar 27, 2022 by novelailab
Parent: c99ffa47

    hypertrain GRU and pass layer_ids

Showing 2 changed files with 67 additions and 13 deletions:
    hypertrain.py   +59  -6
    main.py          +8  -7

hypertrain.py
@@ -68,6 +68,34 @@ def discounted_cumsum(t, gamma):

 def shift(x, amt, dim=-1):
     return F.pad(x, (*((0, 0) * (-dim - 1)), amt, -amt), value=0.)

+class HyperNetworkGRU(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config["hidden_dim"]
+        self.linear1 = nn.Linear(embed_dim, embed_dim // 8)
+        self.gru = nn.GRU(embed_dim // 8, embed_dim // 8, num_layers=1, bidirectional=False, batch_first=True)
+        self.linear2 = nn.Linear(embed_dim // 8, embed_dim)
+        self.ln_1 = nn.LayerNorm(embed_dim // 8, eps=1e-5)
+        self.activation = gelu_new
+        for module in self.modules():
+            _init_weights(module)
+        for param in self.linear2.parameters():
+            param.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * config["n_layer"])))
+        for param in self.gru.parameters():
+            param.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * config["n_layer"])))
+
+    def forward(self, x):
+        x = x.float()
+        x = self.linear1(x)
+        x = self.gru(x)[0]
+        x = self.ln_1(x)
+        x = self.linear2(x)
+        x = ck(self.activation, x)
+        return x.bfloat16()
+
 class HyperNetwork(nn.Module):
     def __init__(self, config):
         super().__init__()
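The new HyperNetworkGRU is a bottlenecked per-layer adapter: hidden states are projected down by 8x, passed through a single-direction GRU over the sequence, layer-normalized, projected back to model width, and run through the activation. The following minimal sketch reproduces only the shape pipeline, with a placeholder hidden_dim and plain torch ops standing in for the repo's gelu_new, ck, and _init_weights helpers:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    hidden_dim = 768                              # placeholder; the repo reads config["hidden_dim"]
    down = nn.Linear(hidden_dim, hidden_dim // 8) # 8x bottleneck, as in linear1
    gru  = nn.GRU(hidden_dim // 8, hidden_dim // 8, num_layers=1, batch_first=True)
    norm = nn.LayerNorm(hidden_dim // 8, eps=1e-5)
    up   = nn.Linear(hidden_dim // 8, hidden_dim) # back to model width, as in linear2

    x = torch.randn(1, 16, hidden_dim)            # (batch, seq, hidden)
    out = F.gelu(up(norm(gru(down(x))[0])))       # same data flow as HyperNetworkGRU.forward
    print(out.shape)                              # torch.Size([1, 16, 768])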
@@ -89,13 +117,37 @@ class HyperNetwork(nn.Module):
     def forward(self, x):
         x = x.float()
-        x = shift_tokens(x, self.num_shifts)
+        #x = shift_tokens(x, self.num_shifts)
         x = self.linear(x)
         x = ck(self.activation, x)
         x = self.linear2(x)
         x = x.mul(torch.sigmoid(x))
         return x.bfloat16()

+class HyperNetworkSingle(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config["hidden_dim"]
+        self.linear = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.activation = gelu_new
+        #self.linear.weight.data.normal_(mean=0.0, std=0.02)
+        for module in self.modules():
+            _init_weights(module)
+        for param in self.linear.parameters():
+            param.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * config["n_layer"])))
+        #state = self.state_dict()
+        #for k in state:
+        #    state[k] = state[k] * 1 / math.sqrt(2 * config["n_layer"])
+        #self.load_state_dict(state)
+
+    def forward(self, x):
+        x = x.float()
+        #x = shift_tokens(x, self.num_shifts)
+        x = self.linear(x)
+        x = x.mul(torch.sigmoid(x))
+        return x.bfloat16()

 model_config = {
     "n_layer": 12,
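The x.mul(torch.sigmoid(x)) gate used in both forward passes is the SiLU/swish activation, x * sigmoid(x), which torch also exposes directly; a quick equivalence check:

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 8)
    # x * sigmoid(x) is exactly what F.silu computes
    assert torch.allclose(x.mul(torch.sigmoid(x)), F.silu(x), atol=1e-6)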
@@ -123,9 +175,9 @@ train_config = {
     #"data_path": "/home/xuser/diffusionstorage/datasets/OWT2-gpt2-full.map",
     #"data_path": "/home/xuser/diffusionstorage/datasets/sigurd/map/sigurd_v5_fs_2049.map",
     "save_path": "/home/xuser/diffusionstorage/workspace/kuru/basedformer/models/fixedj",
-    "run_name": "bighyper-gpt-j-enwik9-6b-postln-bf16-1e-4",
-    "lr": 1e-4,
-    "end_lr": 1e-4,
+    "run_name": "gpt-j-enwik9-6b-postln-bf16-5e-4",
+    "lr": 5e-4,
+    "end_lr": 5e-4,
     "warmup_steps": 50,
     "bs": 1,
     "gas": 16,
@@ -149,7 +201,8 @@ for name, p in model.named_parameters():
     if ("ln" in name or "vocab_embed" in name):
         p.requires_grad = True

-hypernetwork = HyperNetwork(model_config).cuda().float()
+#hypernetwork = HyperNetwork(model_config).cuda().float()
+hypernetwork = nn.ModuleList([HyperNetwork(model_config).cuda().float() for _ in range(model_config["n_layer"] // 5)])

 for param in hypernetwork.parameters():
     param.requires_grad = True
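With this change the script builds one HyperNetwork per block of five transformer layers instead of a single shared module, so the number of copies follows directly from n_layer. A small sketch of the sizing under the model_config above (n_layer = 12), with nn.Identity standing in for the real HyperNetwork class:

    import torch.nn as nn

    n_layer = 12                                      # from model_config above
    hypernetwork = nn.ModuleList(
        [nn.Identity() for _ in range(n_layer // 5)]  # placeholder modules, same count
    )
    print(len(hypernetwork))                          # 2 copies for a 12-layer model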
@@ -201,7 +254,7 @@ for input_ids, labels in t:
     opt.zero_grad()
     sec_per_step = (time.perf_counter() - timex) / (bs * gas)
     step_per_sec = (1. / sec_per_step)
-    tokens_per_sec = step_per_sec * 1024
+    tokens_per_sec = step_per_sec * 2048
     t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step, {tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
     wandb.log({"train/loss": loss, "train/tokens_per_sec": tokens_per_sec, "train/sec_per_step": sec_per_step, "train/step_per_sec": step_per_sec, "train/lr": opt.curr_lr, "train/loss_scale": scaler.get_scale()})
     curr_step += 1
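The throughput figure now assumes a 2048-token context instead of 1024: elapsed time is divided by bs * gas micro-batches, and tokens per second is micro-steps per second times the sequence length. A worked example with an illustrative timing in place of a real optimizer step:

    import time

    bs, gas, seq_len = 1, 16, 2048       # batch size, gradient accumulation steps, context length
    timex = time.perf_counter()
    time.sleep(0.1)                      # stand-in for one full step over bs * gas micro-batches
    sec_per_step = (time.perf_counter() - timex) / (bs * gas)
    step_per_sec = 1. / sec_per_step
    tokens_per_sec = step_per_sec * seq_len
    print(f"{step_per_sec:.2f} micro-steps/s, {tokens_per_sec:.0f} tokens/s")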
main.py
@@ -225,8 +225,9 @@ class GPTLayer(nn.Module):
         self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
         self.ff = ff(dim=hidden_dim, hidden_dim=hidden_dim*4, activation=activation, device=device, dtype=dtype)
         self.attn = attn(hidden_dim=hidden_dim, n_head=n_head, device=device, dtype=dtype)
         self.tick = True

-    def forward(self, x, hypernetwork=None, act_ck=False):
+    def forward(self, x, layer_id=None, hypernetwork=None, act_ck=False):
         residual = x

         if act_ck:
@@ -237,14 +238,14 @@ class GPTLayer(nn.Module):
         x = self.ln_preattn(x)
         attn_out = self.attn(x)

-        if hypernetwork:
-            hyper_out = hypernetwork(x)
+        if hypernetwork and layer_id % 5 == 0:
+            hyper_out = hypernetwork[(layer_id // 5) - 1](x)

         ff_out = self.ff(x, act_ck)
         #order of addition matters, i had no idea... fixed a bug here.
         x = attn_out + ff_out + residual
         #x = residual + attn_out + ff_out -> doesn't match.

-        if hypernetwork:
+        if hypernetwork and layer_id % 5 == 0:
             x = x + hyper_out

         return x
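With the ModuleList sized as model_config["n_layer"] // 5 in hypertrain.py, the layer_id % 5 == 0 gate means only every fifth layer consults a hypernetwork, and (layer_id // 5) - 1 selects which copy. A quick sketch of the resulting mapping for a 12-layer model; note that layer 0 also satisfies the gate and its index of -1 wraps to the last copy:

    n_layer = 12
    n_hyper = n_layer // 5                  # 2 hypernetwork copies
    for layer_id in range(n_layer):
        if layer_id % 5 == 0:
            idx = (layer_id // 5) - 1       # -1 selects the last copy for layer 0
            print(f"layer {layer_id:2d} -> hypernetwork[{idx}]")
    # layer  0 -> hypernetwork[-1]
    # layer  5 -> hypernetwork[0]
    # layer 10 -> hypernetwork[1]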
@@ -284,8 +285,8 @@ class GPTModel(nn.Module):
     def get_embeds(self, x, hypernetwork=None, act_ck=False):
         x = self.vocab_embed(x)
-        for layer in self.layers:
-            x = layer(x, hypernetwork, act_ck)
+        for layer_id, layer in enumerate(self.layers):
+            x = layer(x, layer_id, hypernetwork, act_ck)
         x = self.ln_final(x)
         return x