Epoch support, and mask `<|endoftext|>`

55a00bbf · Wes Brown · eebb1fa8 · 55a00bbf
Commit 55a00bbf authored Jul 17, 2022 by Wes Brown
Hide whitespace changes
Inline Side-by-side

Showing with 77 additions and 65 deletions

hypertrain.py hypertrain.py +77 -65

No files found.
--- a/hypertrain.py
+++ b/hypertrain.py
@@ -28,7 +28,8 @@ prompts = ["<|endoftext|>",
           "The mercurial and beautiful",
           "<|endoftext|>[ Author:",
           "<|endoftext|>[ Genre:",
-           "***"]
+           "***",
+           "----"]


 def _init_weights(module):
@@ -285,6 +286,7 @@ parser.add_argument("--logs", type=str, help="log directory location",
 parser.add_argument("--masked", type=bool, help="masked softmax fusion")
 parser.add_argument("--sample_vanilla", type=bool, help="sample vanilla model")
 parser.add_argument("--shuffle", type=bool, help="shuffle dataset contexts")
+parser.add_argument("--epochs", type=int, default=1, help="number of epochs to train for")
 parser.set_defaults(loss_scale=False, amp=False, no_resume=False, masked=False,
                    sample_vanilla=False, shuffle=False)
 args = parser.parse_args()
@@ -312,6 +314,7 @@ train_config = {
    "context_size": args.context_size,
    "sample_vanilla": args.sample_vanilla,
    "shuffle": args.shuffle,
+    "epochs": args.epochs,
 }
 torch.manual_seed(train_config["seed"])
 bs = train_config["bs"]
@@ -368,70 +371,79 @@ if last_cp:
 else:
    curr_step = 0

-t = tqdm(train_loader, initial=curr_step)
-
-for input_ids, labels in t:
-    timex = time.perf_counter()
-    input_ids = input_ids.to(gpu)
-    labels = labels.to(gpu)
-    loss = 0
-    for x in range(train_config["gas"]):
-        with amp.autocast(enabled=train_config["amp"],
-                          dtype=torch.float16):
-            logits, _ = model(input_ids[x * bs:(x + 1) * bs, :].to(gpu),
-                              hypernetwork=hypernetwork,
-                              act_ck=True)
-            logits = logits.view(-1, logits.shape[-1])
-            gas_labels = labels[x * bs:(x + 1) * bs, :].contiguous()
-            gas_labels = gas_labels.view(-1)
-            gas_loss = F.cross_entropy(logits, gas_labels)
-
-        if train_config["loss_scale"]:
-            scaler.scale(gas_loss).backward()
-        else:
-            gas_loss.backward()
-
-        loss += gas_loss.item()
-
-    loss = loss / gas
-    if train_config["loss_scale"]:
-        scaler.unscale_(opt.optimizer)
-    torch.nn.utils.clip_grad_norm_(hypernetwork.parameters(), 1)
-    if train_config["loss_scale"]:
-        opt.step(scaler=scaler)
-    else:
-        opt.step()
-
-    if train_config["loss_scale"]:
-        scaler.update()
-
-    opt.zero_grad()
-    sec_per_step = (time.perf_counter() - timex)
-    step_per_sec = (1. / sec_per_step)
-    tokens_per_sec = (step_per_sec * train_config["context_size"]) * bs * gas
-    t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step,"
-                      + f"{tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
-    wandb.log(
-        {
-            "train/loss": loss,
-            "train/tokens_per_sec": tokens_per_sec,
-            "train/sec_per_step": sec_per_step,
-            "train/step_per_sec": step_per_sec,
-            "train/lr": opt.curr_lr,
-            "train/loss_scale": scaler.get_scale()
-        },
-        step=curr_step)
-
-    if train_config["do_save"] and \
-            curr_step % train_config["save_every"] == 0 and \
-            curr_step != 0:
-        hypernetwork_saver(f"step_{curr_step}")
-        print(f"\nSaved model at step {curr_step}")
-
-    if curr_step % train_config["eval_every"] == 0:
-        eval_fn(curr_step)
-
-    curr_step += 1
+epoch_steps = len(train_loader)
+total_steps = epoch_steps * train_config['epochs']
+
+with tqdm(total=total_steps, initial=curr_step) as t:
+    for epoch in range(train_config['epochs']):
+        for input_ids, labels in train_loader:
+            timex = time.perf_counter()
+            input_ids = input_ids.to(gpu)
+            labels = labels.to(gpu)
+            loss = 0
+            for x in range(train_config["gas"]):
+                with amp.autocast(enabled=train_config["amp"],
+                                  dtype=torch.float16):
+                    logits, _ = model(input_ids[x * bs:(x + 1) * bs, :].to(gpu),
+                                      hypernetwork=hypernetwork,
+                                      act_ck=True)
+                    logits = logits.view(-1, logits.shape[-1])
+                    gas_labels = labels[x * bs:(x + 1) * bs, :].contiguous()
+                    gas_labels = gas_labels.view(-1)
+                    gas_labels[gas_labels == 50256] = -100
+                    gas_loss = F.cross_entropy(logits, gas_labels)
+
+                if train_config["loss_scale"]:
+                    scaler.scale(gas_loss).backward()
+                else:
+                    gas_loss.backward()
+
+                loss += gas_loss.item()
+
+            loss = loss / gas
+            if train_config["loss_scale"]:
+                scaler.unscale_(opt.optimizer)
+            torch.nn.utils.clip_grad_norm_(hypernetwork.parameters(), 1)
+            if train_config["loss_scale"]:
+                opt.step(scaler=scaler)
+            else:
+                opt.step()
+
+            if train_config["loss_scale"]:
+                scaler.update()
+
+            opt.zero_grad()
+            sec_per_step = (time.perf_counter() - timex)
+            step_per_sec = (1. / sec_per_step)
+            tokens_per_sec = (step_per_sec * train_config["context_size"]) * \
+                             bs * gas
+            t.set_description(f"{step_per_sec:.2f} steps/s, "
+                              f"{sec_per_step:.2f}s/step, "
+                              f"{tokens_per_sec:.2f}tokens/s, "
+                              f"loss={loss:.4f}")
+            wandb.log(
+                {
+                    "train/epoch": float(curr_step) / float(epoch_steps),
+                    "train/loss": loss,
+                    "train/tokens_per_sec": tokens_per_sec,
+                    "train/sec_per_step": sec_per_step,
+                    "train/step_per_sec": step_per_sec,
+                    "train/lr": opt.curr_lr,
+                    "train/loss_scale": scaler.get_scale()
+                },
+                step=curr_step)
+
+            if train_config["do_save"] and \
+                    curr_step % train_config["save_every"] == 0 and \
+                    curr_step != 0:
+                hypernetwork_saver(f"step_{curr_step}")
+                print(f"\nSaved model at step {curr_step}")
+
+            if curr_step % train_config["eval_every"] == 0:
+                eval_fn(curr_step)
+
+            curr_step += 1
+            t.update(1)

 eval_fn(curr_step)
 hypernetwork_saver("final")