nanogpt slowrun
Created February 6, 2025 14:45
import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min

# -----------------------------------------------------------------------------
# Custom operators: FP8 matmul by @YouJiacheng
def log_expm1(x):
    return torch.where(x > 4.0, x, torch.where(x < 1.0, torch.log(torch.expm1(x)), x + torch.log1p(-torch.exp(-x))))

def relumax_score_mod(score, b, h, q_idx, kv_idx):
    return torch.log(F.relu(torch.expm1(score) + 1e-3))
    # Unreachable experimental variants left in the original file, preserved here as
    # comments (log_expm1 above is only used by the last of them):
    # return torch.log(F.relu(score) + 1e-3)
    # d = score.dtype
    # score = score.float()
    # return torch.where(score < 0.0001, torch.full_like(score, fill_value=-4.0), log_expm1(score)).to(d)
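# A reading of the active score_mod (annotation, not part of the record): flex_attention
# applies score_mod to the pre-softmax logits, and softmax then exponentiates them. Since
# exp(log(relu(expm1(s) + 1e-3))) == relu(exp(s) - 1 + 1e-3), this swaps the usual exp(s)
# attention kernel for a relu-thresholded one that assigns (near-)zero weight to weak scores.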
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        x_f8 = x.mul(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.mul(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.t(),
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(1 / x_s, dtype=torch.float32),
            scale_b=x.new_tensor(1 / w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8
    return impl(x, w)

@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.t(), x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)

@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(1 / x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(1 / w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(1 / grad_s, dtype=torch.float32)
        grad_f8 = grad.mul(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.t().contiguous().t(),
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.t().contiguous(),
            grad_f8.t().contiguous().t(),
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).t()
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)

@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    return x_f8.to(torch.bfloat16), w_f8.to(torch.float32)

def backward(ctx, grad_out: Tensor, *_):
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    return grad_x, grad_w, None, None, None

def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)

mm_op.register_autograd(backward, setup_context=setup_context)
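# How the scales fit together (annotation, not part of the record): the forward pass
# computes (x * x_s -> e4m3) @ (w * w_s -> e4m3).T, and _scaled_mm multiplies the result
# by scale_a * scale_b = 1/(x_s * w_s), so it approximates x @ w.T up to fp8 rounding.
# The backward similarly scales the incoming gradient by grad_s before the e5m2 cast and
# divides it back out, spending the scale factors to keep values in fp8 dynamic range.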
# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
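# Sanity-check sketch (hypothetical, not executed during training):
#   G = torch.randn(512, 768, device="cuda")
#   X = zeropower_via_newtonschulz5(G, steps=5)
#   torch.linalg.svdvals(X.float())  # singular values land roughly in [0.5, 1.5], per the docstring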
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer should not be used for the embedding layer, the final fully connected layer,
      or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params: list[Tensor] = [*params]
        param_groups = []
        for size in {p.numel() for p in params}:
            b = torch.empty(world_size, size, dtype=torch.bfloat16, device="cuda")
            group = dict(params=[p for p in params if p.numel() == size],
                         update_buffer=b, update_buffer_views=[b[i] for i in range(world_size)])
            param_groups.append(group)
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            update_buffer: Tensor = group["update_buffer"]
            update_buffer_views: list[Tensor] = group["update_buffer_views"]
            # generate weight updates in distributed fashion
            params: list[Tensor] = group["params"]
            handle = None
            params_world = None
            def update_prev(): # optimized Muon implementation contributed by @YouJiacheng
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffer_views):
                    p_world.add_(g_world.view_as(p_world),
                                 alpha=-group["lr"] * max(1, p_world.size(-2) / p_world.size(-1))**0.5)
            for base_i in range(len(params))[::self.world_size]:
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf: Tensor = state["momentum_buffer"]
                    buf.lerp_(g, 1 - group["momentum"])
                    g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
                    g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]).flatten()
                else:
                    g = update_buffer_views[self.rank]
                if base_i > 0:
                    update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()
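# Usage sketch (mirrors the optimizer construction further down in this file):
#   hidden_matrix_params = [p for p in model.blocks.parameters() if p.ndim >= 2]
#   opt = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
# Each rank orthogonalizes one parameter out of every world_size in the group, then an
# async all_gather_into_tensor shares the flattened updates, so the communication for one
# batch of parameters overlaps with the Newton-Schulz compute for the next.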
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the model

def norm(x: Tensor):
    return F.rms_norm(x, (x.size(-1),))

class CastedLinear(nn.Linear):
    def __init__(self, in_features: int, out_features: int, use_fp8: bool = False, x_s: float = 1.0, w_s: float = 1.0, grad_s: float = 1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        self.x_s = x_s
        self.w_s = w_s
        self.grad_s = grad_s

    def reset_parameters(self) -> None:
        std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, x: Tensor):
        if self.use_fp8 and self.training:
            _x = x.flatten(0, -2)
            out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
            return out.reshape(*x.shape[:-1], -1)
        else:
            return F.linear(x, self.weight.type_as(x))

class Rotary(nn.Module):
    def __init__(self, dim: int, max_seq_len: int):
        super().__init__()
        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
        t = torch.arange(max_seq_len, dtype=torch.float32)
        theta = torch.einsum("i,j -> ij", t, angular_freq)
        self.cos = nn.Buffer(theta.cos(), persistent=False)
        self.sin = nn.Buffer(theta.sin(), persistent=False)

    def forward(self, x_BTHD: Tensor):
        assert self.cos.size(0) >= x_BTHD.size(-3)
        cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :]
        x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x_BTHD)
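# Note on the "half-truncate" above (annotation): only the first dim//4 frequency channels
# actually rotate (geometric from 1 down to 1/1024); the next dim//4 are zeroed, giving
# cos=1, sin=0, so half of each head's dimension pairs pass through RoPE unrotated.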
class CausalSelfAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12

    def forward(self, x: Tensor, ve: Tensor | None, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        if ve is not None:
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, score_mod=relumax_score_mod, scale=self.attn_scale).transpose(1, 2)
        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        hdim = 4 * dim
        self.c_fc = CastedLinear(dim, hdim)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x: Tensor):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = CausalSelfAttention(dim, num_heads, max_seq_len) if layer_idx != 7 else None
        self.mlp = MLP(dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, block_mask: BlockMask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, block_mask)
        x = x + self.mlp(norm(x))
        return x
# -----------------------------------------------------------------------------
# The main model

def next_multiple_of_n(v: float | int, *, n: int):
    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
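# Example (annotation): next_multiple_of_n(50257, n=128) == 50304, the padded vocab size
# used by lm_head below; next_multiple_of_n(1728, n=128) == 1792, the final attention
# window in the schedule near the end of this file.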
class GPT(nn.Module):
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128), use_fp8=True, x_s=2.0, w_s=2.0**9, grad_s=2.0**19)
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        self.skip_weights = nn.Parameter(torch.ones(num_layers//2))

    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        BLOCK_SIZE = 128
        docs = (input_seq == 50256).cumsum(0)
        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_blockmask: Tensor):
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)
        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapted from suggestion by @Grad62304977, following the Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)
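    # Annotation (a reading of the code above): blockmask_any marks 128x128 blocks with at
    # least one visible (causal & same-document) position, blockmask_all marks fully visible
    # blocks. FlexAttention treats the fully visible ones as "full" blocks that skip mask_mod
    # evaluation and the rest as "partial" blocks, with the sliding window size capping how
    # many KV blocks each query block attends to.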
    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        n = len(self.skip_weights)
        for i in range(len(self.blocks)):
            if i >= n:
                x = x + self.skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x)
        # @Grad62304977 added tanh softcapping following the Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        # i.e. 30*sigmoid(logits/7.5) == 15*tanh(logits/15) + 15, softly capping logits to the range (0, 30)
        logits = 30 * torch.sigmoid(logits.float() / 7.5)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq)
        return loss
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _load_data_shard(file: Path):
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) # header is 256 int32
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
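# Annotation: each .bin shard is a 256-int32 header (magic 20240520, version 1, token
# count) followed by the tokens themselves as uint16, which is why nbytes must equal
# 2 * num_tokens.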
def distributed_data_generator(filename_pattern: str, batch_size: int, rank : int, world_size : int):
    files = sorted(Path.cwd().glob(filename_pattern))
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        if pos + batch_size + 1 >= len(tokens):
            tokens, pos = _load_data_shard(next(file_iter)), 0
        buf = tokens[pos + rank * local_batch_size:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets
# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data
    train_files = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # optimization
    num_iterations = 1770 # number of iterations to run
    cooldown_frac = 0.4 # fraction of training spent cooling down the learning rate
    # architecture
    vocab_size = 50257
    # evaluation and logging
    val_loss_every = 125 # every how many steps to evaluate val loss? 0 for only at the end
    # implementation
    train_seq_len = 48*1024 # FlexAttention sequence length
    val_seq_len = 4*64*1024 # FlexAttention sequence length for validation
    save_checkpoint = False
args = Hyperparameters()

# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert world_size == 8 # this code is designed for 8xH100
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=False):
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    import subprocess # avoid top level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
print0(nvidia_smi())
print0("="*100)
########################################
#    Construct model and optimizer     #
########################################

model: nn.Module = GPT(vocab_size=args.vocab_size, num_layers=12, num_heads=6, model_dim=768,
                       max_seq_len=max(args.train_seq_len, args.val_seq_len)).cuda()
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
for param in model.parameters():
    dist.broadcast(param.detach(), 0)

# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]

# init the optimizer(s)
adam_params = [dict(params=head_params, lr=0.008), dict(params=embed_params, lr=0.6), dict(params=scalar_params, lr=0.04)]
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), eps=1e-10, fused=True)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
optimizers = [optimizer1, optimizer2]
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]
# learning rate schedule: stable then decay
def get_lr(step: int):
    x = step / args.num_iterations # progress in training
    assert 0 <= x < 1
    if x < 1 - args.cooldown_frac:
        return 1.0
    else:
        w = (1 - x) / args.cooldown_frac
        return w * 1.0 + (1 - w) * 0.1
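# Example (annotation): with num_iterations=1770 and cooldown_frac=0.4, the multiplier
# stays at 1.0 through step 1061 (x < 0.6), then decays linearly toward 0.1 by step 1770.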
# attention window size schedule: linearly increase
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    x = step / args.num_iterations # progress in training
    assert 0 <= x <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * x, n=128)
    return get_window_size_blocks_helper(window_size)
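# Example (annotation): at step 0 this yields next_multiple_of_n(0, n=128) == 128 tokens
# (1 block); at the final step, next_multiple_of_n(1728, n=128) == 1792 tokens (14 blocks).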
model: nn.Module = torch.compile(model, dynamic=False)

########################################
#            Warmup kernels            #
########################################

# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
for _ in range(warmup_steps):
    inputs = targets = torch.randint(0, args.vocab_size, size=(args.train_seq_len,), device="cuda")
    model(inputs.to(torch.int32), targets, get_window_size_blocks(0)).backward()
    for param in model.parameters():
        dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
model.load_state_dict(initial_state["model"])
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del initial_state
########################################
#        Training and validation       #
########################################

train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, rank, world_size)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_batch_size = world_size * args.val_seq_len
        assert args.val_tokens % val_batch_size == 0
        val_steps = args.val_tokens // val_batch_size
        val_loader = distributed_data_generator(args.val_files, val_batch_size, rank, world_size)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets = next(val_loader)
                val_loss += model(inputs, targets, get_window_size_blocks(step))
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break
    # --------------- TRAINING SECTION -----------------
    inputs, targets = next(train_loader)
    model(inputs, targets, get_window_size_blocks(step)).backward()
    for param in model.parameters():
        dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    # set optimization hyperparameters
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)
    for group in optimizer2.param_groups:
        frac = min(step / 300, 1) # momentum warmup for muon
        group["momentum"] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers
    for opt in optimizers:
        opt.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # logging
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)

print0(
    f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
    f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB",
    console=True,
)
dist.destroy_process_group()
====================================================================================================
Running Python 3.12.7 (main, Feb 4 2025, 17:47:37) [GCC 13.2.0]
Running PyTorch 2.7.0.dev20250125+cu126 compiled for CUDA 12.6
Thu Feb 6 00:16:22 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:8D:00.0 Off | 0 |
| N/A 28C P0 114W / 700W | 7746MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H100 80GB HBM3 On | 00000000:91:00.0 Off | 0 |
| N/A 26C P0 122W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H100 80GB HBM3 On | 00000000:95:00.0 Off | 0 |
| N/A 28C P0 116W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H100 80GB HBM3 On | 00000000:99:00.0 Off | 0 |
| N/A 26C P0 120W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 4 NVIDIA H100 80GB HBM3 On | 00000000:AB:00.0 Off | 0 |
| N/A 28C P0 122W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 5 NVIDIA H100 80GB HBM3 On | 00000000:AF:00.0 Off | 0 |
| N/A 26C P0 113W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 6 NVIDIA H100 80GB HBM3 On | 00000000:B3:00.0 Off | 0 |
| N/A 28C P0 119W / 700W | 3456MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 7 NVIDIA H100 80GB HBM3 On | 00000000:B7:00.0 Off | 0 |
| N/A 25C P0 116W / 700W | 3216MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
====================================================================================================
step:0/1770 val_loss:10.8258 train_time:0ms step_avg:0.02ms
step:1/1770 train_time:70ms step_avg:70.20ms
step:2/1770 train_time:155ms step_avg:77.58ms
step:3/1770 train_time:249ms step_avg:83.00ms
step:4/1770 train_time:348ms step_avg:87.12ms
step:5/1770 train_time:448ms step_avg:89.66ms
step:6/1770 train_time:548ms step_avg:91.34ms
step:7/1770 train_time:648ms step_avg:92.58ms
step:8/1770 train_time:748ms step_avg:93.47ms
step:9/1770 train_time:847ms step_avg:94.15ms
step:10/1770 train_time:948ms step_avg:94.77ms
step:11/1770 train_time:1047ms step_avg:95.23ms
step:12/1770 train_time:1149ms step_avg:95.78ms
step:13/1770 train_time:1251ms step_avg:96.25ms
step:14/1770 train_time:1352ms step_avg:96.54ms
step:15/1770 train_time:1452ms step_avg:96.82ms
step:16/1770 train_time:1553ms step_avg:97.03ms
step:17/1770 train_time:1653ms step_avg:97.23ms
step:18/1770 train_time:1754ms step_avg:97.43ms
step:19/1770 train_time:1855ms step_avg:97.61ms
step:20/1770 train_time:1956ms step_avg:97.79ms
step:21/1770 train_time:2056ms step_avg:97.89ms
step:22/1770 train_time:2156ms step_avg:98.01ms
step:23/1770 train_time:2257ms step_avg:98.12ms
step:24/1770 train_time:2357ms step_avg:98.20ms
step:25/1770 train_time:2458ms step_avg:98.30ms
step:26/1770 train_time:2558ms step_avg:98.40ms
step:27/1770 train_time:2659ms step_avg:98.47ms
step:28/1770 train_time:2759ms step_avg:98.53ms
step:29/1770 train_time:2860ms step_avg:98.62ms
step:30/1770 train_time:2961ms step_avg:98.69ms
step:31/1770 train_time:3061ms step_avg:98.76ms
step:32/1770 train_time:3161ms step_avg:98.79ms
step:33/1770 train_time:3261ms step_avg:98.83ms
step:34/1770 train_time:3362ms step_avg:98.88ms
step:35/1770 train_time:3462ms step_avg:98.91ms
step:36/1770 train_time:3562ms step_avg:98.93ms
step:37/1770 train_time:3661ms step_avg:98.95ms
step:38/1770 train_time:3761ms step_avg:98.98ms
step:39/1770 train_time:3861ms step_avg:99.00ms
step:40/1770 train_time:3961ms step_avg:99.02ms
step:41/1770 train_time:4061ms step_avg:99.04ms
step:42/1770 train_time:4161ms step_avg:99.07ms
step:43/1770 train_time:4261ms step_avg:99.09ms
step:44/1770 train_time:4361ms step_avg:99.10ms
step:45/1770 train_time:4460ms step_avg:99.12ms
step:46/1770 train_time:4560ms step_avg:99.13ms
step:47/1770 train_time:4661ms step_avg:99.16ms
step:48/1770 train_time:4761ms step_avg:99.18ms
step:49/1770 train_time:4861ms step_avg:99.20ms
step:50/1770 train_time:4961ms step_avg:99.22ms
step:51/1770 train_time:5061ms step_avg:99.24ms
step:52/1770 train_time:5161ms step_avg:99.25ms
step:53/1770 train_time:5261ms step_avg:99.27ms
step:54/1770 train_time:5361ms step_avg:99.28ms
step:55/1770 train_time:5461ms step_avg:99.28ms
step:56/1770 train_time:5561ms step_avg:99.30ms
step:57/1770 train_time:5660ms step_avg:99.31ms
step:58/1770 train_time:5760ms step_avg:99.32ms
step:59/1770 train_time:5860ms step_avg:99.33ms
step:60/1770 train_time:5961ms step_avg:99.35ms
step:61/1770 train_time:6061ms step_avg:99.36ms
step:62/1770 train_time:6161ms step_avg:99.37ms
step:63/1770 train_time:6262ms step_avg:99.39ms
step:64/1770 train_time:6362ms step_avg:99.40ms
step:65/1770 train_time:6461ms step_avg:99.41ms
step:66/1770 train_time:6561ms step_avg:99.41ms
step:67/1770 train_time:6661ms step_avg:99.41ms
step:68/1770 train_time:6760ms step_avg:99.42ms
step:69/1770 train_time:6860ms step_avg:99.42ms
step:70/1770 train_time:6960ms step_avg:99.43ms
step:71/1770 train_time:7060ms step_avg:99.43ms
step:72/1770 train_time:7160ms step_avg:99.44ms
step:73/1770 train_time:7260ms step_avg:99.45ms
step:74/1770 train_time:7361ms step_avg:99.47ms
step:75/1770 train_time:7461ms step_avg:99.48ms
step:76/1770 train_time:7561ms step_avg:99.49ms
step:77/1770 train_time:7661ms step_avg:99.49ms
step:78/1770 train_time:7760ms step_avg:99.49ms
step:79/1770 train_time:7860ms step_avg:99.49ms
step:80/1770 train_time:7959ms step_avg:99.49ms
step:81/1770 train_time:8059ms step_avg:99.49ms
step:82/1770 train_time:8158ms step_avg:99.49ms
step:83/1770 train_time:8258ms step_avg:99.49ms
step:84/1770 train_time:8359ms step_avg:99.51ms
step:85/1770 train_time:8458ms step_avg:99.51ms
step:86/1770 train_time:8559ms step_avg:99.52ms
step:87/1770 train_time:8659ms step_avg:99.53ms
step:88/1770 train_time:8760ms step_avg:99.54ms
step:89/1770 train_time:8860ms step_avg:99.55ms
step:90/1770 train_time:8961ms step_avg:99.57ms
step:91/1770 train_time:9062ms step_avg:99.58ms
step:92/1770 train_time:9162ms step_avg:99.59ms
step:93/1770 train_time:9262ms step_avg:99.59ms
step:94/1770 train_time:9362ms step_avg:99.59ms
step:95/1770 train_time:9462ms step_avg:99.60ms
step:96/1770 train_time:9562ms step_avg:99.60ms
step:97/1770 train_time:9662ms step_avg:99.60ms
step:98/1770 train_time:9761ms step_avg:99.61ms
step:99/1770 train_time:9861ms step_avg:99.61ms
step:100/1770 train_time:9961ms step_avg:99.61ms
step:101/1770 train_time:10062ms step_avg:99.62ms
step:102/1770 train_time:10162ms step_avg:99.63ms
step:103/1770 train_time:10262ms step_avg:99.63ms
step:104/1770 train_time:10362ms step_avg:99.63ms
step:105/1770 train_time:10462ms step_avg:99.63ms
step:106/1770 train_time:10561ms step_avg:99.63ms
step:107/1770 train_time:10661ms step_avg:99.63ms
step:108/1770 train_time:10760ms step_avg:99.63ms
step:109/1770 train_time:10860ms step_avg:99.63ms
step:110/1770 train_time:10959ms step_avg:99.63ms
step:111/1770 train_time:11060ms step_avg:99.64ms
step:112/1770 train_time:11160ms step_avg:99.65ms
step:113/1770 train_time:11261ms step_avg:99.65ms
step:114/1770 train_time:11361ms step_avg:99.66ms
step:115/1770 train_time:11461ms step_avg:99.66ms
step:116/1770 train_time:11561ms step_avg:99.66ms
step:117/1770 train_time:11662ms step_avg:99.67ms
step:118/1770 train_time:11762ms step_avg:99.68ms
step:119/1770 train_time:11862ms step_avg:99.68ms
step:120/1770 train_time:11962ms step_avg:99.68ms
step:121/1770 train_time:12062ms step_avg:99.68ms
step:122/1770 train_time:12162ms step_avg:99.69ms
step:123/1770 train_time:12262ms step_avg:99.69ms
step:124/1770 train_time:12362ms step_avg:99.70ms
step:125/1770 train_time:12462ms step_avg:99.70ms
step:125/1770 val_loss:4.6514 train_time:12561ms step_avg:100.49ms
step:126/1770 train_time:12579ms step_avg:99.83ms
step:127/1770 train_time:12668ms step_avg:99.75ms
step:128/1770 train_time:12772ms step_avg:99.78ms
step:129/1770 train_time:12874ms step_avg:99.80ms
step:130/1770 train_time:12973ms step_avg:99.80ms
step:131/1770 train_time:13074ms step_avg:99.80ms
step:132/1770 train_time:13173ms step_avg:99.80ms
step:133/1770 train_time:13273ms step_avg:99.80ms
step:134/1770 train_time:13375ms step_avg:99.81ms
step:135/1770 train_time:13478ms step_avg:99.84ms
step:136/1770 train_time:13581ms step_avg:99.86ms
step:137/1770 train_time:13684ms step_avg:99.88ms
step:138/1770 train_time:13786ms step_avg:99.90ms
step:139/1770 train_time:13888ms step_avg:99.91ms
step:140/1770 train_time:13990ms step_avg:99.93ms
step:141/1770 train_time:14092ms step_avg:99.94ms
step:142/1770 train_time:14194ms step_avg:99.96ms
step:143/1770 train_time:14297ms step_avg:99.98ms
step:144/1770 train_time:14398ms step_avg:99.99ms
step:145/1770 train_time:14500ms step_avg:100.00ms
step:146/1770 train_time:14602ms step_avg:100.01ms
step:147/1770 train_time:14703ms step_avg:100.02ms
step:148/1770 train_time:14805ms step_avg:100.03ms
step:149/1770 train_time:14907ms step_avg:100.05ms
step:150/1770 train_time:15009ms step_avg:100.06ms
step:151/1770 train_time:15112ms step_avg:100.08ms
step:152/1770 train_time:15214ms step_avg:100.09ms
step:153/1770 train_time:15316ms step_avg:100.10ms
step:154/1770 train_time:15418ms step_avg:100.12ms
step:155/1770 train_time:15520ms step_avg:100.13ms
step:156/1770 train_time:15622ms step_avg:100.14ms
step:157/1770 train_time:15724ms step_avg:100.15ms
step:158/1770 train_time:15826ms step_avg:100.16ms
step:159/1770 train_time:15928ms step_avg:100.18ms
step:160/1770 train_time:16030ms step_avg:100.19ms
step:161/1770 train_time:16132ms step_avg:100.20ms
step:162/1770 train_time:16234ms step_avg:100.21ms
step:163/1770 train_time:16336ms step_avg:100.22ms
step:164/1770 train_time:16438ms step_avg:100.23ms
step:165/1770 train_time:16541ms step_avg:100.25ms
step:166/1770 train_time:16641ms step_avg:100.25ms
step:167/1770 train_time:16743ms step_avg:100.26ms
step:168/1770 train_time:16845ms step_avg:100.27ms
step:169/1770 train_time:16947ms step_avg:100.28ms
step:170/1770 train_time:17048ms step_avg:100.29ms
step:171/1770 train_time:17151ms step_avg:100.30ms
step:172/1770 train_time:17253ms step_avg:100.31ms
step:173/1770 train_time:17355ms step_avg:100.32ms
step:174/1770 train_time:17457ms step_avg:100.33ms
step:175/1770 train_time:17559ms step_avg:100.34ms
step:176/1770 train_time:17661ms step_avg:100.35ms
step:177/1770 train_time:17763ms step_avg:100.36ms
step:178/1770 train_time:17865ms step_avg:100.36ms
step:179/1770 train_time:17967ms step_avg:100.38ms
step:180/1770 train_time:18069ms step_avg:100.39ms
step:181/1770 train_time:18171ms step_avg:100.39ms
step:182/1770 train_time:18274ms step_avg:100.40ms
step:183/1770 train_time:18376ms step_avg:100.41ms
step:184/1770 train_time:18479ms step_avg:100.43ms
step:185/1770 train_time:18581ms step_avg:100.44ms
step:186/1770 train_time:18683ms step_avg:100.44ms
step:187/1770 train_time:18784ms step_avg:100.45ms
step:188/1770 train_time:18886ms step_avg:100.46ms
step:189/1770 train_time:18988ms step_avg:100.47ms
step:190/1770 train_time:19091ms step_avg:100.48ms
step:191/1770 train_time:19193ms step_avg:100.49ms
step:192/1770 train_time:19295ms step_avg:100.49ms
step:193/1770 train_time:19397ms step_avg:100.50ms
step:194/1770 train_time:19499ms step_avg:100.51ms
step:195/1770 train_time:19601ms step_avg:100.52ms
step:196/1770 train_time:19703ms step_avg:100.52ms
step:197/1770 train_time:19804ms step_avg:100.53ms
step:198/1770 train_time:19907ms step_avg:100.54ms
step:199/1770 train_time:20008ms step_avg:100.54ms
step:200/1770 train_time:20110ms step_avg:100.55ms
step:201/1770 train_time:20213ms step_avg:100.56ms
step:202/1770 train_time:20315ms step_avg:100.57ms
step:203/1770 train_time:20417ms step_avg:100.57ms
step:204/1770 train_time:20518ms step_avg:100.58ms
step:205/1770 train_time:20621ms step_avg:100.59ms
step:206/1770 train_time:20723ms step_avg:100.60ms
step:207/1770 train_time:20824ms step_avg:100.60ms
step:208/1770 train_time:20926ms step_avg:100.61ms
step:209/1770 train_time:21028ms step_avg:100.61ms
step:210/1770 train_time:21131ms step_avg:100.62ms
step:211/1770 train_time:21234ms step_avg:100.63ms
step:212/1770 train_time:21335ms step_avg:100.64ms
step:213/1770 train_time:21437ms step_avg:100.64ms
step:214/1770 train_time:21539ms step_avg:100.65ms
step:215/1770 train_time:21641ms step_avg:100.66ms
step:216/1770 train_time:21743ms step_avg:100.66ms
step:217/1770 train_time:21845ms step_avg:100.67ms
step:218/1770 train_time:21947ms step_avg:100.67ms
step:219/1770 train_time:22049ms step_avg:100.68ms
step:220/1770 train_time:22151ms step_avg:100.69ms
step:221/1770 train_time:22253ms step_avg:100.69ms
step:222/1770 train_time:22355ms step_avg:100.70ms
step:223/1770 train_time:22457ms step_avg:100.71ms
step:224/1770 train_time:22560ms step_avg:100.71ms
step:225/1770 train_time:22662ms step_avg:100.72ms
step:226/1770 train_time:22763ms step_avg:100.72ms
step:227/1770 train_time:22865ms step_avg:100.73ms
step:228/1770 train_time:22967ms step_avg:100.73ms
step:229/1770 train_time:23070ms step_avg:100.74ms
step:230/1770 train_time:23172ms step_avg:100.75ms
step:231/1770 train_time:23274ms step_avg:100.75ms
step:232/1770 train_time:23376ms step_avg:100.76ms
step:233/1770 train_time:23479ms step_avg:100.77ms
step:234/1770 train_time:23580ms step_avg:100.77ms
step:235/1770 train_time:23683ms step_avg:100.78ms
step:236/1770 train_time:23784ms step_avg:100.78ms
step:237/1770 train_time:23886ms step_avg:100.79ms
step:238/1770 train_time:23989ms step_avg:100.79ms
step:239/1770 train_time:24091ms step_avg:100.80ms
step:240/1770 train_time:24193ms step_avg:100.81ms
step:241/1770 train_time:24295ms step_avg:100.81ms
step:242/1770 train_time:24398ms step_avg:100.82ms
step:243/1770 train_time:24500ms step_avg:100.82ms
step:244/1770 train_time:24602ms step_avg:100.83ms
step:245/1770 train_time:24704ms step_avg:100.83ms
step:246/1770 train_time:24805ms step_avg:100.83ms
step:247/1770 train_time:24907ms step_avg:100.84ms
step:248/1770 train_time:25009ms step_avg:100.84ms
step:249/1770 train_time:25111ms step_avg:100.85ms
step:250/1770 train_time:25214ms step_avg:100.85ms
step:250/1770 val_loss:4.1125 train_time:25314ms step_avg:101.26ms
step:251/1770 train_time:25333ms step_avg:100.93ms
step:252/1770 train_time:25424ms step_avg:100.89ms
step:253/1770 train_time:25529ms step_avg:100.91ms
step:254/1770 train_time:25631ms step_avg:100.91ms
step:255/1770 train_time:25733ms step_avg:100.91ms
step:256/1770 train_time:25835ms step_avg:100.92ms
step:257/1770 train_time:25938ms step_avg:100.92ms
step:258/1770 train_time:26039ms step_avg:100.93ms
step:259/1770 train_time:26141ms step_avg:100.93ms
step:260/1770 train_time:26244ms step_avg:100.94ms
step:261/1770 train_time:26347ms step_avg:100.94ms
step:262/1770 train_time:26449ms step_avg:100.95ms
step:263/1770 train_time:26550ms step_avg:100.95ms
step:264/1770 train_time:26652ms step_avg:100.95ms
step:265/1770 train_time:26756ms step_avg:100.97ms
step:266/1770 train_time:26860ms step_avg:100.98ms
step:267/1770 train_time:26963ms step_avg:100.99ms
step:268/1770 train_time:27067ms step_avg:101.00ms
step:269/1770 train_time:27170ms step_avg:101.01ms
step:270/1770 train_time:27274ms step_avg:101.01ms
step:271/1770 train_time:27377ms step_avg:101.02ms
step:272/1770 train_time:27481ms step_avg:101.03ms
step:273/1770 train_time:27585ms step_avg:101.04ms
step:274/1770 train_time:27689ms step_avg:101.05ms
step:275/1770 train_time:27792ms step_avg:101.06ms
step:276/1770 train_time:27895ms step_avg:101.07ms
step:277/1770 train_time:28000ms step_avg:101.08ms
step:278/1770 train_time:28104ms step_avg:101.09ms
step:279/1770 train_time:28209ms step_avg:101.11ms
step:280/1770 train_time:28313ms step_avg:101.12ms
step:281/1770 train_time:28417ms step_avg:101.13ms
step:282/1770 train_time:28520ms step_avg:101.14ms
step:283/1770 train_time:28624ms step_avg:101.15ms
step:284/1770 train_time:28728ms step_avg:101.15ms
step:285/1770 train_time:28831ms step_avg:101.16ms
step:286/1770 train_time:28934ms step_avg:101.17ms
step:287/1770 train_time:29039ms step_avg:101.18ms
step:288/1770 train_time:29143ms step_avg:101.19ms
step:289/1770 train_time:29247ms step_avg:101.20ms
step:290/1770 train_time:29351ms step_avg:101.21ms
step:291/1770 train_time:29455ms step_avg:101.22ms
step:292/1770 train_time:29558ms step_avg:101.23ms
step:293/1770 train_time:29662ms step_avg:101.23ms
step:294/1770 train_time:29766ms step_avg:101.24ms
step:295/1770 train_time:29869ms step_avg:101.25ms
step:296/1770 train_time:29972ms step_avg:101.26ms
step:297/1770 train_time:30076ms step_avg:101.27ms
step:298/1770 train_time:30180ms step_avg:101.28ms
step:299/1770 train_time:30283ms step_avg:101.28ms
step:300/1770 train_time:30388ms step_avg:101.29ms
step:301/1770 train_time:30492ms step_avg:101.30ms
step:302/1770 train_time:30595ms step_avg:101.31ms
step:303/1770 train_time:30699ms step_avg:101.32ms
step:304/1770 train_time:30802ms step_avg:101.32ms
step:305/1770 train_time:30907ms step_avg:101.34ms
step:306/1770 train_time:31011ms step_avg:101.34ms
step:307/1770 train_time:31114ms step_avg:101.35ms
step:308/1770 train_time:31218ms step_avg:101.36ms
step:309/1770 train_time:31322ms step_avg:101.36ms
step:310/1770 train_time:31426ms step_avg:101.37ms
step:311/1770 train_time:31530ms step_avg:101.38ms
step:312/1770 train_time:31633ms step_avg:101.39ms
step:313/1770 train_time:31736ms step_avg:101.39ms
step:314/1770 train_time:31840ms step_avg:101.40ms
step:315/1770 train_time:31944ms step_avg:101.41ms
step:316/1770 train_time:32047ms step_avg:101.41ms
step:317/1770 train_time:32151ms step_avg:101.42ms
step:318/1770 train_time:32254ms step_avg:101.43ms
step:319/1770 train_time:32358ms step_avg:101.43ms
step:320/1770 train_time:32461ms step_avg:101.44ms
step:321/1770 train_time:32565ms step_avg:101.45ms
step:322/1770 train_time:32669ms step_avg:101.46ms
step:323/1770 train_time:32772ms step_avg:101.46ms
step:324/1770 train_time:32875ms step_avg:101.47ms
step:325/1770 train_time:32979ms step_avg:101.48ms
step:326/1770 train_time:33083ms step_avg:101.48ms
step:327/1770 train_time:33187ms step_avg:101.49ms
step:328/1770 train_time:33291ms step_avg:101.50ms
step:329/1770 train_time:33394ms step_avg:101.50ms
step:330/1770 train_time:33497ms step_avg:101.51ms
step:331/1770 train_time:33602ms step_avg:101.52ms
step:332/1770 train_time:33705ms step_avg:101.52ms
step:333/1770 train_time:33809ms step_avg:101.53ms
step:334/1770 train_time:33912ms step_avg:101.53ms
step:335/1770 train_time:34016ms step_avg:101.54ms
step:336/1770 train_time:34120ms step_avg:101.55ms
step:337/1770 train_time:34224ms step_avg:101.55ms
step:338/1770 train_time:34328ms step_avg:101.56ms
step:339/1770 train_time:34432ms step_avg:101.57ms
step:340/1770 train_time:34535ms step_avg:101.57ms
step:341/1770 train_time:34639ms step_avg:101.58ms
step:342/1770 train_time:34743ms step_avg:101.59ms
step:343/1770 train_time:34847ms step_avg:101.59ms
step:344/1770 train_time:34950ms step_avg:101.60ms
step:345/1770 train_time:35054ms step_avg:101.61ms
step:346/1770 train_time:35157ms step_avg:101.61ms
step:347/1770 train_time:35262ms step_avg:101.62ms
step:348/1770 train_time:35365ms step_avg:101.62ms
step:349/1770 train_time:35469ms step_avg:101.63ms
step:350/1770 train_time:35572ms step_avg:101.64ms
step:351/1770 train_time:35676ms step_avg:101.64ms
step:352/1770 train_time:35780ms step_avg:101.65ms
step:353/1770 train_time:35884ms step_avg:101.65ms
step:354/1770 train_time:35988ms step_avg:101.66ms
step:355/1770 train_time:36091ms step_avg:101.67ms
step:356/1770 train_time:36195ms step_avg:101.67ms
step:357/1770 train_time:36298ms step_avg:101.68ms
step:358/1770 train_time:36402ms step_avg:101.68ms
step:359/1770 train_time:36506ms step_avg:101.69ms
step:360/1770 train_time:36610ms step_avg:101.69ms
step:361/1770 train_time:36713ms step_avg:101.70ms
step:362/1770 train_time:36816ms step_avg:101.70ms
step:363/1770 train_time:36920ms step_avg:101.71ms
step:364/1770 train_time:37024ms step_avg:101.71ms
step:365/1770 train_time:37128ms step_avg:101.72ms
step:366/1770 train_time:37231ms step_avg:101.73ms
step:367/1770 train_time:37334ms step_avg:101.73ms
step:368/1770 train_time:37439ms step_avg:101.74ms
step:369/1770 train_time:37543ms step_avg:101.74ms
step:370/1770 train_time:37648ms step_avg:101.75ms
step:371/1770 train_time:37751ms step_avg:101.75ms
step:372/1770 train_time:37854ms step_avg:101.76ms
step:373/1770 train_time:37958ms step_avg:101.77ms
step:374/1770 train_time:38062ms step_avg:101.77ms
step:375/1770 train_time:38166ms step_avg:101.78ms
step:375/1770 val_loss:3.9060 train_time:38269ms step_avg:102.05ms
step:376/1770 train_time:38287ms step_avg:101.83ms
step:377/1770 train_time:38380ms step_avg:101.80ms
step:378/1770 train_time:38485ms step_avg:101.81ms
step:379/1770 train_time:38588ms step_avg:101.82ms
step:380/1770 train_time:38693ms step_avg:101.82ms
step:381/1770 train_time:38796ms step_avg:101.83ms
step:382/1770 train_time:38900ms step_avg:101.83ms
step:383/1770 train_time:39003ms step_avg:101.84ms
step:384/1770 train_time:39107ms step_avg:101.84ms
step:385/1770 train_time:39210ms step_avg:101.84ms
step:386/1770 train_time:39315ms step_avg:101.85ms
step:387/1770 train_time:39419ms step_avg:101.86ms
step:388/1770 train_time:39523ms step_avg:101.86ms
step:389/1770 train_time:39627ms step_avg:101.87ms
step:390/1770 train_time:39730ms step_avg:101.87ms
step:391/1770 train_time:39835ms step_avg:101.88ms
step:392/1770 train_time:39939ms step_avg:101.88ms
step:393/1770 train_time:40043ms step_avg:101.89ms
step:394/1770 train_time:40147ms step_avg:101.90ms
step:395/1770 train_time:40251ms step_avg:101.90ms
step:396/1770 train_time:40362ms step_avg:101.92ms
step:397/1770 train_time:40472ms step_avg:101.94ms
step:398/1770 train_time:40582ms step_avg:101.96ms
step:399/1770 train_time:40693ms step_avg:101.99ms
step:400/1770 train_time:40803ms step_avg:102.01ms
step:401/1770 train_time:40913ms step_avg:102.03ms
step:402/1770 train_time:41024ms step_avg:102.05ms
step:403/1770 train_time:41134ms step_avg:102.07ms
step:404/1770 train_time:41244ms step_avg:102.09ms
step:405/1770 train_time:41354ms step_avg:102.11ms
step:406/1770 train_time:41464ms step_avg:102.13ms
step:407/1770 train_time:41574ms step_avg:102.15ms
step:408/1770 train_time:41685ms step_avg:102.17ms
step:409/1770 train_time:41795ms step_avg:102.19ms
step:410/1770 train_time:41906ms step_avg:102.21ms
step:411/1770 train_time:42017ms step_avg:102.23ms
step:412/1770 train_time:42128ms step_avg:102.25ms
step:413/1770 train_time:42238ms step_avg:102.27ms
step:414/1770 train_time:42349ms step_avg:102.29ms
step:415/1770 train_time:42460ms step_avg:102.31ms
step:416/1770 train_time:42570ms step_avg:102.33ms
step:417/1770 train_time:42680ms step_avg:102.35ms
step:418/1770 train_time:42791ms step_avg:102.37ms
step:419/1770 train_time:42902ms step_avg:102.39ms
step:420/1770 train_time:43012ms step_avg:102.41ms
step:421/1770 train_time:43122ms step_avg:102.43ms
step:422/1770 train_time:43232ms step_avg:102.45ms
step:423/1770 train_time:43342ms step_avg:102.46ms
step:424/1770 train_time:43453ms step_avg:102.48ms
step:425/1770 train_time:43564ms step_avg:102.50ms
step:426/1770 train_time:43673ms step_avg:102.52ms
step:427/1770 train_time:43784ms step_avg:102.54ms
step:428/1770 train_time:43895ms step_avg:102.56ms
step:429/1770 train_time:44005ms step_avg:102.58ms
step:430/1770 train_time:44116ms step_avg:102.60ms
step:431/1770 train_time:44227ms step_avg:102.61ms
step:432/1770 train_time:44337ms step_avg:102.63ms
step:433/1770 train_time:44448ms step_avg:102.65ms
step:434/1770 train_time:44558ms step_avg:102.67ms
step:435/1770 train_time:44669ms step_avg:102.69ms
step:436/1770 train_time:44780ms step_avg:102.71ms
step:437/1770 train_time:44890ms step_avg:102.72ms
step:438/1770 train_time:45002ms step_avg:102.74ms
step:439/1770 train_time:45114ms step_avg:102.77ms
step:440/1770 train_time:45225ms step_avg:102.78ms
step:441/1770 train_time:45335ms step_avg:102.80ms
step:442/1770 train_time:45445ms step_avg:102.82ms
step:443/1770 train_time:45556ms step_avg:102.84ms
step:444/1770 train_time:45667ms step_avg:102.85ms
step:445/1770 train_time:45778ms step_avg:102.87ms
step:446/1770 train_time:45888ms step_avg:102.89ms
step:447/1770 train_time:45998ms step_avg:102.90ms
step:448/1770 train_time:46108ms step_avg:102.92ms
step:449/1770 train_time:46219ms step_avg:102.94ms
step:450/1770 train_time:46329ms step_avg:102.95ms
step:451/1770 train_time:46440ms step_avg:102.97ms
step:452/1770 train_time:46550ms step_avg:102.99ms
step:453/1770 train_time:46661ms step_avg:103.00ms
step:454/1770 train_time:46771ms step_avg:103.02ms
step:455/1770 train_time:46882ms step_avg:103.04ms
step:456/1770 train_time:46991ms step_avg:103.05ms
step:457/1770 train_time:47102ms step_avg:103.07ms
step:458/1770 train_time:47213ms step_avg:103.08ms
step:459/1770 train_time:47323ms step_avg:103.10ms
step:460/1770 train_time:47433ms step_avg:103.12ms
step:461/1770 train_time:47545ms step_avg:103.13ms
step:462/1770 train_time:47655ms step_avg:103.15ms
step:463/1770 train_time:47766ms step_avg:103.17ms
step:464/1770 train_time:47876ms step_avg:103.18ms
step:465/1770 train_time:47987ms step_avg:103.20ms
step:466/1770 train_time:48098ms step_avg:103.21ms
step:467/1770 train_time:48208ms step_avg:103.23ms
step:468/1770 train_time:48319ms step_avg:103.25ms
step:469/1770 train_time:48429ms step_avg:103.26ms
step:470/1770 train_time:48540ms step_avg:103.28ms
step:471/1770 train_time:48650ms step_avg:103.29ms
step:472/1770 train_time:48760ms step_avg:103.31ms
step:473/1770 train_time:48870ms step_avg:103.32ms
step:474/1770 train_time:48981ms step_avg:103.34ms
step:475/1770 train_time:49091ms step_avg:103.35ms
step:476/1770 train_time:49202ms step_avg:103.37ms
step:477/1770 train_time:49312ms step_avg:103.38ms
step:478/1770 train_time:49423ms step_avg:103.40ms
step:479/1770 train_time:49535ms step_avg:103.41ms
step:480/1770 train_time:49644ms step_avg:103.43ms
step:481/1770 train_time:49755ms step_avg:103.44ms
step:482/1770 train_time:49865ms step_avg:103.45ms
step:483/1770 train_time:49976ms step_avg:103.47ms
step:484/1770 train_time:50087ms step_avg:103.48ms
step:485/1770 train_time:50198ms step_avg:103.50ms
step:486/1770 train_time:50308ms step_avg:103.51ms
step:487/1770 train_time:50420ms step_avg:103.53ms
step:488/1770 train_time:50530ms step_avg:103.55ms
step:489/1770 train_time:50642ms step_avg:103.56ms
step:490/1770 train_time:50751ms step_avg:103.57ms
step:491/1770 train_time:50862ms step_avg:103.59ms
step:492/1770 train_time:50973ms step_avg:103.60ms
step:493/1770 train_time:51083ms step_avg:103.62ms
step:494/1770 train_time:51193ms step_avg:103.63ms
step:495/1770 train_time:51303ms step_avg:103.64ms
step:496/1770 train_time:51413ms step_avg:103.66ms
step:497/1770 train_time:51523ms step_avg:103.67ms
step:498/1770 train_time:51634ms step_avg:103.68ms
step:499/1770 train_time:51746ms step_avg:103.70ms
step:500/1770 train_time:51855ms step_avg:103.71ms
step:500/1770 val_loss:3.7512 train_time:51965ms step_avg:103.93ms
step:501/1770 train_time:51984ms step_avg:103.76ms | |
step:502/1770 train_time:52086ms step_avg:103.76ms | |
step:503/1770 train_time:52196ms step_avg:103.77ms | |
step:504/1770 train_time:52307ms step_avg:103.78ms | |
step:505/1770 train_time:52418ms step_avg:103.80ms | |
step:506/1770 train_time:52528ms step_avg:103.81ms | |
step:507/1770 train_time:52638ms step_avg:103.82ms | |
step:508/1770 train_time:52748ms step_avg:103.83ms | |
step:509/1770 train_time:52858ms step_avg:103.85ms | |
step:510/1770 train_time:52968ms step_avg:103.86ms | |
step:511/1770 train_time:53079ms step_avg:103.87ms | |
step:512/1770 train_time:53189ms step_avg:103.88ms | |
step:513/1770 train_time:53299ms step_avg:103.90ms | |
step:514/1770 train_time:53409ms step_avg:103.91ms | |
step:515/1770 train_time:53520ms step_avg:103.92ms | |
step:516/1770 train_time:53630ms step_avg:103.93ms | |
step:517/1770 train_time:53740ms step_avg:103.95ms | |
step:518/1770 train_time:53851ms step_avg:103.96ms | |
step:519/1770 train_time:53961ms step_avg:103.97ms | |
step:520/1770 train_time:54071ms step_avg:103.98ms | |
step:521/1770 train_time:54181ms step_avg:103.99ms | |
step:522/1770 train_time:54292ms step_avg:104.01ms | |
step:523/1770 train_time:54402ms step_avg:104.02ms | |
step:524/1770 train_time:54512ms step_avg:104.03ms | |
step:525/1770 train_time:54623ms step_avg:104.04ms | |
step:526/1770 train_time:54733ms step_avg:104.05ms | |
step:527/1770 train_time:54844ms step_avg:104.07ms | |
step:528/1770 train_time:54956ms step_avg:104.08ms | |
step:529/1770 train_time:55067ms step_avg:104.10ms | |
step:530/1770 train_time:55180ms step_avg:104.11ms | |
step:531/1770 train_time:55292ms step_avg:104.13ms | |
step:532/1770 train_time:55404ms step_avg:104.14ms | |
step:533/1770 train_time:55515ms step_avg:104.15ms | |
step:534/1770 train_time:55626ms step_avg:104.17ms | |
step:535/1770 train_time:55738ms step_avg:104.18ms | |
step:536/1770 train_time:55850ms step_avg:104.20ms | |
step:537/1770 train_time:55962ms step_avg:104.21ms | |
step:538/1770 train_time:56073ms step_avg:104.22ms | |
step:539/1770 train_time:56185ms step_avg:104.24ms | |
step:540/1770 train_time:56296ms step_avg:104.25ms | |
step:541/1770 train_time:56408ms step_avg:104.27ms | |
step:542/1770 train_time:56520ms step_avg:104.28ms | |
step:543/1770 train_time:56632ms step_avg:104.29ms | |
step:544/1770 train_time:56743ms step_avg:104.31ms | |
step:545/1770 train_time:56855ms step_avg:104.32ms | |
step:546/1770 train_time:56966ms step_avg:104.33ms | |
step:547/1770 train_time:57078ms step_avg:104.35ms | |
step:548/1770 train_time:57190ms step_avg:104.36ms | |
step:549/1770 train_time:57302ms step_avg:104.37ms | |
step:550/1770 train_time:57414ms step_avg:104.39ms | |
step:551/1770 train_time:57524ms step_avg:104.40ms | |
step:552/1770 train_time:57635ms step_avg:104.41ms | |
step:553/1770 train_time:57746ms step_avg:104.42ms | |
step:554/1770 train_time:57858ms step_avg:104.44ms | |
step:555/1770 train_time:57970ms step_avg:104.45ms | |
step:556/1770 train_time:58082ms step_avg:104.46ms | |
step:557/1770 train_time:58193ms step_avg:104.48ms | |
step:558/1770 train_time:58305ms step_avg:104.49ms | |
step:559/1770 train_time:58416ms step_avg:104.50ms | |
step:560/1770 train_time:58528ms step_avg:104.51ms | |
step:561/1770 train_time:58640ms step_avg:104.53ms | |
step:562/1770 train_time:58751ms step_avg:104.54ms | |
step:563/1770 train_time:58863ms step_avg:104.55ms | |
step:564/1770 train_time:58974ms step_avg:104.56ms | |
step:565/1770 train_time:59086ms step_avg:104.58ms | |
step:566/1770 train_time:59197ms step_avg:104.59ms | |
step:567/1770 train_time:59309ms step_avg:104.60ms | |
step:568/1770 train_time:59422ms step_avg:104.62ms | |
step:569/1770 train_time:59533ms step_avg:104.63ms | |
step:570/1770 train_time:59645ms step_avg:104.64ms | |
step:571/1770 train_time:59756ms step_avg:104.65ms | |
step:572/1770 train_time:59868ms step_avg:104.66ms | |
step:573/1770 train_time:59980ms step_avg:104.68ms | |
step:574/1770 train_time:60091ms step_avg:104.69ms | |
step:575/1770 train_time:60203ms step_avg:104.70ms | |
step:576/1770 train_time:60314ms step_avg:104.71ms | |
step:577/1770 train_time:60427ms step_avg:104.73ms | |
step:578/1770 train_time:60538ms step_avg:104.74ms | |
step:579/1770 train_time:60650ms step_avg:104.75ms | |
step:580/1770 train_time:60761ms step_avg:104.76ms | |
step:581/1770 train_time:60873ms step_avg:104.77ms | |
step:582/1770 train_time:60984ms step_avg:104.78ms | |
step:583/1770 train_time:61096ms step_avg:104.80ms | |
step:584/1770 train_time:61207ms step_avg:104.81ms | |
step:585/1770 train_time:61319ms step_avg:104.82ms | |
step:586/1770 train_time:61431ms step_avg:104.83ms | |
step:587/1770 train_time:61543ms step_avg:104.84ms | |
step:588/1770 train_time:61654ms step_avg:104.85ms | |
step:589/1770 train_time:61765ms step_avg:104.86ms | |
step:590/1770 train_time:61877ms step_avg:104.88ms | |
step:591/1770 train_time:61989ms step_avg:104.89ms | |
step:592/1770 train_time:62101ms step_avg:104.90ms | |
step:593/1770 train_time:62212ms step_avg:104.91ms | |
step:594/1770 train_time:62324ms step_avg:104.92ms | |
step:595/1770 train_time:62437ms step_avg:104.94ms | |
step:596/1770 train_time:62548ms step_avg:104.95ms | |
step:597/1770 train_time:62660ms step_avg:104.96ms | |
step:598/1770 train_time:62771ms step_avg:104.97ms | |
step:599/1770 train_time:62883ms step_avg:104.98ms | |
step:600/1770 train_time:62995ms step_avg:104.99ms | |
step:601/1770 train_time:63107ms step_avg:105.00ms | |
step:602/1770 train_time:63220ms step_avg:105.02ms | |
step:603/1770 train_time:63331ms step_avg:105.03ms | |
step:604/1770 train_time:63443ms step_avg:105.04ms | |
step:605/1770 train_time:63554ms step_avg:105.05ms | |
step:606/1770 train_time:63666ms step_avg:105.06ms | |
step:607/1770 train_time:63779ms step_avg:105.07ms | |
step:608/1770 train_time:63890ms step_avg:105.08ms | |
step:609/1770 train_time:64002ms step_avg:105.09ms | |
step:610/1770 train_time:64114ms step_avg:105.10ms | |
step:611/1770 train_time:64226ms step_avg:105.12ms | |
step:612/1770 train_time:64337ms step_avg:105.13ms | |
step:613/1770 train_time:64449ms step_avg:105.14ms | |
step:614/1770 train_time:64560ms step_avg:105.15ms | |
step:615/1770 train_time:64671ms step_avg:105.16ms | |
step:616/1770 train_time:64784ms step_avg:105.17ms | |
step:617/1770 train_time:64896ms step_avg:105.18ms | |
step:618/1770 train_time:65009ms step_avg:105.19ms | |
step:619/1770 train_time:65120ms step_avg:105.20ms | |
step:620/1770 train_time:65232ms step_avg:105.21ms | |
step:621/1770 train_time:65344ms step_avg:105.22ms | |
step:622/1770 train_time:65455ms step_avg:105.23ms | |
step:623/1770 train_time:65566ms step_avg:105.24ms | |
step:624/1770 train_time:65677ms step_avg:105.25ms | |
step:625/1770 train_time:65789ms step_avg:105.26ms | |
step:625/1770 val_loss:3.6638 train_time:65900ms step_avg:105.44ms | |
step:626/1770 train_time:65917ms step_avg:105.30ms | |
step:627/1770 train_time:66019ms step_avg:105.29ms | |
step:628/1770 train_time:66131ms step_avg:105.30ms | |
step:629/1770 train_time:66243ms step_avg:105.31ms | |
step:630/1770 train_time:66355ms step_avg:105.33ms | |
step:631/1770 train_time:66467ms step_avg:105.34ms | |
step:632/1770 train_time:66578ms step_avg:105.35ms | |
step:633/1770 train_time:66690ms step_avg:105.36ms | |
step:634/1770 train_time:66801ms step_avg:105.36ms | |
step:635/1770 train_time:66913ms step_avg:105.37ms | |
step:636/1770 train_time:67025ms step_avg:105.39ms | |
step:637/1770 train_time:67137ms step_avg:105.40ms | |
step:638/1770 train_time:67248ms step_avg:105.40ms | |
step:639/1770 train_time:67360ms step_avg:105.41ms | |
step:640/1770 train_time:67472ms step_avg:105.43ms | |
step:641/1770 train_time:67584ms step_avg:105.44ms | |
step:642/1770 train_time:67695ms step_avg:105.44ms | |
step:643/1770 train_time:67807ms step_avg:105.45ms | |
step:644/1770 train_time:67919ms step_avg:105.46ms | |
step:645/1770 train_time:68030ms step_avg:105.47ms | |
step:646/1770 train_time:68142ms step_avg:105.48ms | |
step:647/1770 train_time:68253ms step_avg:105.49ms | |
step:648/1770 train_time:68365ms step_avg:105.50ms | |
step:649/1770 train_time:68477ms step_avg:105.51ms | |
step:650/1770 train_time:68588ms step_avg:105.52ms | |
step:651/1770 train_time:68700ms step_avg:105.53ms | |
step:652/1770 train_time:68812ms step_avg:105.54ms | |
step:653/1770 train_time:68923ms step_avg:105.55ms | |
step:654/1770 train_time:69035ms step_avg:105.56ms | |
step:655/1770 train_time:69146ms step_avg:105.57ms | |
step:656/1770 train_time:69258ms step_avg:105.58ms | |
step:657/1770 train_time:69369ms step_avg:105.58ms | |
step:658/1770 train_time:69486ms step_avg:105.60ms | |
step:659/1770 train_time:69602ms step_avg:105.62ms | |
step:660/1770 train_time:69719ms step_avg:105.63ms | |
step:661/1770 train_time:69835ms step_avg:105.65ms | |
step:662/1770 train_time:69952ms step_avg:105.67ms | |
step:663/1770 train_time:70069ms step_avg:105.68ms | |
step:664/1770 train_time:70186ms step_avg:105.70ms | |
step:665/1770 train_time:70302ms step_avg:105.72ms | |
step:666/1770 train_time:70418ms step_avg:105.73ms | |
step:667/1770 train_time:70536ms step_avg:105.75ms | |
step:668/1770 train_time:70652ms step_avg:105.77ms | |
step:669/1770 train_time:70769ms step_avg:105.78ms | |
step:670/1770 train_time:70886ms step_avg:105.80ms | |
step:671/1770 train_time:71003ms step_avg:105.82ms | |
step:672/1770 train_time:71120ms step_avg:105.83ms | |
step:673/1770 train_time:71237ms step_avg:105.85ms | |
step:674/1770 train_time:71353ms step_avg:105.87ms | |
step:675/1770 train_time:71471ms step_avg:105.88ms | |
step:676/1770 train_time:71587ms step_avg:105.90ms | |
step:677/1770 train_time:71705ms step_avg:105.92ms | |
step:678/1770 train_time:71821ms step_avg:105.93ms | |
step:679/1770 train_time:71937ms step_avg:105.95ms | |
step:680/1770 train_time:72053ms step_avg:105.96ms | |
step:681/1770 train_time:72170ms step_avg:105.98ms | |
step:682/1770 train_time:72287ms step_avg:105.99ms | |
step:683/1770 train_time:72406ms step_avg:106.01ms | |
step:684/1770 train_time:72523ms step_avg:106.03ms | |
step:685/1770 train_time:72640ms step_avg:106.04ms | |
step:686/1770 train_time:72757ms step_avg:106.06ms | |
step:687/1770 train_time:72874ms step_avg:106.08ms | |
step:688/1770 train_time:72991ms step_avg:106.09ms | |
step:689/1770 train_time:73109ms step_avg:106.11ms | |
step:690/1770 train_time:73226ms step_avg:106.12ms | |
step:691/1770 train_time:73342ms step_avg:106.14ms | |
step:692/1770 train_time:73458ms step_avg:106.15ms | |
step:693/1770 train_time:73575ms step_avg:106.17ms | |
step:694/1770 train_time:73691ms step_avg:106.18ms | |
step:695/1770 train_time:73808ms step_avg:106.20ms | |
step:696/1770 train_time:73925ms step_avg:106.21ms | |
step:697/1770 train_time:74041ms step_avg:106.23ms | |
step:698/1770 train_time:74158ms step_avg:106.24ms | |
step:699/1770 train_time:74274ms step_avg:106.26ms | |
step:700/1770 train_time:74392ms step_avg:106.27ms | |
step:701/1770 train_time:74508ms step_avg:106.29ms | |
step:702/1770 train_time:74625ms step_avg:106.30ms | |
step:703/1770 train_time:74742ms step_avg:106.32ms | |
step:704/1770 train_time:74858ms step_avg:106.33ms | |
step:705/1770 train_time:74975ms step_avg:106.35ms | |
step:706/1770 train_time:75092ms step_avg:106.36ms | |
step:707/1770 train_time:75208ms step_avg:106.38ms | |
step:708/1770 train_time:75324ms step_avg:106.39ms | |
step:709/1770 train_time:75441ms step_avg:106.41ms | |
step:710/1770 train_time:75558ms step_avg:106.42ms | |
step:711/1770 train_time:75674ms step_avg:106.43ms | |
step:712/1770 train_time:75791ms step_avg:106.45ms | |
step:713/1770 train_time:75908ms step_avg:106.46ms | |
step:714/1770 train_time:76026ms step_avg:106.48ms | |
step:715/1770 train_time:76143ms step_avg:106.49ms | |
step:716/1770 train_time:76260ms step_avg:106.51ms | |
step:717/1770 train_time:76376ms step_avg:106.52ms | |
step:718/1770 train_time:76493ms step_avg:106.54ms | |
step:719/1770 train_time:76610ms step_avg:106.55ms | |
step:720/1770 train_time:76728ms step_avg:106.57ms | |
step:721/1770 train_time:76844ms step_avg:106.58ms | |
step:722/1770 train_time:76960ms step_avg:106.59ms | |
step:723/1770 train_time:77076ms step_avg:106.61ms | |
step:724/1770 train_time:77193ms step_avg:106.62ms | |
step:725/1770 train_time:77310ms step_avg:106.63ms | |
step:726/1770 train_time:77426ms step_avg:106.65ms | |
step:727/1770 train_time:77543ms step_avg:106.66ms | |
step:728/1770 train_time:77660ms step_avg:106.68ms | |
step:729/1770 train_time:77776ms step_avg:106.69ms | |
step:730/1770 train_time:77894ms step_avg:106.70ms | |
step:731/1770 train_time:78010ms step_avg:106.72ms | |
step:732/1770 train_time:78127ms step_avg:106.73ms | |
step:733/1770 train_time:78245ms step_avg:106.75ms | |
step:734/1770 train_time:78362ms step_avg:106.76ms | |
step:735/1770 train_time:78479ms step_avg:106.77ms | |
step:736/1770 train_time:78595ms step_avg:106.79ms | |
step:737/1770 train_time:78712ms step_avg:106.80ms | |
step:738/1770 train_time:78829ms step_avg:106.81ms | |
step:739/1770 train_time:78947ms step_avg:106.83ms | |
step:740/1770 train_time:79063ms step_avg:106.84ms | |
step:741/1770 train_time:79179ms step_avg:106.85ms | |
step:742/1770 train_time:79296ms step_avg:106.87ms | |
step:743/1770 train_time:79413ms step_avg:106.88ms | |
step:744/1770 train_time:79530ms step_avg:106.90ms | |
step:745/1770 train_time:79647ms step_avg:106.91ms | |
step:746/1770 train_time:79764ms step_avg:106.92ms | |
step:747/1770 train_time:79880ms step_avg:106.93ms | |
step:748/1770 train_time:79997ms step_avg:106.95ms | |
step:749/1770 train_time:80115ms step_avg:106.96ms | |
step:750/1770 train_time:80231ms step_avg:106.98ms | |
step:750/1770 val_loss:3.5987 train_time:80347ms step_avg:107.13ms | |
step:751/1770 train_time:80365ms step_avg:107.01ms | |
step:752/1770 train_time:80470ms step_avg:107.01ms | |
step:753/1770 train_time:80587ms step_avg:107.02ms | |
step:754/1770 train_time:80704ms step_avg:107.03ms | |
step:755/1770 train_time:80820ms step_avg:107.05ms | |
step:756/1770 train_time:80939ms step_avg:107.06ms | |
step:757/1770 train_time:81054ms step_avg:107.07ms | |
step:758/1770 train_time:81171ms step_avg:107.09ms | |
step:759/1770 train_time:81288ms step_avg:107.10ms | |
step:760/1770 train_time:81405ms step_avg:107.11ms | |
step:761/1770 train_time:81522ms step_avg:107.12ms | |
step:762/1770 train_time:81639ms step_avg:107.14ms | |
step:763/1770 train_time:81757ms step_avg:107.15ms | |
step:764/1770 train_time:81873ms step_avg:107.16ms | |
step:765/1770 train_time:81990ms step_avg:107.18ms | |
step:766/1770 train_time:82107ms step_avg:107.19ms | |
step:767/1770 train_time:82224ms step_avg:107.20ms | |
step:768/1770 train_time:82342ms step_avg:107.22ms | |
step:769/1770 train_time:82458ms step_avg:107.23ms | |
step:770/1770 train_time:82575ms step_avg:107.24ms | |
step:771/1770 train_time:82691ms step_avg:107.25ms | |
step:772/1770 train_time:82808ms step_avg:107.26ms | |
step:773/1770 train_time:82924ms step_avg:107.28ms | |
step:774/1770 train_time:83041ms step_avg:107.29ms | |
step:775/1770 train_time:83159ms step_avg:107.30ms | |
step:776/1770 train_time:83275ms step_avg:107.31ms | |
step:777/1770 train_time:83392ms step_avg:107.33ms | |
step:778/1770 train_time:83508ms step_avg:107.34ms | |
step:779/1770 train_time:83626ms step_avg:107.35ms | |
step:780/1770 train_time:83743ms step_avg:107.36ms | |
step:781/1770 train_time:83860ms step_avg:107.38ms | |
step:782/1770 train_time:83977ms step_avg:107.39ms | |
step:783/1770 train_time:84094ms step_avg:107.40ms | |
step:784/1770 train_time:84210ms step_avg:107.41ms | |
step:785/1770 train_time:84326ms step_avg:107.42ms | |
step:786/1770 train_time:84443ms step_avg:107.43ms | |
step:787/1770 train_time:84561ms step_avg:107.45ms | |
step:788/1770 train_time:84678ms step_avg:107.46ms | |
step:789/1770 train_time:84796ms step_avg:107.47ms | |
step:790/1770 train_time:84914ms step_avg:107.49ms | |
step:791/1770 train_time:85031ms step_avg:107.50ms | |
step:792/1770 train_time:85148ms step_avg:107.51ms | |
step:793/1770 train_time:85266ms step_avg:107.52ms | |
step:794/1770 train_time:85384ms step_avg:107.54ms | |
step:795/1770 train_time:85503ms step_avg:107.55ms | |
step:796/1770 train_time:85621ms step_avg:107.56ms | |
step:797/1770 train_time:85740ms step_avg:107.58ms | |
step:798/1770 train_time:85859ms step_avg:107.59ms | |
step:799/1770 train_time:85977ms step_avg:107.61ms | |
step:800/1770 train_time:86095ms step_avg:107.62ms | |
step:801/1770 train_time:86212ms step_avg:107.63ms | |
step:802/1770 train_time:86330ms step_avg:107.64ms | |
step:803/1770 train_time:86448ms step_avg:107.66ms | |
step:804/1770 train_time:86565ms step_avg:107.67ms | |
step:805/1770 train_time:86683ms step_avg:107.68ms | |
step:806/1770 train_time:86801ms step_avg:107.69ms | |
step:807/1770 train_time:86919ms step_avg:107.71ms | |
step:808/1770 train_time:87036ms step_avg:107.72ms | |
step:809/1770 train_time:87155ms step_avg:107.73ms | |
step:810/1770 train_time:87273ms step_avg:107.75ms | |
step:811/1770 train_time:87390ms step_avg:107.76ms | |
step:812/1770 train_time:87509ms step_avg:107.77ms | |
step:813/1770 train_time:87627ms step_avg:107.78ms | |
step:814/1770 train_time:87746ms step_avg:107.80ms | |
step:815/1770 train_time:87864ms step_avg:107.81ms | |
step:816/1770 train_time:87981ms step_avg:107.82ms | |
step:817/1770 train_time:88099ms step_avg:107.83ms | |
step:818/1770 train_time:88217ms step_avg:107.84ms | |
step:819/1770 train_time:88335ms step_avg:107.86ms | |
step:820/1770 train_time:88452ms step_avg:107.87ms | |
step:821/1770 train_time:88569ms step_avg:107.88ms | |
step:822/1770 train_time:88687ms step_avg:107.89ms | |
step:823/1770 train_time:88805ms step_avg:107.90ms | |
step:824/1770 train_time:88923ms step_avg:107.92ms | |
step:825/1770 train_time:89041ms step_avg:107.93ms | |
step:826/1770 train_time:89159ms step_avg:107.94ms | |
step:827/1770 train_time:89277ms step_avg:107.95ms | |
step:828/1770 train_time:89395ms step_avg:107.96ms | |
step:829/1770 train_time:89513ms step_avg:107.98ms | |
step:830/1770 train_time:89630ms step_avg:107.99ms | |
step:831/1770 train_time:89748ms step_avg:108.00ms | |
step:832/1770 train_time:89865ms step_avg:108.01ms | |
step:833/1770 train_time:89982ms step_avg:108.02ms | |
step:834/1770 train_time:90101ms step_avg:108.03ms | |
step:835/1770 train_time:90218ms step_avg:108.05ms | |
step:836/1770 train_time:90337ms step_avg:108.06ms | |
step:837/1770 train_time:90456ms step_avg:108.07ms | |
step:838/1770 train_time:90573ms step_avg:108.08ms | |
step:839/1770 train_time:90691ms step_avg:108.09ms | |
step:840/1770 train_time:90808ms step_avg:108.10ms | |
step:841/1770 train_time:90926ms step_avg:108.12ms | |
step:842/1770 train_time:91044ms step_avg:108.13ms | |
step:843/1770 train_time:91163ms step_avg:108.14ms | |
step:844/1770 train_time:91280ms step_avg:108.15ms | |
step:845/1770 train_time:91398ms step_avg:108.16ms | |
step:846/1770 train_time:91515ms step_avg:108.17ms | |
step:847/1770 train_time:91633ms step_avg:108.19ms | |
step:848/1770 train_time:91751ms step_avg:108.20ms | |
step:849/1770 train_time:91869ms step_avg:108.21ms | |
step:850/1770 train_time:91987ms step_avg:108.22ms | |
step:851/1770 train_time:92105ms step_avg:108.23ms | |
step:852/1770 train_time:92222ms step_avg:108.24ms | |
step:853/1770 train_time:92341ms step_avg:108.25ms | |
step:854/1770 train_time:92458ms step_avg:108.27ms | |
step:855/1770 train_time:92577ms step_avg:108.28ms | |
step:856/1770 train_time:92695ms step_avg:108.29ms | |
step:857/1770 train_time:92813ms step_avg:108.30ms | |
step:858/1770 train_time:92930ms step_avg:108.31ms | |
step:859/1770 train_time:93048ms step_avg:108.32ms | |
step:860/1770 train_time:93165ms step_avg:108.33ms | |
step:861/1770 train_time:93283ms step_avg:108.34ms | |
step:862/1770 train_time:93401ms step_avg:108.35ms | |
step:863/1770 train_time:93519ms step_avg:108.36ms | |
step:864/1770 train_time:93636ms step_avg:108.38ms | |
step:865/1770 train_time:93753ms step_avg:108.39ms | |
step:866/1770 train_time:93872ms step_avg:108.40ms | |
step:867/1770 train_time:93989ms step_avg:108.41ms | |
step:868/1770 train_time:94107ms step_avg:108.42ms | |
step:869/1770 train_time:94226ms step_avg:108.43ms | |
step:870/1770 train_time:94344ms step_avg:108.44ms | |
step:871/1770 train_time:94462ms step_avg:108.45ms | |
step:872/1770 train_time:94581ms step_avg:108.46ms | |
step:873/1770 train_time:94699ms step_avg:108.48ms | |
step:874/1770 train_time:94816ms step_avg:108.49ms | |
step:875/1770 train_time:94934ms step_avg:108.50ms | |
step:875/1770 val_loss:3.5496 train_time:95050ms step_avg:108.63ms | |
step:876/1770 train_time:95068ms step_avg:108.53ms | |
step:877/1770 train_time:95172ms step_avg:108.52ms | |
step:878/1770 train_time:95291ms step_avg:108.53ms | |
step:879/1770 train_time:95408ms step_avg:108.54ms | |
step:880/1770 train_time:95526ms step_avg:108.55ms | |
step:881/1770 train_time:95644ms step_avg:108.56ms | |
step:882/1770 train_time:95761ms step_avg:108.57ms | |
step:883/1770 train_time:95879ms step_avg:108.58ms | |
step:884/1770 train_time:95997ms step_avg:108.59ms | |
step:885/1770 train_time:96114ms step_avg:108.60ms | |
step:886/1770 train_time:96230ms step_avg:108.61ms | |
step:887/1770 train_time:96348ms step_avg:108.62ms | |
step:888/1770 train_time:96466ms step_avg:108.63ms | |
step:889/1770 train_time:96584ms step_avg:108.64ms | |
step:890/1770 train_time:96701ms step_avg:108.65ms | |
step:891/1770 train_time:96820ms step_avg:108.66ms | |
step:892/1770 train_time:96938ms step_avg:108.68ms | |
step:893/1770 train_time:97056ms step_avg:108.68ms | |
step:894/1770 train_time:97175ms step_avg:108.70ms | |
step:895/1770 train_time:97292ms step_avg:108.71ms | |
step:896/1770 train_time:97410ms step_avg:108.72ms | |
step:897/1770 train_time:97528ms step_avg:108.73ms | |
step:898/1770 train_time:97648ms step_avg:108.74ms | |
step:899/1770 train_time:97765ms step_avg:108.75ms | |
step:900/1770 train_time:97883ms step_avg:108.76ms | |
step:901/1770 train_time:98001ms step_avg:108.77ms | |
step:902/1770 train_time:98119ms step_avg:108.78ms | |
step:903/1770 train_time:98237ms step_avg:108.79ms | |
step:904/1770 train_time:98355ms step_avg:108.80ms | |
step:905/1770 train_time:98473ms step_avg:108.81ms | |
step:906/1770 train_time:98590ms step_avg:108.82ms | |
step:907/1770 train_time:98708ms step_avg:108.83ms | |
step:908/1770 train_time:98826ms step_avg:108.84ms | |
step:909/1770 train_time:98944ms step_avg:108.85ms | |
step:910/1770 train_time:99062ms step_avg:108.86ms | |
step:911/1770 train_time:99180ms step_avg:108.87ms | |
step:912/1770 train_time:99297ms step_avg:108.88ms | |
step:913/1770 train_time:99415ms step_avg:108.89ms | |
step:914/1770 train_time:99533ms step_avg:108.90ms | |
step:915/1770 train_time:99651ms step_avg:108.91ms | |
step:916/1770 train_time:99769ms step_avg:108.92ms | |
step:917/1770 train_time:99888ms step_avg:108.93ms | |
step:918/1770 train_time:100005ms step_avg:108.94ms | |
step:919/1770 train_time:100123ms step_avg:108.95ms | |
step:920/1770 train_time:100247ms step_avg:108.96ms | |
step:921/1770 train_time:100370ms step_avg:108.98ms | |
step:922/1770 train_time:100493ms step_avg:108.99ms | |
step:923/1770 train_time:100617ms step_avg:109.01ms | |
step:924/1770 train_time:100739ms step_avg:109.03ms | |
step:925/1770 train_time:100862ms step_avg:109.04ms | |
step:926/1770 train_time:100983ms step_avg:109.05ms | |
step:927/1770 train_time:101106ms step_avg:109.07ms | |
step:928/1770 train_time:101228ms step_avg:109.08ms | |
step:929/1770 train_time:101350ms step_avg:109.10ms | |
step:930/1770 train_time:101472ms step_avg:109.11ms | |
step:931/1770 train_time:101595ms step_avg:109.12ms | |
step:932/1770 train_time:101717ms step_avg:109.14ms | |
step:933/1770 train_time:101840ms step_avg:109.15ms | |
step:934/1770 train_time:101962ms step_avg:109.17ms | |
step:935/1770 train_time:102085ms step_avg:109.18ms | |
step:936/1770 train_time:102207ms step_avg:109.20ms | |
step:937/1770 train_time:102330ms step_avg:109.21ms | |
step:938/1770 train_time:102453ms step_avg:109.23ms | |
step:939/1770 train_time:102576ms step_avg:109.24ms | |
step:940/1770 train_time:102699ms step_avg:109.25ms | |
step:941/1770 train_time:102821ms step_avg:109.27ms | |
step:942/1770 train_time:102944ms step_avg:109.28ms | |
step:943/1770 train_time:103068ms step_avg:109.30ms | |
step:944/1770 train_time:103189ms step_avg:109.31ms | |
step:945/1770 train_time:103312ms step_avg:109.32ms | |
step:946/1770 train_time:103436ms step_avg:109.34ms | |
step:947/1770 train_time:103558ms step_avg:109.35ms | |
step:948/1770 train_time:103680ms step_avg:109.37ms | |
step:949/1770 train_time:103804ms step_avg:109.38ms | |
step:950/1770 train_time:103927ms step_avg:109.40ms | |
step:951/1770 train_time:104050ms step_avg:109.41ms | |
step:952/1770 train_time:104174ms step_avg:109.43ms | |
step:953/1770 train_time:104296ms step_avg:109.44ms | |
step:954/1770 train_time:104419ms step_avg:109.45ms | |
step:955/1770 train_time:104541ms step_avg:109.47ms | |
step:956/1770 train_time:104663ms step_avg:109.48ms | |
step:957/1770 train_time:104785ms step_avg:109.49ms | |
step:958/1770 train_time:104907ms step_avg:109.51ms | |
step:959/1770 train_time:105030ms step_avg:109.52ms | |
step:960/1770 train_time:105151ms step_avg:109.53ms | |
step:961/1770 train_time:105273ms step_avg:109.55ms | |
step:962/1770 train_time:105397ms step_avg:109.56ms | |
step:963/1770 train_time:105519ms step_avg:109.57ms | |
step:964/1770 train_time:105642ms step_avg:109.59ms | |
step:965/1770 train_time:105763ms step_avg:109.60ms | |
step:966/1770 train_time:105885ms step_avg:109.61ms | |
step:967/1770 train_time:106009ms step_avg:109.63ms | |
step:968/1770 train_time:106133ms step_avg:109.64ms | |
step:969/1770 train_time:106256ms step_avg:109.66ms | |
step:970/1770 train_time:106379ms step_avg:109.67ms | |
step:971/1770 train_time:106502ms step_avg:109.68ms | |
step:972/1770 train_time:106625ms step_avg:109.70ms | |
step:973/1770 train_time:106747ms step_avg:109.71ms | |
step:974/1770 train_time:106870ms step_avg:109.72ms | |
step:975/1770 train_time:106995ms step_avg:109.74ms | |
step:976/1770 train_time:107118ms step_avg:109.75ms | |
step:977/1770 train_time:107240ms step_avg:109.76ms | |
step:978/1770 train_time:107363ms step_avg:109.78ms | |
step:979/1770 train_time:107487ms step_avg:109.79ms | |
step:980/1770 train_time:107610ms step_avg:109.81ms | |
step:981/1770 train_time:107732ms step_avg:109.82ms | |
step:982/1770 train_time:107855ms step_avg:109.83ms | |
step:983/1770 train_time:107977ms step_avg:109.84ms | |
step:984/1770 train_time:108100ms step_avg:109.86ms | |
step:985/1770 train_time:108224ms step_avg:109.87ms | |
step:986/1770 train_time:108346ms step_avg:109.88ms | |
step:987/1770 train_time:108468ms step_avg:109.90ms | |
step:988/1770 train_time:108590ms step_avg:109.91ms | |
step:989/1770 train_time:108714ms step_avg:109.92ms | |
step:990/1770 train_time:108836ms step_avg:109.94ms | |
step:991/1770 train_time:108958ms step_avg:109.95ms | |
step:992/1770 train_time:109080ms step_avg:109.96ms | |
step:993/1770 train_time:109203ms step_avg:109.97ms | |
step:994/1770 train_time:109326ms step_avg:109.99ms | |
step:995/1770 train_time:109451ms step_avg:110.00ms | |
step:996/1770 train_time:109574ms step_avg:110.01ms | |
step:997/1770 train_time:109696ms step_avg:110.03ms | |
step:998/1770 train_time:109818ms step_avg:110.04ms | |
step:999/1770 train_time:109939ms step_avg:110.05ms | |
step:1000/1770 train_time:110062ms step_avg:110.06ms | |
step:1000/1770 val_loss:3.5109 train_time:110183ms step_avg:110.18ms | |
step:1001/1770 train_time:110201ms step_avg:110.09ms | |
step:1002/1770 train_time:110310ms step_avg:110.09ms | |
step:1003/1770 train_time:110434ms step_avg:110.10ms | |
step:1004/1770 train_time:110557ms step_avg:110.12ms | |
step:1005/1770 train_time:110679ms step_avg:110.13ms | |
step:1006/1770 train_time:110801ms step_avg:110.14ms | |
step:1007/1770 train_time:110924ms step_avg:110.15ms | |
step:1008/1770 train_time:111045ms step_avg:110.16ms | |
step:1009/1770 train_time:111167ms step_avg:110.18ms | |
step:1010/1770 train_time:111289ms step_avg:110.19ms | |
step:1011/1770 train_time:111413ms step_avg:110.20ms | |
step:1012/1770 train_time:111537ms step_avg:110.21ms | |
step:1013/1770 train_time:111659ms step_avg:110.23ms | |
step:1014/1770 train_time:111783ms step_avg:110.24ms | |
step:1015/1770 train_time:111905ms step_avg:110.25ms | |
step:1016/1770 train_time:112027ms step_avg:110.26ms | |
step:1017/1770 train_time:112149ms step_avg:110.27ms | |
step:1018/1770 train_time:112271ms step_avg:110.29ms | |
step:1019/1770 train_time:112394ms step_avg:110.30ms | |
step:1020/1770 train_time:112517ms step_avg:110.31ms | |
step:1021/1770 train_time:112640ms step_avg:110.32ms | |
step:1022/1770 train_time:112762ms step_avg:110.33ms | |
step:1023/1770 train_time:112884ms step_avg:110.35ms | |
step:1024/1770 train_time:113006ms step_avg:110.36ms | |
step:1025/1770 train_time:113129ms step_avg:110.37ms | |
step:1026/1770 train_time:113252ms step_avg:110.38ms | |
step:1027/1770 train_time:113377ms step_avg:110.40ms | |
step:1028/1770 train_time:113500ms step_avg:110.41ms | |
step:1029/1770 train_time:113622ms step_avg:110.42ms | |
step:1030/1770 train_time:113745ms step_avg:110.43ms | |
step:1031/1770 train_time:113866ms step_avg:110.44ms | |
step:1032/1770 train_time:113988ms step_avg:110.45ms | |
step:1033/1770 train_time:114110ms step_avg:110.46ms | |
step:1034/1770 train_time:114233ms step_avg:110.48ms | |
step:1035/1770 train_time:114355ms step_avg:110.49ms | |
step:1036/1770 train_time:114477ms step_avg:110.50ms | |
step:1037/1770 train_time:114599ms step_avg:110.51ms | |
step:1038/1770 train_time:114721ms step_avg:110.52ms | |
step:1039/1770 train_time:114844ms step_avg:110.53ms | |
step:1040/1770 train_time:114965ms step_avg:110.54ms | |
step:1041/1770 train_time:115087ms step_avg:110.55ms | |
step:1042/1770 train_time:115210ms step_avg:110.57ms | |
step:1043/1770 train_time:115332ms step_avg:110.58ms | |
step:1044/1770 train_time:115455ms step_avg:110.59ms | |
step:1045/1770 train_time:115578ms step_avg:110.60ms | |
step:1046/1770 train_time:115701ms step_avg:110.61ms | |
step:1047/1770 train_time:115824ms step_avg:110.62ms | |
step:1048/1770 train_time:115947ms step_avg:110.64ms | |
step:1049/1770 train_time:116069ms step_avg:110.65ms | |
step:1050/1770 train_time:116191ms step_avg:110.66ms | |
step:1051/1770 train_time:116314ms step_avg:110.67ms | |
step:1052/1770 train_time:116437ms step_avg:110.68ms | |
step:1053/1770 train_time:116562ms step_avg:110.69ms | |
step:1054/1770 train_time:116684ms step_avg:110.71ms | |
step:1055/1770 train_time:116807ms step_avg:110.72ms | |
step:1056/1770 train_time:116930ms step_avg:110.73ms | |
step:1057/1770 train_time:117054ms step_avg:110.74ms | |
step:1058/1770 train_time:117177ms step_avg:110.75ms | |
step:1059/1770 train_time:117299ms step_avg:110.76ms | |
step:1060/1770 train_time:117424ms step_avg:110.78ms | |
step:1061/1770 train_time:117547ms step_avg:110.79ms | |
step:1062/1770 train_time:117671ms step_avg:110.80ms | |
step:1063/1770 train_time:117797ms step_avg:110.82ms | |
step:1064/1770 train_time:117922ms step_avg:110.83ms | |
step:1065/1770 train_time:118045ms step_avg:110.84ms | |
step:1066/1770 train_time:118168ms step_avg:110.85ms | |
step:1067/1770 train_time:118291ms step_avg:110.86ms | |
step:1068/1770 train_time:118417ms step_avg:110.88ms | |
step:1069/1770 train_time:118539ms step_avg:110.89ms | |
step:1070/1770 train_time:118663ms step_avg:110.90ms | |
step:1071/1770 train_time:118787ms step_avg:110.91ms | |
step:1072/1770 train_time:118910ms step_avg:110.92ms | |
step:1073/1770 train_time:119032ms step_avg:110.93ms | |
step:1074/1770 train_time:119155ms step_avg:110.94ms | |
step:1075/1770 train_time:119277ms step_avg:110.96ms | |
step:1076/1770 train_time:119401ms step_avg:110.97ms | |
step:1077/1770 train_time:119524ms step_avg:110.98ms | |
step:1078/1770 train_time:119648ms step_avg:110.99ms | |
step:1079/1770 train_time:119771ms step_avg:111.00ms | |
step:1080/1770 train_time:119895ms step_avg:111.01ms | |
step:1081/1770 train_time:120018ms step_avg:111.02ms | |
step:1082/1770 train_time:120142ms step_avg:111.04ms | |
step:1083/1770 train_time:120264ms step_avg:111.05ms | |
step:1084/1770 train_time:120388ms step_avg:111.06ms | |
step:1085/1770 train_time:120512ms step_avg:111.07ms | |
step:1086/1770 train_time:120634ms step_avg:111.08ms | |
step:1087/1770 train_time:120757ms step_avg:111.09ms | |
step:1088/1770 train_time:120881ms step_avg:111.10ms | |
step:1089/1770 train_time:121004ms step_avg:111.11ms | |
step:1090/1770 train_time:121129ms step_avg:111.13ms | |
step:1091/1770 train_time:121252ms step_avg:111.14ms | |
step:1092/1770 train_time:121375ms step_avg:111.15ms | |
step:1093/1770 train_time:121498ms step_avg:111.16ms | |
step:1094/1770 train_time:121624ms step_avg:111.17ms | |
step:1095/1770 train_time:121748ms step_avg:111.18ms | |
step:1096/1770 train_time:121872ms step_avg:111.20ms | |
step:1097/1770 train_time:121995ms step_avg:111.21ms | |
step:1098/1770 train_time:122118ms step_avg:111.22ms | |
step:1099/1770 train_time:122241ms step_avg:111.23ms | |
step:1100/1770 train_time:122364ms step_avg:111.24ms | |
step:1101/1770 train_time:122487ms step_avg:111.25ms | |
step:1102/1770 train_time:122611ms step_avg:111.26ms | |
step:1103/1770 train_time:122734ms step_avg:111.27ms | |
step:1104/1770 train_time:122858ms step_avg:111.28ms | |
step:1105/1770 train_time:122982ms step_avg:111.30ms | |
step:1106/1770 train_time:123105ms step_avg:111.31ms | |
step:1107/1770 train_time:123228ms step_avg:111.32ms | |
step:1108/1770 train_time:123352ms step_avg:111.33ms | |
step:1109/1770 train_time:123476ms step_avg:111.34ms | |
step:1110/1770 train_time:123600ms step_avg:111.35ms | |
step:1111/1770 train_time:123724ms step_avg:111.36ms | |
step:1112/1770 train_time:123850ms step_avg:111.38ms | |
step:1113/1770 train_time:123973ms step_avg:111.39ms | |
step:1114/1770 train_time:124097ms step_avg:111.40ms | |
step:1115/1770 train_time:124221ms step_avg:111.41ms | |
step:1116/1770 train_time:124345ms step_avg:111.42ms | |
step:1117/1770 train_time:124470ms step_avg:111.43ms | |
step:1118/1770 train_time:124592ms step_avg:111.44ms | |
step:1119/1770 train_time:124717ms step_avg:111.45ms | |
step:1120/1770 train_time:124839ms step_avg:111.46ms | |
step:1121/1770 train_time:124962ms step_avg:111.47ms | |
step:1122/1770 train_time:125086ms step_avg:111.48ms | |
step:1123/1770 train_time:125208ms step_avg:111.49ms | |
step:1124/1770 train_time:125333ms step_avg:111.51ms | |
step:1125/1770 train_time:125457ms step_avg:111.52ms | |
step:1125/1770 val_loss:3.4706 train_time:125579ms step_avg:111.63ms | |
step:1126/1770 train_time:125597ms step_avg:111.54ms | |
step:1127/1770 train_time:125706ms step_avg:111.54ms | |
step:1128/1770 train_time:125829ms step_avg:111.55ms | |
step:1129/1770 train_time:125952ms step_avg:111.56ms | |
step:1130/1770 train_time:126076ms step_avg:111.57ms | |
step:1131/1770 train_time:126200ms step_avg:111.58ms | |
step:1132/1770 train_time:126323ms step_avg:111.59ms | |
step:1133/1770 train_time:126446ms step_avg:111.60ms | |
step:1134/1770 train_time:126570ms step_avg:111.61ms | |
step:1135/1770 train_time:126693ms step_avg:111.62ms | |
step:1136/1770 train_time:126817ms step_avg:111.63ms | |
step:1137/1770 train_time:126943ms step_avg:111.65ms | |
step:1138/1770 train_time:127065ms step_avg:111.66ms | |
step:1139/1770 train_time:127190ms step_avg:111.67ms | |
step:1140/1770 train_time:127314ms step_avg:111.68ms | |
step:1141/1770 train_time:127436ms step_avg:111.69ms | |
step:1142/1770 train_time:127560ms step_avg:111.70ms | |
step:1143/1770 train_time:127682ms step_avg:111.71ms | |
step:1144/1770 train_time:127805ms step_avg:111.72ms | |
step:1145/1770 train_time:127928ms step_avg:111.73ms | |
step:1146/1770 train_time:128053ms step_avg:111.74ms | |
step:1147/1770 train_time:128177ms step_avg:111.75ms | |
step:1148/1770 train_time:128301ms step_avg:111.76ms | |
step:1149/1770 train_time:128424ms step_avg:111.77ms | |
step:1150/1770 train_time:128547ms step_avg:111.78ms | |
step:1151/1770 train_time:128671ms step_avg:111.79ms | |
step:1152/1770 train_time:128794ms step_avg:111.80ms | |
step:1153/1770 train_time:128918ms step_avg:111.81ms | |
step:1154/1770 train_time:129042ms step_avg:111.82ms | |
step:1155/1770 train_time:129165ms step_avg:111.83ms | |
step:1156/1770 train_time:129288ms step_avg:111.84ms | |
step:1157/1770 train_time:129414ms step_avg:111.85ms | |
step:1158/1770 train_time:129538ms step_avg:111.86ms | |
step:1159/1770 train_time:129660ms step_avg:111.87ms | |
step:1160/1770 train_time:129783ms step_avg:111.88ms | |
step:1161/1770 train_time:129907ms step_avg:111.89ms | |
step:1162/1770 train_time:130032ms step_avg:111.90ms | |
step:1163/1770 train_time:130156ms step_avg:111.91ms | |
step:1164/1770 train_time:130278ms step_avg:111.92ms | |
step:1165/1770 train_time:130403ms step_avg:111.93ms | |
step:1166/1770 train_time:130526ms step_avg:111.94ms | |
step:1167/1770 train_time:130649ms step_avg:111.95ms | |
step:1168/1770 train_time:130772ms step_avg:111.96ms | |
step:1169/1770 train_time:130895ms step_avg:111.97ms | |
step:1170/1770 train_time:131018ms step_avg:111.98ms | |
step:1171/1770 train_time:131141ms step_avg:111.99ms | |
step:1172/1770 train_time:131264ms step_avg:112.00ms | |
step:1173/1770 train_time:131387ms step_avg:112.01ms | |
step:1174/1770 train_time:131511ms step_avg:112.02ms | |
step:1175/1770 train_time:131633ms step_avg:112.03ms | |
step:1176/1770 train_time:131757ms step_avg:112.04ms | |
step:1177/1770 train_time:131880ms step_avg:112.05ms | |
step:1178/1770 train_time:132004ms step_avg:112.06ms | |
step:1179/1770 train_time:132128ms step_avg:112.07ms | |
step:1180/1770 train_time:132253ms step_avg:112.08ms | |
step:1181/1770 train_time:132375ms step_avg:112.09ms | |
step:1182/1770 train_time:132500ms step_avg:112.10ms | |
step:1183/1770 train_time:132627ms step_avg:112.11ms | |
step:1184/1770 train_time:132757ms step_avg:112.13ms | |
step:1185/1770 train_time:132884ms step_avg:112.14ms | |
step:1186/1770 train_time:133013ms step_avg:112.15ms | |
step:1187/1770 train_time:133144ms step_avg:112.17ms | |
step:1188/1770 train_time:133270ms step_avg:112.18ms | |
step:1189/1770 train_time:133398ms step_avg:112.19ms | |
step:1190/1770 train_time:133526ms step_avg:112.21ms | |
step:1191/1770 train_time:133653ms step_avg:112.22ms | |
step:1192/1770 train_time:133780ms step_avg:112.23ms | |
step:1193/1770 train_time:133908ms step_avg:112.24ms | |
step:1194/1770 train_time:134036ms step_avg:112.26ms | |
step:1195/1770 train_time:134165ms step_avg:112.27ms | |
step:1196/1770 train_time:134295ms step_avg:112.29ms | |
step:1197/1770 train_time:134421ms step_avg:112.30ms | |
step:1198/1770 train_time:134548ms step_avg:112.31ms | |
step:1199/1770 train_time:134677ms step_avg:112.32ms | |
step:1200/1770 train_time:134806ms step_avg:112.34ms | |
step:1201/1770 train_time:134937ms step_avg:112.35ms | |
step:1202/1770 train_time:135062ms step_avg:112.36ms | |
step:1203/1770 train_time:135191ms step_avg:112.38ms | |
step:1204/1770 train_time:135319ms step_avg:112.39ms | |
step:1205/1770 train_time:135446ms step_avg:112.40ms | |
step:1206/1770 train_time:135576ms step_avg:112.42ms | |
step:1207/1770 train_time:135704ms step_avg:112.43ms | |
step:1208/1770 train_time:135831ms step_avg:112.44ms | |
step:1209/1770 train_time:135960ms step_avg:112.46ms | |
step:1210/1770 train_time:136087ms step_avg:112.47ms | |
step:1211/1770 train_time:136215ms step_avg:112.48ms | |
step:1212/1770 train_time:136346ms step_avg:112.50ms | |
step:1213/1770 train_time:136474ms step_avg:112.51ms | |
step:1214/1770 train_time:136602ms step_avg:112.52ms | |
step:1215/1770 train_time:136729ms step_avg:112.53ms | |
step:1216/1770 train_time:136859ms step_avg:112.55ms | |
step:1217/1770 train_time:136986ms step_avg:112.56ms | |
step:1218/1770 train_time:137114ms step_avg:112.57ms | |
step:1219/1770 train_time:137242ms step_avg:112.59ms | |
step:1220/1770 train_time:137371ms step_avg:112.60ms | |
step:1221/1770 train_time:137499ms step_avg:112.61ms | |
step:1222/1770 train_time:137628ms step_avg:112.62ms | |
step:1223/1770 train_time:137755ms step_avg:112.64ms | |
step:1224/1770 train_time:137883ms step_avg:112.65ms | |
step:1225/1770 train_time:138012ms step_avg:112.66ms | |
step:1226/1770 train_time:138139ms step_avg:112.67ms | |
step:1227/1770 train_time:138267ms step_avg:112.69ms | |
step:1228/1770 train_time:138397ms step_avg:112.70ms | |
step:1229/1770 train_time:138526ms step_avg:112.71ms | |
step:1230/1770 train_time:138654ms step_avg:112.73ms | |
step:1231/1770 train_time:138782ms step_avg:112.74ms | |
step:1232/1770 train_time:138910ms step_avg:112.75ms | |
step:1233/1770 train_time:139037ms step_avg:112.76ms | |
step:1234/1770 train_time:139165ms step_avg:112.78ms | |
step:1235/1770 train_time:139293ms step_avg:112.79ms | |
step:1236/1770 train_time:139423ms step_avg:112.80ms | |
step:1237/1770 train_time:139551ms step_avg:112.81ms | |
step:1238/1770 train_time:139679ms step_avg:112.83ms | |
step:1239/1770 train_time:139808ms step_avg:112.84ms | |
step:1240/1770 train_time:139937ms step_avg:112.85ms | |
step:1241/1770 train_time:140065ms step_avg:112.86ms | |
step:1242/1770 train_time:140193ms step_avg:112.88ms | |
step:1243/1770 train_time:140321ms step_avg:112.89ms | |
step:1244/1770 train_time:140448ms step_avg:112.90ms | |
step:1245/1770 train_time:140577ms step_avg:112.91ms | |
step:1246/1770 train_time:140705ms step_avg:112.93ms | |
step:1247/1770 train_time:140834ms step_avg:112.94ms | |
step:1248/1770 train_time:140963ms step_avg:112.95ms | |
step:1249/1770 train_time:141091ms step_avg:112.96ms | |
step:1250/1770 train_time:141219ms step_avg:112.98ms | |
step:1250/1770 val_loss:3.4221 train_time:141348ms step_avg:113.08ms | |
step:1251/1770 train_time:141367ms step_avg:113.00ms | |
step:1252/1770 train_time:141479ms step_avg:113.00ms | |
step:1253/1770 train_time:141608ms step_avg:113.02ms | |
step:1254/1770 train_time:141736ms step_avg:113.03ms | |
step:1255/1770 train_time:141867ms step_avg:113.04ms | |
step:1256/1770 train_time:141994ms step_avg:113.05ms | |
step:1257/1770 train_time:142120ms step_avg:113.06ms | |
step:1258/1770 train_time:142249ms step_avg:113.08ms | |
step:1259/1770 train_time:142378ms step_avg:113.09ms | |
step:1260/1770 train_time:142505ms step_avg:113.10ms | |
step:1261/1770 train_time:142636ms step_avg:113.11ms | |
step:1262/1770 train_time:142765ms step_avg:113.13ms | |
step:1263/1770 train_time:142893ms step_avg:113.14ms | |
step:1264/1770 train_time:143023ms step_avg:113.15ms | |
step:1265/1770 train_time:143151ms step_avg:113.16ms | |
step:1266/1770 train_time:143280ms step_avg:113.18ms | |
step:1267/1770 train_time:143407ms step_avg:113.19ms | |
step:1268/1770 train_time:143537ms step_avg:113.20ms | |
step:1269/1770 train_time:143665ms step_avg:113.21ms | |
step:1270/1770 train_time:143794ms step_avg:113.22ms | |
step:1271/1770 train_time:143922ms step_avg:113.24ms | |
step:1272/1770 train_time:144048ms step_avg:113.25ms | |
step:1273/1770 train_time:144179ms step_avg:113.26ms | |
step:1274/1770 train_time:144307ms step_avg:113.27ms | |
step:1275/1770 train_time:144434ms step_avg:113.28ms | |
step:1276/1770 train_time:144561ms step_avg:113.29ms | |
step:1277/1770 train_time:144689ms step_avg:113.30ms | |
step:1278/1770 train_time:144818ms step_avg:113.32ms | |
step:1279/1770 train_time:144948ms step_avg:113.33ms | |
step:1280/1770 train_time:145078ms step_avg:113.34ms | |
step:1281/1770 train_time:145205ms step_avg:113.35ms | |
step:1282/1770 train_time:145333ms step_avg:113.36ms | |
step:1283/1770 train_time:145461ms step_avg:113.38ms | |
step:1284/1770 train_time:145590ms step_avg:113.39ms | |
step:1285/1770 train_time:145717ms step_avg:113.40ms | |
step:1286/1770 train_time:145848ms step_avg:113.41ms | |
step:1287/1770 train_time:145978ms step_avg:113.43ms | |
step:1288/1770 train_time:146108ms step_avg:113.44ms | |
step:1289/1770 train_time:146235ms step_avg:113.45ms | |
step:1290/1770 train_time:146362ms step_avg:113.46ms | |
step:1291/1770 train_time:146490ms step_avg:113.47ms | |
step:1292/1770 train_time:146617ms step_avg:113.48ms | |
step:1293/1770 train_time:146746ms step_avg:113.49ms | |
step:1294/1770 train_time:146874ms step_avg:113.50ms | |
step:1295/1770 train_time:147002ms step_avg:113.52ms | |
step:1296/1770 train_time:147130ms step_avg:113.53ms | |
step:1297/1770 train_time:147256ms step_avg:113.54ms | |
step:1298/1770 train_time:147384ms step_avg:113.55ms | |
step:1299/1770 train_time:147512ms step_avg:113.56ms | |
step:1300/1770 train_time:147640ms step_avg:113.57ms | |
step:1301/1770 train_time:147769ms step_avg:113.58ms | |
step:1302/1770 train_time:147897ms step_avg:113.59ms | |
step:1303/1770 train_time:148024ms step_avg:113.60ms | |
step:1304/1770 train_time:148151ms step_avg:113.61ms | |
step:1305/1770 train_time:148279ms step_avg:113.62ms | |
step:1306/1770 train_time:148407ms step_avg:113.63ms | |
step:1307/1770 train_time:148534ms step_avg:113.64ms | |
step:1308/1770 train_time:148661ms step_avg:113.66ms | |
step:1309/1770 train_time:148788ms step_avg:113.67ms | |
step:1310/1770 train_time:148916ms step_avg:113.68ms | |
step:1311/1770 train_time:149044ms step_avg:113.69ms | |
step:1312/1770 train_time:149171ms step_avg:113.70ms | |
step:1313/1770 train_time:149296ms step_avg:113.71ms | |
step:1314/1770 train_time:149424ms step_avg:113.72ms | |
step:1315/1770 train_time:149552ms step_avg:113.73ms | |
step:1316/1770 train_time:149681ms step_avg:113.74ms | |
step:1317/1770 train_time:149809ms step_avg:113.75ms | |
step:1318/1770 train_time:149941ms step_avg:113.76ms | |
step:1319/1770 train_time:150071ms step_avg:113.78ms | |
step:1320/1770 train_time:150199ms step_avg:113.79ms | |
step:1321/1770 train_time:150328ms step_avg:113.80ms | |
step:1322/1770 train_time:150456ms step_avg:113.81ms | |
step:1323/1770 train_time:150585ms step_avg:113.82ms | |
step:1324/1770 train_time:150715ms step_avg:113.83ms | |
step:1325/1770 train_time:150844ms step_avg:113.84ms | |
step:1326/1770 train_time:150972ms step_avg:113.85ms | |
step:1327/1770 train_time:151104ms step_avg:113.87ms | |
step:1328/1770 train_time:151231ms step_avg:113.88ms | |
step:1329/1770 train_time:151359ms step_avg:113.89ms | |
step:1330/1770 train_time:151486ms step_avg:113.90ms | |
step:1331/1770 train_time:151613ms step_avg:113.91ms | |
step:1332/1770 train_time:151742ms step_avg:113.92ms | |
step:1333/1770 train_time:151869ms step_avg:113.93ms | |
step:1334/1770 train_time:151995ms step_avg:113.94ms | |
step:1335/1770 train_time:152123ms step_avg:113.95ms | |
step:1336/1770 train_time:152251ms step_avg:113.96ms | |
step:1337/1770 train_time:152380ms step_avg:113.97ms | |
step:1338/1770 train_time:152508ms step_avg:113.98ms | |
step:1339/1770 train_time:152636ms step_avg:113.99ms | |
step:1340/1770 train_time:152767ms step_avg:114.01ms | |
step:1341/1770 train_time:152896ms step_avg:114.02ms | |
step:1342/1770 train_time:153025ms step_avg:114.03ms | |
step:1343/1770 train_time:153156ms step_avg:114.04ms | |
step:1344/1770 train_time:153284ms step_avg:114.05ms | |
step:1345/1770 train_time:153413ms step_avg:114.06ms | |
step:1346/1770 train_time:153540ms step_avg:114.07ms | |
step:1347/1770 train_time:153667ms step_avg:114.08ms | |
step:1348/1770 train_time:153799ms step_avg:114.09ms | |
step:1349/1770 train_time:153929ms step_avg:114.11ms | |
step:1350/1770 train_time:154058ms step_avg:114.12ms | |
step:1351/1770 train_time:154187ms step_avg:114.13ms | |
step:1352/1770 train_time:154314ms step_avg:114.14ms | |
step:1353/1770 train_time:154445ms step_avg:114.15ms | |
step:1354/1770 train_time:154571ms step_avg:114.16ms | |
step:1355/1770 train_time:154699ms step_avg:114.17ms | |
step:1356/1770 train_time:154827ms step_avg:114.18ms | |
step:1357/1770 train_time:154956ms step_avg:114.19ms | |
step:1358/1770 train_time:155085ms step_avg:114.20ms | |
step:1359/1770 train_time:155212ms step_avg:114.21ms | |
step:1360/1770 train_time:155344ms step_avg:114.22ms | |
step:1361/1770 train_time:155472ms step_avg:114.23ms | |
step:1362/1770 train_time:155601ms step_avg:114.24ms | |
step:1363/1770 train_time:155730ms step_avg:114.26ms | |
step:1364/1770 train_time:155858ms step_avg:114.27ms | |
step:1365/1770 train_time:155986ms step_avg:114.28ms | |
step:1366/1770 train_time:156114ms step_avg:114.29ms | |
step:1367/1770 train_time:156244ms step_avg:114.30ms | |
step:1368/1770 train_time:156370ms step_avg:114.31ms | |
step:1369/1770 train_time:156501ms step_avg:114.32ms | |
step:1370/1770 train_time:156631ms step_avg:114.33ms | |
step:1371/1770 train_time:156760ms step_avg:114.34ms | |
step:1372/1770 train_time:156888ms step_avg:114.35ms | |
step:1373/1770 train_time:157016ms step_avg:114.36ms | |
step:1374/1770 train_time:157148ms step_avg:114.37ms | |
step:1375/1770 train_time:157278ms step_avg:114.38ms | |
step:1375/1770 val_loss:3.3801 train_time:157405ms step_avg:114.48ms | |
step:1376/1770 train_time:157423ms step_avg:114.41ms | |
step:1377/1770 train_time:157539ms step_avg:114.41ms | |
step:1378/1770 train_time:157668ms step_avg:114.42ms | |
step:1379/1770 train_time:157794ms step_avg:114.43ms | |
step:1380/1770 train_time:157923ms step_avg:114.44ms | |
step:1381/1770 train_time:158053ms step_avg:114.45ms | |
step:1382/1770 train_time:158180ms step_avg:114.46ms | |
step:1383/1770 train_time:158310ms step_avg:114.47ms | |
step:1384/1770 train_time:158438ms step_avg:114.48ms | |
step:1385/1770 train_time:158568ms step_avg:114.49ms | |
step:1386/1770 train_time:158698ms step_avg:114.50ms | |
step:1387/1770 train_time:158826ms step_avg:114.51ms | |
step:1388/1770 train_time:158955ms step_avg:114.52ms | |
step:1389/1770 train_time:159084ms step_avg:114.53ms | |
step:1390/1770 train_time:159213ms step_avg:114.54ms | |
step:1391/1770 train_time:159341ms step_avg:114.55ms | |
step:1392/1770 train_time:159469ms step_avg:114.56ms | |
step:1393/1770 train_time:159596ms step_avg:114.57ms | |
step:1394/1770 train_time:159723ms step_avg:114.58ms | |
step:1395/1770 train_time:159852ms step_avg:114.59ms | |
step:1396/1770 train_time:159981ms step_avg:114.60ms | |
step:1397/1770 train_time:160111ms step_avg:114.61ms | |
step:1398/1770 train_time:160239ms step_avg:114.62ms | |
step:1399/1770 train_time:160367ms step_avg:114.63ms | |
step:1400/1770 train_time:160496ms step_avg:114.64ms | |
step:1401/1770 train_time:160624ms step_avg:114.65ms | |
step:1402/1770 train_time:160753ms step_avg:114.66ms | |
step:1403/1770 train_time:160880ms step_avg:114.67ms | |
step:1404/1770 train_time:161011ms step_avg:114.68ms | |
step:1405/1770 train_time:161139ms step_avg:114.69ms | |
step:1406/1770 train_time:161268ms step_avg:114.70ms | |
step:1407/1770 train_time:161395ms step_avg:114.71ms | |
step:1408/1770 train_time:161523ms step_avg:114.72ms | |
step:1409/1770 train_time:161651ms step_avg:114.73ms | |
step:1410/1770 train_time:161781ms step_avg:114.74ms | |
step:1411/1770 train_time:161908ms step_avg:114.75ms | |
step:1412/1770 train_time:162036ms step_avg:114.76ms | |
step:1413/1770 train_time:162163ms step_avg:114.77ms | |
step:1414/1770 train_time:162293ms step_avg:114.78ms | |
step:1415/1770 train_time:162423ms step_avg:114.79ms | |
step:1416/1770 train_time:162553ms step_avg:114.80ms | |
step:1417/1770 train_time:162681ms step_avg:114.81ms | |
step:1418/1770 train_time:162809ms step_avg:114.82ms | |
step:1419/1770 train_time:162937ms step_avg:114.83ms | |
step:1420/1770 train_time:163066ms step_avg:114.84ms | |
step:1421/1770 train_time:163193ms step_avg:114.84ms | |
step:1422/1770 train_time:163321ms step_avg:114.85ms | |
step:1423/1770 train_time:163450ms step_avg:114.86ms | |
step:1424/1770 train_time:163580ms step_avg:114.87ms | |
step:1425/1770 train_time:163707ms step_avg:114.88ms | |
step:1426/1770 train_time:163836ms step_avg:114.89ms | |
step:1427/1770 train_time:163963ms step_avg:114.90ms | |
step:1428/1770 train_time:164093ms step_avg:114.91ms | |
step:1429/1770 train_time:164221ms step_avg:114.92ms | |
step:1430/1770 train_time:164350ms step_avg:114.93ms | |
step:1431/1770 train_time:164482ms step_avg:114.94ms | |
step:1432/1770 train_time:164610ms step_avg:114.95ms | |
step:1433/1770 train_time:164737ms step_avg:114.96ms | |
step:1434/1770 train_time:164865ms step_avg:114.97ms | |
step:1435/1770 train_time:164994ms step_avg:114.98ms | |
step:1436/1770 train_time:165127ms step_avg:114.99ms | |
step:1437/1770 train_time:165256ms step_avg:115.00ms | |
step:1438/1770 train_time:165384ms step_avg:115.01ms | |
step:1439/1770 train_time:165512ms step_avg:115.02ms | |
step:1440/1770 train_time:165639ms step_avg:115.03ms | |
step:1441/1770 train_time:165771ms step_avg:115.04ms | |
step:1442/1770 train_time:165899ms step_avg:115.05ms | |
step:1443/1770 train_time:166027ms step_avg:115.06ms | |
step:1444/1770 train_time:166158ms step_avg:115.07ms | |
step:1445/1770 train_time:166289ms step_avg:115.08ms | |
step:1446/1770 train_time:166421ms step_avg:115.09ms | |
step:1447/1770 train_time:166553ms step_avg:115.10ms | |
step:1448/1770 train_time:166686ms step_avg:115.11ms | |
step:1449/1770 train_time:166822ms step_avg:115.13ms | |
step:1450/1770 train_time:166953ms step_avg:115.14ms | |
step:1451/1770 train_time:167086ms step_avg:115.15ms | |
step:1452/1770 train_time:167219ms step_avg:115.16ms | |
step:1453/1770 train_time:167351ms step_avg:115.18ms | |
step:1454/1770 train_time:167482ms step_avg:115.19ms | |
step:1455/1770 train_time:167618ms step_avg:115.20ms | |
step:1456/1770 train_time:167753ms step_avg:115.21ms | |
step:1457/1770 train_time:167886ms step_avg:115.23ms | |
step:1458/1770 train_time:168019ms step_avg:115.24ms | |
step:1459/1770 train_time:168153ms step_avg:115.25ms | |
step:1460/1770 train_time:168284ms step_avg:115.26ms | |
step:1461/1770 train_time:168416ms step_avg:115.27ms | |
step:1462/1770 train_time:168548ms step_avg:115.29ms | |
step:1463/1770 train_time:168680ms step_avg:115.30ms | |
step:1464/1770 train_time:168814ms step_avg:115.31ms | |
step:1465/1770 train_time:168948ms step_avg:115.32ms | |
step:1466/1770 train_time:169084ms step_avg:115.34ms | |
step:1467/1770 train_time:169221ms step_avg:115.35ms | |
step:1468/1770 train_time:169355ms step_avg:115.36ms | |
step:1469/1770 train_time:169487ms step_avg:115.38ms | |
step:1470/1770 train_time:169617ms step_avg:115.39ms | |
step:1471/1770 train_time:169751ms step_avg:115.40ms | |
step:1472/1770 train_time:169884ms step_avg:115.41ms | |
step:1473/1770 train_time:170018ms step_avg:115.42ms | |
step:1474/1770 train_time:170152ms step_avg:115.44ms | |
step:1475/1770 train_time:170283ms step_avg:115.45ms | |
step:1476/1770 train_time:170414ms step_avg:115.46ms | |
step:1477/1770 train_time:170551ms step_avg:115.47ms | |
step:1478/1770 train_time:170686ms step_avg:115.48ms | |
step:1479/1770 train_time:170819ms step_avg:115.50ms | |
step:1480/1770 train_time:170950ms step_avg:115.51ms | |
step:1481/1770 train_time:171089ms step_avg:115.52ms | |
step:1482/1770 train_time:171220ms step_avg:115.53ms | |
step:1483/1770 train_time:171352ms step_avg:115.54ms | |
step:1484/1770 train_time:171483ms step_avg:115.55ms | |
step:1485/1770 train_time:171616ms step_avg:115.57ms | |
step:1486/1770 train_time:171748ms step_avg:115.58ms | |
step:1487/1770 train_time:171880ms step_avg:115.59ms | |
step:1488/1770 train_time:172013ms step_avg:115.60ms | |
step:1489/1770 train_time:172149ms step_avg:115.61ms | |
step:1490/1770 train_time:172281ms step_avg:115.62ms | |
step:1491/1770 train_time:172415ms step_avg:115.64ms | |
step:1492/1770 train_time:172548ms step_avg:115.65ms | |
step:1493/1770 train_time:172686ms step_avg:115.66ms | |
step:1494/1770 train_time:172825ms step_avg:115.68ms | |
step:1495/1770 train_time:172957ms step_avg:115.69ms | |
step:1496/1770 train_time:173089ms step_avg:115.70ms | |
step:1497/1770 train_time:173222ms step_avg:115.71ms | |
step:1498/1770 train_time:173353ms step_avg:115.72ms | |
step:1499/1770 train_time:173484ms step_avg:115.73ms | |
step:1500/1770 train_time:173614ms step_avg:115.74ms | |
step:1500/1770 val_loss:3.3418 train_time:173746ms step_avg:115.83ms | |
step:1501/1770 train_time:173764ms step_avg:115.77ms | |
step:1502/1770 train_time:173887ms step_avg:115.77ms | |
step:1503/1770 train_time:174016ms step_avg:115.78ms | |
step:1504/1770 train_time:174149ms step_avg:115.79ms | |
step:1505/1770 train_time:174286ms step_avg:115.80ms | |
step:1506/1770 train_time:174420ms step_avg:115.82ms | |
step:1507/1770 train_time:174553ms step_avg:115.83ms | |
step:1508/1770 train_time:174687ms step_avg:115.84ms | |
step:1509/1770 train_time:174819ms step_avg:115.85ms | |
step:1510/1770 train_time:174951ms step_avg:115.86ms | |
step:1511/1770 train_time:175085ms step_avg:115.87ms | |
step:1512/1770 train_time:175216ms step_avg:115.88ms | |
step:1513/1770 train_time:175350ms step_avg:115.90ms | |
step:1514/1770 train_time:175483ms step_avg:115.91ms | |
step:1515/1770 train_time:175616ms step_avg:115.92ms | |
step:1516/1770 train_time:175750ms step_avg:115.93ms | |
step:1517/1770 train_time:175883ms step_avg:115.94ms | |
step:1518/1770 train_time:176019ms step_avg:115.95ms | |
step:1519/1770 train_time:176149ms step_avg:115.96ms | |
step:1520/1770 train_time:176284ms step_avg:115.98ms | |
step:1521/1770 train_time:176416ms step_avg:115.99ms | |
step:1522/1770 train_time:176548ms step_avg:116.00ms | |
step:1523/1770 train_time:176684ms step_avg:116.01ms | |
step:1524/1770 train_time:176815ms step_avg:116.02ms | |
step:1525/1770 train_time:176947ms step_avg:116.03ms | |
step:1526/1770 train_time:177080ms step_avg:116.04ms | |
step:1527/1770 train_time:177211ms step_avg:116.05ms | |
step:1528/1770 train_time:177348ms step_avg:116.07ms | |
step:1529/1770 train_time:177480ms step_avg:116.08ms | |
step:1530/1770 train_time:177614ms step_avg:116.09ms | |
step:1531/1770 train_time:177747ms step_avg:116.10ms | |
step:1532/1770 train_time:177881ms step_avg:116.11ms | |
step:1533/1770 train_time:178015ms step_avg:116.12ms | |
step:1534/1770 train_time:178148ms step_avg:116.13ms | |
step:1535/1770 train_time:178279ms step_avg:116.14ms | |
step:1536/1770 train_time:178412ms step_avg:116.15ms | |
step:1537/1770 train_time:178547ms step_avg:116.17ms | |
step:1538/1770 train_time:178682ms step_avg:116.18ms | |
step:1539/1770 train_time:178812ms step_avg:116.19ms | |
step:1540/1770 train_time:178949ms step_avg:116.20ms | |
step:1541/1770 train_time:179085ms step_avg:116.21ms | |
step:1542/1770 train_time:179219ms step_avg:116.23ms | |
step:1543/1770 train_time:179352ms step_avg:116.24ms | |
step:1544/1770 train_time:179488ms step_avg:116.25ms | |
step:1545/1770 train_time:179620ms step_avg:116.26ms | |
step:1546/1770 train_time:179754ms step_avg:116.27ms | |
step:1547/1770 train_time:179885ms step_avg:116.28ms | |
step:1548/1770 train_time:180017ms step_avg:116.29ms | |
step:1549/1770 train_time:180149ms step_avg:116.30ms | |
step:1550/1770 train_time:180282ms step_avg:116.31ms | |
step:1551/1770 train_time:180412ms step_avg:116.32ms | |
step:1552/1770 train_time:180550ms step_avg:116.33ms | |
step:1553/1770 train_time:180683ms step_avg:116.34ms | |
step:1554/1770 train_time:180814ms step_avg:116.35ms | |
step:1555/1770 train_time:180949ms step_avg:116.37ms | |
step:1556/1770 train_time:181080ms step_avg:116.38ms | |
step:1557/1770 train_time:181213ms step_avg:116.39ms | |
step:1558/1770 train_time:181346ms step_avg:116.40ms | |
step:1559/1770 train_time:181478ms step_avg:116.41ms | |
step:1560/1770 train_time:181609ms step_avg:116.42ms | |
step:1561/1770 train_time:181747ms step_avg:116.43ms | |
step:1562/1770 train_time:181880ms step_avg:116.44ms | |
step:1563/1770 train_time:182012ms step_avg:116.45ms | |
step:1564/1770 train_time:182145ms step_avg:116.46ms | |
step:1565/1770 train_time:182276ms step_avg:116.47ms | |
step:1566/1770 train_time:182407ms step_avg:116.48ms | |
step:1567/1770 train_time:182539ms step_avg:116.49ms | |
step:1568/1770 train_time:182672ms step_avg:116.50ms | |
step:1569/1770 train_time:182809ms step_avg:116.51ms | |
step:1570/1770 train_time:182943ms step_avg:116.52ms | |
step:1571/1770 train_time:183075ms step_avg:116.53ms | |
step:1572/1770 train_time:183210ms step_avg:116.55ms | |
step:1573/1770 train_time:183346ms step_avg:116.56ms | |
step:1574/1770 train_time:183479ms step_avg:116.57ms | |
step:1575/1770 train_time:183610ms step_avg:116.58ms | |
step:1576/1770 train_time:183742ms step_avg:116.59ms | |
step:1577/1770 train_time:183878ms step_avg:116.60ms | |
step:1578/1770 train_time:184014ms step_avg:116.61ms | |
step:1579/1770 train_time:184145ms step_avg:116.62ms | |
step:1580/1770 train_time:184277ms step_avg:116.63ms | |
step:1581/1770 train_time:184414ms step_avg:116.64ms | |
step:1582/1770 train_time:184549ms step_avg:116.66ms | |
step:1583/1770 train_time:184682ms step_avg:116.67ms | |
step:1584/1770 train_time:184814ms step_avg:116.68ms | |
step:1585/1770 train_time:184947ms step_avg:116.69ms | |
step:1586/1770 train_time:185085ms step_avg:116.70ms | |
step:1587/1770 train_time:185221ms step_avg:116.71ms | |
step:1588/1770 train_time:185353ms step_avg:116.72ms | |
step:1589/1770 train_time:185490ms step_avg:116.73ms | |
step:1590/1770 train_time:185623ms step_avg:116.74ms | |
step:1591/1770 train_time:185754ms step_avg:116.75ms | |
step:1592/1770 train_time:185891ms step_avg:116.77ms | |
step:1593/1770 train_time:186024ms step_avg:116.78ms | |
step:1594/1770 train_time:186157ms step_avg:116.79ms | |
step:1595/1770 train_time:186289ms step_avg:116.80ms | |
step:1596/1770 train_time:186424ms step_avg:116.81ms | |
step:1597/1770 train_time:186556ms step_avg:116.82ms | |
step:1598/1770 train_time:186690ms step_avg:116.83ms | |
step:1599/1770 train_time:186825ms step_avg:116.84ms | |
step:1600/1770 train_time:186965ms step_avg:116.85ms | |
step:1601/1770 train_time:187101ms step_avg:116.87ms | |
step:1602/1770 train_time:187236ms step_avg:116.88ms | |
step:1603/1770 train_time:187369ms step_avg:116.89ms | |
step:1604/1770 train_time:187500ms step_avg:116.90ms | |
step:1605/1770 train_time:187632ms step_avg:116.90ms | |
step:1606/1770 train_time:187766ms step_avg:116.92ms | |
step:1607/1770 train_time:187906ms step_avg:116.93ms | |
step:1608/1770 train_time:188038ms step_avg:116.94ms | |
step:1609/1770 train_time:188171ms step_avg:116.95ms | |
step:1610/1770 train_time:188307ms step_avg:116.96ms | |
step:1611/1770 train_time:188442ms step_avg:116.97ms | |
step:1612/1770 train_time:188577ms step_avg:116.98ms | |
step:1613/1770 train_time:188710ms step_avg:116.99ms | |
step:1614/1770 train_time:188842ms step_avg:117.00ms | |
step:1615/1770 train_time:188977ms step_avg:117.01ms | |
step:1616/1770 train_time:189110ms step_avg:117.02ms | |
step:1617/1770 train_time:189246ms step_avg:117.04ms | |
step:1618/1770 train_time:189382ms step_avg:117.05ms | |
step:1619/1770 train_time:189517ms step_avg:117.06ms | |
step:1620/1770 train_time:189652ms step_avg:117.07ms | |
step:1621/1770 train_time:189786ms step_avg:117.08ms | |
step:1622/1770 train_time:189920ms step_avg:117.09ms | |
step:1623/1770 train_time:190059ms step_avg:117.10ms | |
step:1624/1770 train_time:190192ms step_avg:117.11ms | |
step:1625/1770 train_time:190324ms step_avg:117.12ms | |
step:1625/1770 val_loss:3.3071 train_time:190456ms step_avg:117.20ms | |
step:1626/1770 train_time:190475ms step_avg:117.14ms | |
step:1627/1770 train_time:190590ms step_avg:117.14ms | |
step:1628/1770 train_time:190721ms step_avg:117.15ms | |
step:1629/1770 train_time:190852ms step_avg:117.16ms | |
step:1630/1770 train_time:190984ms step_avg:117.17ms | |
step:1631/1770 train_time:191116ms step_avg:117.18ms | |
step:1632/1770 train_time:191248ms step_avg:117.19ms | |
step:1633/1770 train_time:191383ms step_avg:117.20ms | |
step:1634/1770 train_time:191515ms step_avg:117.21ms | |
step:1635/1770 train_time:191648ms step_avg:117.22ms | |
step:1636/1770 train_time:191781ms step_avg:117.23ms | |
step:1637/1770 train_time:191916ms step_avg:117.24ms | |
step:1638/1770 train_time:192047ms step_avg:117.24ms | |
step:1639/1770 train_time:192181ms step_avg:117.26ms | |
step:1640/1770 train_time:192315ms step_avg:117.27ms | |
step:1641/1770 train_time:192447ms step_avg:117.27ms | |
step:1642/1770 train_time:192579ms step_avg:117.28ms | |
step:1643/1770 train_time:192712ms step_avg:117.29ms | |
step:1644/1770 train_time:192847ms step_avg:117.30ms | |
step:1645/1770 train_time:192979ms step_avg:117.31ms | |
step:1646/1770 train_time:193115ms step_avg:117.32ms | |
step:1647/1770 train_time:193249ms step_avg:117.33ms | |
step:1648/1770 train_time:193380ms step_avg:117.34ms | |
step:1649/1770 train_time:193513ms step_avg:117.35ms | |
step:1650/1770 train_time:193646ms step_avg:117.36ms | |
step:1651/1770 train_time:193777ms step_avg:117.37ms | |
step:1652/1770 train_time:193911ms step_avg:117.38ms | |
step:1653/1770 train_time:194044ms step_avg:117.39ms | |
step:1654/1770 train_time:194181ms step_avg:117.40ms | |
step:1655/1770 train_time:194319ms step_avg:117.41ms | |
step:1656/1770 train_time:194452ms step_avg:117.42ms | |
step:1657/1770 train_time:194587ms step_avg:117.43ms | |
step:1658/1770 train_time:194720ms step_avg:117.44ms | |
step:1659/1770 train_time:194856ms step_avg:117.45ms | |
step:1660/1770 train_time:194990ms step_avg:117.46ms | |
step:1661/1770 train_time:195125ms step_avg:117.47ms | |
step:1662/1770 train_time:195259ms step_avg:117.48ms | |
step:1663/1770 train_time:195392ms step_avg:117.49ms | |
step:1664/1770 train_time:195525ms step_avg:117.50ms | |
step:1665/1770 train_time:195656ms step_avg:117.51ms | |
step:1666/1770 train_time:195790ms step_avg:117.52ms | |
step:1667/1770 train_time:195922ms step_avg:117.53ms | |
step:1668/1770 train_time:196054ms step_avg:117.54ms | |
step:1669/1770 train_time:196184ms step_avg:117.55ms | |
step:1670/1770 train_time:196316ms step_avg:117.55ms | |
step:1671/1770 train_time:196452ms step_avg:117.57ms | |
step:1672/1770 train_time:196585ms step_avg:117.57ms | |
step:1673/1770 train_time:196721ms step_avg:117.59ms | |
step:1674/1770 train_time:196853ms step_avg:117.59ms | |
step:1675/1770 train_time:196985ms step_avg:117.60ms | |
step:1676/1770 train_time:197119ms step_avg:117.61ms | |
step:1677/1770 train_time:197258ms step_avg:117.63ms | |
step:1678/1770 train_time:197389ms step_avg:117.63ms | |
step:1679/1770 train_time:197523ms step_avg:117.64ms | |
step:1680/1770 train_time:197657ms step_avg:117.65ms | |
step:1681/1770 train_time:197791ms step_avg:117.66ms | |
step:1682/1770 train_time:197927ms step_avg:117.67ms | |
step:1683/1770 train_time:198058ms step_avg:117.68ms | |
step:1684/1770 train_time:198189ms step_avg:117.69ms | |
step:1685/1770 train_time:198321ms step_avg:117.70ms | |
step:1686/1770 train_time:198456ms step_avg:117.71ms | |
step:1687/1770 train_time:198592ms step_avg:117.72ms | |
step:1688/1770 train_time:198726ms step_avg:117.73ms | |
step:1689/1770 train_time:198859ms step_avg:117.74ms | |
step:1690/1770 train_time:198992ms step_avg:117.75ms | |
step:1691/1770 train_time:199126ms step_avg:117.76ms | |
step:1692/1770 train_time:199262ms step_avg:117.77ms | |
step:1693/1770 train_time:199398ms step_avg:117.78ms | |
step:1694/1770 train_time:199530ms step_avg:117.79ms | |
step:1695/1770 train_time:199664ms step_avg:117.80ms | |
step:1696/1770 train_time:199800ms step_avg:117.81ms | |
step:1697/1770 train_time:199939ms step_avg:117.82ms | |
step:1698/1770 train_time:200074ms step_avg:117.83ms | |
step:1699/1770 train_time:200206ms step_avg:117.84ms | |
step:1700/1770 train_time:200337ms step_avg:117.85ms | |
step:1701/1770 train_time:200471ms step_avg:117.85ms | |
step:1702/1770 train_time:200604ms step_avg:117.86ms | |
step:1703/1770 train_time:200736ms step_avg:117.87ms | |
step:1704/1770 train_time:200868ms step_avg:117.88ms | |
step:1705/1770 train_time:201000ms step_avg:117.89ms | |
step:1706/1770 train_time:201133ms step_avg:117.90ms | |
step:1707/1770 train_time:201268ms step_avg:117.91ms | |
step:1708/1770 train_time:201403ms step_avg:117.92ms | |
step:1709/1770 train_time:201541ms step_avg:117.93ms | |
step:1710/1770 train_time:201684ms step_avg:117.94ms | |
step:1711/1770 train_time:201825ms step_avg:117.96ms | |
step:1712/1770 train_time:201962ms step_avg:117.97ms | |
step:1713/1770 train_time:202095ms step_avg:117.98ms | |
step:1714/1770 train_time:202231ms step_avg:117.99ms | |
step:1715/1770 train_time:202365ms step_avg:118.00ms | |
step:1716/1770 train_time:202502ms step_avg:118.01ms | |
step:1717/1770 train_time:202637ms step_avg:118.02ms | |
step:1718/1770 train_time:202775ms step_avg:118.03ms | |
step:1719/1770 train_time:202912ms step_avg:118.04ms | |
step:1720/1770 train_time:203050ms step_avg:118.05ms | |
step:1721/1770 train_time:203186ms step_avg:118.06ms | |
step:1722/1770 train_time:203326ms step_avg:118.08ms | |
step:1723/1770 train_time:203467ms step_avg:118.09ms | |
step:1724/1770 train_time:203607ms step_avg:118.10ms | |
step:1725/1770 train_time:203747ms step_avg:118.11ms | |
step:1726/1770 train_time:203889ms step_avg:118.13ms | |
step:1727/1770 train_time:204024ms step_avg:118.14ms | |
step:1728/1770 train_time:204164ms step_avg:118.15ms | |
step:1729/1770 train_time:204300ms step_avg:118.16ms | |
step:1730/1770 train_time:204439ms step_avg:118.17ms | |
step:1731/1770 train_time:204580ms step_avg:118.19ms | |
step:1732/1770 train_time:204714ms step_avg:118.20ms | |
step:1733/1770 train_time:204855ms step_avg:118.21ms | |
step:1734/1770 train_time:204988ms step_avg:118.22ms | |
step:1735/1770 train_time:205127ms step_avg:118.23ms | |
step:1736/1770 train_time:205262ms step_avg:118.24ms | |
step:1737/1770 train_time:205396ms step_avg:118.25ms | |
step:1738/1770 train_time:205534ms step_avg:118.26ms | |
step:1739/1770 train_time:205669ms step_avg:118.27ms | |
step:1740/1770 train_time:205803ms step_avg:118.28ms | |
step:1741/1770 train_time:205946ms step_avg:118.29ms | |
step:1742/1770 train_time:206087ms step_avg:118.30ms | |
step:1743/1770 train_time:206225ms step_avg:118.32ms | |
step:1744/1770 train_time:206361ms step_avg:118.33ms | |
step:1745/1770 train_time:206497ms step_avg:118.34ms | |
step:1746/1770 train_time:206640ms step_avg:118.35ms | |
step:1747/1770 train_time:206775ms step_avg:118.36ms | |
step:1748/1770 train_time:206917ms step_avg:118.37ms | |
step:1749/1770 train_time:207056ms step_avg:118.39ms | |
step:1750/1770 train_time:207191ms step_avg:118.39ms | |
step:1750/1770 val_loss:3.2806 train_time:207327ms step_avg:118.47ms | |
step:1751/1770 train_time:207346ms step_avg:118.42ms | |
step:1752/1770 train_time:207466ms step_avg:118.42ms | |
step:1753/1770 train_time:207603ms step_avg:118.43ms | |
step:1754/1770 train_time:207739ms step_avg:118.44ms | |
step:1755/1770 train_time:207876ms step_avg:118.45ms | |
step:1756/1770 train_time:208014ms step_avg:118.46ms | |
step:1757/1770 train_time:208150ms step_avg:118.47ms | |
step:1758/1770 train_time:208286ms step_avg:118.48ms | |
step:1759/1770 train_time:208423ms step_avg:118.49ms | |
step:1760/1770 train_time:208558ms step_avg:118.50ms | |
step:1761/1770 train_time:208698ms step_avg:118.51ms | |
step:1762/1770 train_time:208841ms step_avg:118.53ms | |
step:1763/1770 train_time:208975ms step_avg:118.53ms | |
step:1764/1770 train_time:209111ms step_avg:118.54ms | |
step:1765/1770 train_time:209248ms step_avg:118.55ms | |
step:1766/1770 train_time:209390ms step_avg:118.57ms | |
step:1767/1770 train_time:209524ms step_avg:118.58ms | |
step:1768/1770 train_time:209662ms step_avg:118.59ms | |
step:1769/1770 train_time:209796ms step_avg:118.60ms | |
step:1770/1770 train_time:209930ms step_avg:118.60ms | |
step:1770/1770 val_loss:3.2774 train_time:210068ms step_avg:118.68ms | |
peak memory allocated: 30724 MiB reserved: 45392 MiB |
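Note: the step_avg column in each log line above is simply the cumulative train_time divided by the step number (e.g. 209930 ms / 1770 steps ≈ 118.60 ms); the jumps on val_loss lines come from the extra validation wall-clock time being folded into the same total. Below is a minimal sketch of a checker for that invariant, assuming the log has been saved to a plain-text file; the check_log helper and its regex are hypothetical and not part of the training script.

import re

# Matches plain training lines like:
#   "step:1381/1770 train_time:158053ms step_avg:114.45ms"
# val_loss lines have extra text between the step and train_time fields,
# so this pattern deliberately skips them.
LINE_RE = re.compile(r"step:(\d+)/\d+ train_time:(\d+)ms step_avg:([\d.]+)ms")

def check_log(path: str) -> None:
    with open(path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m is None:
                continue  # skip val_loss lines and the peak-memory summary
            step, train_time, step_avg = int(m[1]), int(m[2]), float(m[3])
            # step_avg is cumulative wall-clock time / step count,
            # printed rounded to two decimals, hence the 0.01 tolerance
            assert abs(train_time / step - step_avg) < 0.01, line

check_log("nanogpt_slowrun.log")  # hypothetical filename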