yifuwang/gist:6b7ea7ff90b070b7518c7b4c20d582b9

## gistfile1.txt
tensor([0.0002]) tensor([0], dtype=torch.int32)
tensor([0.0002]) tensor([-1], dtype=torch.int32)
WARNING:root:Could not determine LOCAL_WORLD_SIZE from environment, falling back to WORLD_SIZE.
WARNING:root:Could not determine LOCAL_WORLD_SIZE from environment, falling back to WORLD_SIZE.
torch.float32
torch.float32
old weight tensor([[-0.0285,  0.0301,  0.0173,  ..., -0.0305, -0.0288, -0.0027],
        [-0.0224, -0.0263,  0.0212,  ..., -0.0249,  0.0071, -0.0202],
        [ 0.0125,  0.0225,  0.0154,  ..., -0.0155, -0.0169,  0.0253],
        ...,
        [ 0.0006,  0.0041,  0.0118,  ..., -0.0078,  0.0023, -0.0024],
        [-0.0161, -0.0222, -0.0155,  ..., -0.0104, -0.0312,  0.0072],
        [-0.0046,  0.0148, -0.0016,  ...,  0.0116,  0.0208, -0.0281]])
new weight tensor([[-0.0284,  0.0301,  0.0172,  ..., -0.0306, -0.0287, -0.0027],
        [-0.0223, -0.0262,  0.0211,  ..., -0.0250,  0.0071, -0.0201],
        [ 0.0125,  0.0225,  0.0154,  ..., -0.0154, -0.0169,  0.0252],
        ...,
        [ 0.0007,  0.0042,  0.0118,  ..., -0.0078,  0.0025, -0.0025],
        [-0.0162, -0.0223, -0.0154,  ..., -0.0103, -0.0311,  0.0071],
        [-0.0047,  0.0147, -0.0015,  ...,  0.0115,  0.0208, -0.0282]])
tensor([[-0.0284,  0.0301,  0.0172,  ...,  0.0230, -0.0120,  0.0135],
        [-0.0223, -0.0262,  0.0211,  ...,  0.0203,  0.0265, -0.0037],
        [ 0.0125,  0.0225,  0.0154,  ..., -0.0243,  0.0238, -0.0051],
        ...,
        [ 0.0120,  0.0056,  0.0135,  ...,  0.0179,  0.0039, -0.0037],
        [ 0.0088, -0.0150,  0.0076,  ...,  0.0287, -0.0270,  0.0145],
        [-0.0311,  0.0221, -0.0164,  ...,  0.0252,  0.0206,  0.0199]],
       grad_fn=<SplitLookupFunction_sgd_Op>>)
tensor([[-0.0285,  0.0301,  0.0173,  ...,  0.0231, -0.0121,  0.0136],
        [-0.0224, -0.0263,  0.0212,  ...,  0.0204,  0.0266, -0.0037],
        [ 0.0125,  0.0225,  0.0154,  ..., -0.0244,  0.0237, -0.0052],
        ...,
        [ 0.0121,  0.0057,  0.0134,  ...,  0.0178,  0.0040, -0.0038],
        [ 0.0087, -0.0150,  0.0076,  ...,  0.0286, -0.0269,  0.0145],
        [-0.0311,  0.0220, -0.0164,  ...,  0.0252,  0.0207,  0.0199]],
       grad_fn=<SplitLookupFunction_sgd_Op>>)
*******************
*******************
1049520
264696
0.25220672307340497
	tensor([0.0002]) tensor([0], dtype=torch.int32)
	tensor([0.0002]) tensor([-1], dtype=torch.int32)
	WARNING:root:Could not determine LOCAL_WORLD_SIZE from environment, falling back to WORLD_SIZE.
	WARNING:root:Could not determine LOCAL_WORLD_SIZE from environment, falling back to WORLD_SIZE.
	torch.float32
	torch.float32
	old weight tensor([[-0.0285, 0.0301, 0.0173, ..., -0.0305, -0.0288, -0.0027],
	[-0.0224, -0.0263, 0.0212, ..., -0.0249, 0.0071, -0.0202],
	[ 0.0125, 0.0225, 0.0154, ..., -0.0155, -0.0169, 0.0253],
	...,
	[ 0.0006, 0.0041, 0.0118, ..., -0.0078, 0.0023, -0.0024],
	[-0.0161, -0.0222, -0.0155, ..., -0.0104, -0.0312, 0.0072],
	[-0.0046, 0.0148, -0.0016, ..., 0.0116, 0.0208, -0.0281]])
	new weight tensor([[-0.0284, 0.0301, 0.0172, ..., -0.0306, -0.0287, -0.0027],
	[-0.0223, -0.0262, 0.0211, ..., -0.0250, 0.0071, -0.0201],
	[ 0.0125, 0.0225, 0.0154, ..., -0.0154, -0.0169, 0.0252],
	...,
	[ 0.0007, 0.0042, 0.0118, ..., -0.0078, 0.0025, -0.0025],
	[-0.0162, -0.0223, -0.0154, ..., -0.0103, -0.0311, 0.0071],
	[-0.0047, 0.0147, -0.0015, ..., 0.0115, 0.0208, -0.0282]])
	tensor([[-0.0284, 0.0301, 0.0172, ..., 0.0230, -0.0120, 0.0135],
	[-0.0223, -0.0262, 0.0211, ..., 0.0203, 0.0265, -0.0037],
	[ 0.0125, 0.0225, 0.0154, ..., -0.0243, 0.0238, -0.0051],
	...,
	[ 0.0120, 0.0056, 0.0135, ..., 0.0179, 0.0039, -0.0037],
	[ 0.0088, -0.0150, 0.0076, ..., 0.0287, -0.0270, 0.0145],
	[-0.0311, 0.0221, -0.0164, ..., 0.0252, 0.0206, 0.0199]],
	grad_fn=<SplitLookupFunction_sgd_Op>>)
	tensor([[-0.0285, 0.0301, 0.0173, ..., 0.0231, -0.0121, 0.0136],
	[-0.0224, -0.0263, 0.0212, ..., 0.0204, 0.0266, -0.0037],
	[ 0.0125, 0.0225, 0.0154, ..., -0.0244, 0.0237, -0.0052],
	...,
	[ 0.0121, 0.0057, 0.0134, ..., 0.0178, 0.0040, -0.0038],
	[ 0.0087, -0.0150, 0.0076, ..., 0.0286, -0.0269, 0.0145],
	[-0.0311, 0.0220, -0.0164, ..., 0.0252, 0.0207, 0.0199]],
	grad_fn=<SplitLookupFunction_sgd_Op>>)
	*******************
	*******************
	1049520
	264696
	0.25220672307340497