Yun Dai yundai424

## softcap_jsd_test_failure.txt
======================================================================= FAILURES ========================================================================
____________________________________ test_correctness_functional[30.0-1.0-0.5--100-0.5-dtype1-1e-05-0.0005-2-2-8-8] _____________________________________

B = 2, T = 2, H = 8, V = 8, scalar = 0.5, dtype = torch.float32, beta = 0.5, ignore_index = -100, temperature = 1.0, softcap = 30.0, atol = 1e-05
rtol = 0.0005

    @pytest.mark.parametrize(
        "B, T, H, V",
        [
            (2, 2, 8, 8),

## requirements.txt
torch==2.1
transformers==4.37.2
# NOTE: the fix ships in deepspeed==0.13.5; the 0.13.4 pin below predates it and does not include the fix
deepspeed==0.13.4
tokenizers==0.15.1
	======================================================================= FAILURES ========================================================================
	____________________________________ test_correctness_functional[30.0-1.0-0.5--100-0.5-dtype1-1e-05-0.0005-2-2-8-8] _____________________________________

	B = 2, T = 2, H = 8, V = 8, scalar = 0.5, dtype = torch.float32, beta = 0.5, ignore_index = -100, temperature = 1.0, softcap = 30.0, atol = 1e-05
	rtol = 0.0005

	@pytest.mark.parametrize(
	"B, T, H, V",
	[
	(2, 2, 8, 8),
	torch==2.1
	transformers==4.37.2
	# NOTE: the fix ships in deepspeed==0.13.5; the 0.13.4 pin below predates it and does not include the fix
	deepspeed==0.13.4
	tokenizers==0.15.1