Created
January 18, 2024 00:18
-
-
Save vanbasten23/4a1ddddf615b4e0d78767ae936e6fb25 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(.venv) root@67df528db184:~# accelerate test | |
Running: accelerate-launch /root/accelerate/src/accelerate/test_utils/scripts/test_script.py | |
stderr: WARNING:root:Unsupported nprocs (4), ignoring... | |
stderr: E0118 00:10:43.499819043 110902 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
stderr: "error": "invalid_grant", | |
stderr: "error_description": "reauth related error (invalid_rapt)", | |
stderr: "error_uri": "https://support.google.com/a/answer/9368756", | |
stderr: "error_subtype": "invalid_rapt" | |
stderr: }]. | |
stderr: E0118 00:10:43.500200794 111869 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
stderr: "error": "invalid_grant", | |
stderr: "error_description": "reauth related error (invalid_rapt)", | |
stderr: "error_uri": "https://support.google.com/a/answer/9368756", | |
stderr: "error_subtype": "invalid_rapt" | |
stderr: }]. | |
stderr: E0118 00:10:43.511760329 110904 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
stderr: "error": "invalid_grant", | |
stderr: "error_description": "reauth related error (invalid_rapt)", | |
stderr: "error_uri": "https://support.google.com/a/answer/9368756", | |
stderr: "error_subtype": "invalid_rapt" | |
stderr: }]. | |
stderr: E0118 00:10:43.512661948 110903 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
stderr: "error": "invalid_grant", | |
stderr: "error_description": "reauth related error (invalid_rapt)", | |
stderr: "error_uri": "https://support.google.com/a/answer/9368756", | |
stderr: "error_subtype": "invalid_rapt" | |
stderr: }]. | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 2 | |
stdout: Local process index: 4 | |
stdout: Device: xla:0 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 3 | |
stdout: Local process index: 5 | |
stdout: Device: xla:1 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: 2 3 tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:1') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:0') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: **Initialization** | |
stdout: Testing, testing. 1, 2, 3. | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 0 | |
stdout: Local process index: 0 | |
stdout: Device: xla:0 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 1 | |
stdout: Local process index: 1 | |
stdout: Device: xla:1 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: | |
stdout: **Test process execution** | |
stdout: | |
stdout: **Test split between processes as a list** | |
stdout: | |
stdout: **Test split between processes as a dict** | |
stdout: | |
stdout: **Test split between processes as a tensor** | |
stdout: | |
stdout: **Test random number generator synchronization** | |
stdout: All rng are properly synched. | |
stdout: | |
stdout: **DataLoader integration test** | |
stdout: 1 0 tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:0') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:1') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: Non-shuffled dataloader passing. | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 4 | |
stdout: Local process index: 2 | |
stdout: Device: xla:0 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 5 | |
stdout: Local process index: 3 | |
stdout: Device: xla:1 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: 5 4 tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:1') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:0') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 6 | |
stdout: Local process index: 6 | |
stdout: Device: xla:0 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: Distributed environment: TPU | |
stdout: Num processes: 8 | |
stdout: Process index: 7 | |
stdout: Local process index: 7 | |
stdout: Device: xla:1 | |
stdout: | |
stdout: Mixed precision type: no | |
stdout: | |
stdout: 6 7 tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:1') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stdout: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, | |
stdout: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, | |
stdout: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, | |
stdout: 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, | |
stdout: 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
stdout: 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, | |
stdout: 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, | |
stdout: 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, | |
stdout: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, | |
stdout: 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
stdout: 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, | |
stdout: 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, | |
stdout: 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, | |
stdout: 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, | |
stdout: 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
stdout: 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, | |
stdout: 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, | |
stdout: 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
stdout: 252, 253, 254, 255], device='xla:0') <class 'accelerate.data_loader.MpDeviceLoaderWrapper'> | |
stderr: concurrent.futures.process._RemoteTraceback: | |
stderr: """ | |
stderr: Traceback (most recent call last): | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker | |
stderr: r = call_item.fn(*call_item.args, **call_item.kwargs) | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 198, in _process_chunk | |
stderr: return [fn(*args) for args in chunk] | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 198, in <listcomp> | |
stderr: return [fn(*args) for args in chunk] | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
stderr: return fn(*args, **kwargs) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 78, in _run_thread_per_device | |
stderr: replica_results = list( | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator | |
stderr: yield fs.pop().result() | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 444, in result | |
stderr: return self.__get_result() | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result | |
stderr: raise self._exception | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/thread.py", line 57, in run | |
stderr: result = self.fn(*self.args, **self.kwargs) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 71, in _thread_fn | |
stderr: return fn() | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 183, in __call__ | |
stderr: self.fn(runtime.global_ordinal(), *self.args, **self.kwargs) | |
stderr: File "/root/accelerate/src/accelerate/utils/launch.py", line 570, in __call__ | |
stderr: self.launcher(*args) | |
stderr: File "/root/accelerate/src/accelerate/test_utils/scripts/test_script.py", line 666, in main | |
stderr: dl_preparation_check() | |
stderr: File "/root/accelerate/src/accelerate/test_utils/scripts/test_script.py", line 205, in dl_preparation_check | |
stderr: assert result == list(range(length)), "Wrong shuffled dataloader result." | |
stderr: AssertionError: Wrong shuffled dataloader result. | |
stderr: """ | |
stderr: | |
stderr: The above exception was the direct cause of the following exception: | |
stderr: | |
stderr: Traceback (most recent call last): | |
stderr: File "/ansible/.venv/bin/accelerate-launch", line 8, in <module> | |
stderr: sys.exit(main()) | |
stderr: File "/root/accelerate/src/accelerate/commands/launch.py", line 1029, in main | |
stderr: launch_command(args) | |
stderr: File "/root/accelerate/src/accelerate/commands/launch.py", line 1019, in launch_command | |
stderr: tpu_launcher(args) | |
stderr: File "/root/accelerate/src/accelerate/commands/launch.py", line 762, in tpu_launcher | |
stderr: xmp.spawn(PrepareForLaunch(main_function), args=(), nprocs=args.num_processes) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
stderr: return fn(*args, **kwargs) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/distributed/xla_multiprocessing.py", line 38, in spawn | |
stderr: return pjrt.spawn(fn, nprocs, start_method, args) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 207, in spawn | |
stderr: run_multiprocess(spawn_fn, start_method=start_method) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
stderr: return fn(*args, **kwargs) | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 167, in run_multiprocess | |
stderr: replica_results = list( | |
stderr: File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 168, in <genexpr> | |
stderr: itertools.chain.from_iterable( | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 484, in _chain_from_iterable_of_lists | |
stderr: for element in iterable: | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator | |
stderr: yield fs.pop().result() | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 444, in result | |
stderr: return self.__get_result() | |
stderr: File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result | |
stderr: raise self._exception | |
stderr: AssertionError: Wrong shuffled dataloader result. | |
Traceback (most recent call last): | |
File "/ansible/.venv/bin/accelerate", line 8, in <module> | |
sys.exit(main()) | |
File "/root/accelerate/src/accelerate/commands/accelerate_cli.py", line 47, in main | |
args.func(args) | |
File "/root/accelerate/src/accelerate/commands/test.py", line 54, in test_command | |
result = execute_subprocess_async(cmd, env=os.environ.copy()) | |
File "/root/accelerate/src/accelerate/test_utils/testing.py", line 466, in execute_subprocess_async | |
raise RuntimeError( | |
RuntimeError: 'accelerate-launch /root/accelerate/src/accelerate/test_utils/scripts/test_script.py' failed with returncode 1 | |
The combined stderr from workers follows: | |
WARNING:root:Unsupported nprocs (4), ignoring... | |
E0118 00:10:43.499819043 110902 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
"error": "invalid_grant", | |
"error_description": "reauth related error (invalid_rapt)", | |
"error_uri": "https://support.google.com/a/answer/9368756", | |
"error_subtype": "invalid_rapt" | |
}]. | |
E0118 00:10:43.500200794 111869 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
"error": "invalid_grant", | |
"error_description": "reauth related error (invalid_rapt)", | |
"error_uri": "https://support.google.com/a/answer/9368756", | |
"error_subtype": "invalid_rapt" | |
}]. | |
E0118 00:10:43.511760329 110904 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
"error": "invalid_grant", | |
"error_description": "reauth related error (invalid_rapt)", | |
"error_uri": "https://support.google.com/a/answer/9368756", | |
"error_subtype": "invalid_rapt" | |
}]. | |
E0118 00:10:43.512661948 110903 oauth2_credentials.cc:176] Call to http server ended with error 400 [{ | |
"error": "invalid_grant", | |
"error_description": "reauth related error (invalid_rapt)", | |
"error_uri": "https://support.google.com/a/answer/9368756", | |
"error_subtype": "invalid_rapt" | |
}]. | |
concurrent.futures.process._RemoteTraceback: | |
""" | |
Traceback (most recent call last): | |
File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker | |
r = call_item.fn(*call_item.args, **call_item.kwargs) | |
File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 198, in _process_chunk | |
return [fn(*args) for args in chunk] | |
File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 198, in <listcomp> | |
return [fn(*args) for args in chunk] | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
return fn(*args, **kwargs) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 78, in _run_thread_per_device | |
replica_results = list( | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator | |
yield fs.pop().result() | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 444, in result | |
return self.__get_result() | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result | |
raise self._exception | |
File "/usr/local/lib/python3.8/concurrent/futures/thread.py", line 57, in run | |
result = self.fn(*self.args, **self.kwargs) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 71, in _thread_fn | |
return fn() | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 183, in __call__ | |
self.fn(runtime.global_ordinal(), *self.args, **self.kwargs) | |
File "/root/accelerate/src/accelerate/utils/launch.py", line 570, in __call__ | |
self.launcher(*args) | |
File "/root/accelerate/src/accelerate/test_utils/scripts/test_script.py", line 666, in main | |
dl_preparation_check() | |
File "/root/accelerate/src/accelerate/test_utils/scripts/test_script.py", line 205, in dl_preparation_check | |
assert result == list(range(length)), "Wrong shuffled dataloader result." | |
AssertionError: Wrong shuffled dataloader result. | |
""" | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "/ansible/.venv/bin/accelerate-launch", line 8, in <module> | |
sys.exit(main()) | |
File "/root/accelerate/src/accelerate/commands/launch.py", line 1029, in main | |
launch_command(args) | |
File "/root/accelerate/src/accelerate/commands/launch.py", line 1019, in launch_command | |
tpu_launcher(args) | |
File "/root/accelerate/src/accelerate/commands/launch.py", line 762, in tpu_launcher | |
xmp.spawn(PrepareForLaunch(main_function), args=(), nprocs=args.num_processes) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
return fn(*args, **kwargs) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/distributed/xla_multiprocessing.py", line 38, in spawn | |
return pjrt.spawn(fn, nprocs, start_method, args) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 207, in spawn | |
run_multiprocess(spawn_fn, start_method=start_method) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/runtime.py", line 88, in wrapper | |
return fn(*args, **kwargs) | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 167, in run_multiprocess | |
replica_results = list( | |
File "/ansible/.venv/lib/python3.8/site-packages/torch_xla/_internal/pjrt.py", line 168, in <genexpr> | |
itertools.chain.from_iterable( | |
File "/usr/local/lib/python3.8/concurrent/futures/process.py", line 484, in _chain_from_iterable_of_lists | |
for element in iterable: | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator | |
yield fs.pop().result() | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 444, in result | |
return self.__get_result() | |
File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result | |
raise self._exception | |
AssertionError: Wrong shuffled dataloader result. | |
(.venv) root@67df528db184:~# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment