Skip to content

Instantly share code, notes, and snippets.

@yifuwang
Created January 11, 2024 21:37
Show Gist options
  • Save yifuwang/c4a9fefa3d1e1dc017744e094be57c6c to your computer and use it in GitHub Desktop.
Save yifuwang/c4a9fefa3d1e1dc017744e094be57c6c to your computer and use it in GitHub Desktop.
num_params=150 world_size=8 mixed=True Param size: 0.059 GB Copy bandwidth: 560.946 GB/s (gpu ms/iter: 0.105, cpu ms/iter 1.066)
num_params=54 world_size=8 mixed=True Param size: 1.453 GB Copy bandwidth: 732.657 GB/s (gpu ms/iter: 1.984, cpu ms/iter 0.417)
num_params=54 world_size=8 mixed=True Param size: 0.512 GB Copy bandwidth: 753.514 GB/s (gpu ms/iter: 0.679, cpu ms/iter 0.419)
num_params=50 world_size=8 mixed=True Param size: 0.200 GB Copy bandwidth: 719.400 GB/s (gpu ms/iter: 0.279, cpu ms/iter 0.410)
num_params=3 world_size=8 mixed=True Param size: 0.983 GB Copy bandwidth: 782.121 GB/s (gpu ms/iter: 1.257, cpu ms/iter 0.098)
num_params=9 world_size=8 mixed=True Param size: 0.802 GB Copy bandwidth: 766.458 GB/s (gpu ms/iter: 1.047, cpu ms/iter 0.134)
num_params=3 world_size=8 mixed=True Param size: 1.573 GB Copy bandwidth: 790.611 GB/s (gpu ms/iter: 1.989, cpu ms/iter 0.099)
num_params=9 world_size=8 mixed=True Param size: 2.248 GB Copy bandwidth: 789.754 GB/s (gpu ms/iter: 2.847, cpu ms/iter 0.138)
num_params=150 world_size=128 mixed=True Param size: 0.064 GB Copy bandwidth: 565.667 GB/s (gpu ms/iter: 0.113, cpu ms/iter 0.996)
num_params=54 world_size=128 mixed=True Param size: 1.458 GB Copy bandwidth: 670.681 GB/s (gpu ms/iter: 2.174, cpu ms/iter 0.289)
num_params=54 world_size=128 mixed=True Param size: 0.515 GB Copy bandwidth: 676.135 GB/s (gpu ms/iter: 0.762, cpu ms/iter 0.264)
num_params=50 world_size=128 mixed=True Param size: 0.203 GB Copy bandwidth: 662.603 GB/s (gpu ms/iter: 0.306, cpu ms/iter 0.249)
num_params=3 world_size=128 mixed=True Param size: 0.983 GB Copy bandwidth: 769.283 GB/s (gpu ms/iter: 1.278, cpu ms/iter 0.078)
num_params=9 world_size=128 mixed=True Param size: 0.802 GB Copy bandwidth: 761.057 GB/s (gpu ms/iter: 1.054, cpu ms/iter 0.104)
num_params=3 world_size=128 mixed=True Param size: 1.573 GB Copy bandwidth: 774.325 GB/s (gpu ms/iter: 2.031, cpu ms/iter 0.075)
num_params=9 world_size=128 mixed=True Param size: 2.248 GB Copy bandwidth: 773.048 GB/s (gpu ms/iter: 2.908, cpu ms/iter 0.099)
num_params=150 world_size=1024 mixed=True Param size: 0.202 GB Copy bandwidth: 641.405 GB/s (gpu ms/iter: 0.315, cpu ms/iter 0.616)
num_params=54 world_size=1024 mixed=True Param size: 1.524 GB Copy bandwidth: 646.772 GB/s (gpu ms/iter: 2.356, cpu ms/iter 0.276)
num_params=54 world_size=1024 mixed=True Param size: 0.575 GB Copy bandwidth: 658.157 GB/s (gpu ms/iter: 0.874, cpu ms/iter 0.278)
num_params=50 world_size=1024 mixed=True Param size: 0.246 GB Copy bandwidth: 642.032 GB/s (gpu ms/iter: 0.383, cpu ms/iter 0.245)
num_params=3 world_size=1024 mixed=True Param size: 1.007 GB Copy bandwidth: 728.990 GB/s (gpu ms/iter: 1.381, cpu ms/iter 0.080)
num_params=9 world_size=1024 mixed=True Param size: 0.818 GB Copy bandwidth: 689.763 GB/s (gpu ms/iter: 1.186, cpu ms/iter 0.102)
num_params=3 world_size=1024 mixed=True Param size: 1.611 GB Copy bandwidth: 765.507 GB/s (gpu ms/iter: 2.104, cpu ms/iter 0.078)
num_params=9 world_size=1024 mixed=True Param size: 2.248 GB Copy bandwidth: 757.626 GB/s (gpu ms/iter: 2.967, cpu ms/iter 0.106)
num_params=150 world_size=8 mixed=False Param size: 0.035 GB Copy bandwidth: 584.272 GB/s (gpu ms/iter: 0.060, cpu ms/iter 0.656)
num_params=54 world_size=8 mixed=False Param size: 0.961 GB Copy bandwidth: 728.234 GB/s (gpu ms/iter: 1.319, cpu ms/iter 0.264)
num_params=54 world_size=8 mixed=False Param size: 0.282 GB Copy bandwidth: 730.059 GB/s (gpu ms/iter: 0.386, cpu ms/iter 0.279)
num_params=50 world_size=8 mixed=False Param size: 0.149 GB Copy bandwidth: 670.899 GB/s (gpu ms/iter: 0.222, cpu ms/iter 0.274)
num_params=3 world_size=8 mixed=False Param size: 0.655 GB Copy bandwidth: 775.699 GB/s (gpu ms/iter: 0.845, cpu ms/iter 0.077)
num_params=9 world_size=8 mixed=False Param size: 0.634 GB Copy bandwidth: 773.612 GB/s (gpu ms/iter: 0.820, cpu ms/iter 0.112)
num_params=3 world_size=8 mixed=False Param size: 1.049 GB Copy bandwidth: 781.395 GB/s (gpu ms/iter: 1.342, cpu ms/iter 0.081)
num_params=9 world_size=8 mixed=False Param size: 1.711 GB Copy bandwidth: 789.156 GB/s (gpu ms/iter: 2.169, cpu ms/iter 0.116)
num_params=150 world_size=128 mixed=False Param size: 0.038 GB Copy bandwidth: 517.056 GB/s (gpu ms/iter: 0.073, cpu ms/iter 0.632)
num_params=54 world_size=128 mixed=False Param size: 0.963 GB Copy bandwidth: 684.246 GB/s (gpu ms/iter: 1.407, cpu ms/iter 0.294)
num_params=54 world_size=128 mixed=False Param size: 0.283 GB Copy bandwidth: 680.593 GB/s (gpu ms/iter: 0.416, cpu ms/iter 0.286)
num_params=50 world_size=128 mixed=False Param size: 0.151 GB Copy bandwidth: 682.197 GB/s (gpu ms/iter: 0.221, cpu ms/iter 0.255)
num_params=3 world_size=128 mixed=False Param size: 0.655 GB Copy bandwidth: 759.470 GB/s (gpu ms/iter: 0.863, cpu ms/iter 0.074)
num_params=9 world_size=128 mixed=False Param size: 0.634 GB Copy bandwidth: 765.694 GB/s (gpu ms/iter: 0.829, cpu ms/iter 0.094)
num_params=3 world_size=128 mixed=False Param size: 1.049 GB Copy bandwidth: 766.535 GB/s (gpu ms/iter: 1.368, cpu ms/iter 0.075)
num_params=9 world_size=128 mixed=False Param size: 1.711 GB Copy bandwidth: 787.608 GB/s (gpu ms/iter: 2.173, cpu ms/iter 0.105)
num_params=150 world_size=1024 mixed=False Param size: 0.122 GB Copy bandwidth: 640.203 GB/s (gpu ms/iter: 0.191, cpu ms/iter 0.668)
num_params=54 world_size=1024 mixed=False Param size: 1.000 GB Copy bandwidth: 713.947 GB/s (gpu ms/iter: 1.401, cpu ms/iter 0.274)
num_params=54 world_size=1024 mixed=False Param size: 0.318 GB Copy bandwidth: 642.855 GB/s (gpu ms/iter: 0.494, cpu ms/iter 0.276)
num_params=50 world_size=1024 mixed=False Param size: 0.185 GB Copy bandwidth: 643.297 GB/s (gpu ms/iter: 0.288, cpu ms/iter 0.262)
num_params=3 world_size=1024 mixed=False Param size: 0.671 GB Copy bandwidth: 690.626 GB/s (gpu ms/iter: 0.972, cpu ms/iter 0.078)
num_params=9 world_size=1024 mixed=False Param size: 0.645 GB Copy bandwidth: 754.431 GB/s (gpu ms/iter: 0.855, cpu ms/iter 0.109)
num_params=3 world_size=1024 mixed=False Param size: 1.074 GB Copy bandwidth: 769.985 GB/s (gpu ms/iter: 1.395, cpu ms/iter 0.080)
num_params=9 world_size=1024 mixed=False Param size: 1.711 GB Copy bandwidth: 766.337 GB/s (gpu ms/iter: 2.233, cpu ms/iter 0.103)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment