zhijiang zhijxu-MS

## dist.py
import torch
import torch.distributed as dist
import os

import torch.distributed

def init_process_group(backend=None, init_method='env://', world_size=1, rank=0):
    """
    Initialize the distributed process group.

## deepspeed-stage2.md

      
              3 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                zhijxu-MS
                / deepspeed-stage2.md
            
            
              Last active
              February 17, 2025 13:18
            
              
                project readings
              
          
    过程

每个 self.optimizer.param_group 里的所有 param 先 flatten，然后切分到各个 dp rank 上
self.single_partition_of_fp32_groups 即切分后属于该 dp rank 的 params
initialize_optimizer_states: 为每个 single_partition_of_fp32_groups 创建对应的 optimizer master weight 和 grad
self.round_robin_gradients
在切分前 shuffle params，这样 torch module 中相邻的 param 可以属于不同的 dp rank，从而 gradient allreduce 后每个 dp rank 都能有属于自己负责的 gradient

  
## 1.py
# from vllm;
class WorkerMonitor(threading.Thread):
    """Background (daemon) thread object that monitors worker status.

    Args:
        workers: The worker wrappers to monitor; stored as ``self.workers``.
        result_handler: The result handler associated with the workers;
            stored as ``self.result_handler``.
    """

    def __init__(self, workers: List['ProcessWorkerWrapper'],
                 result_handler: ResultHandler):
        # Initialise the underlying thread as a daemon thread.
        super().__init__(daemon=True)
        # Starts out False.
        # NOTE(review): presumably flipped when the monitor is closed —
        # the code that does so is not visible here.
        self._close = False
        self.result_handler = result_handler
        self.workers = workers

## 0. c++ basic.md

      
              11 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                zhijxu-MS
                / 0. c++ basic.md
            
            
              Last active
              September 23, 2025 11:23
            
              
                lc-exp
              
          
    c++ basic

// basic
auto ptr = new int;
auto p_ptr = &ptr;
*p_ptr = (*p_ptr)->val; // (**p_ptr).val, *的优先级低于->和.
numeric_limits<int>::max(); // 得到各个dtype的max value（需要 #include <limits>）
//每个val间隔去赋值vector
vector<int> a(n, 0);

  
## finetune_dolly.sh
#!/bin/bash

ds_config=`mktemp --suffix ".json"`
echo the deepspeed config is put at $ds_config
cat << EOF > $ds_config
{
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,

## cheat_sheet.txt
GDB commands by function - simple guide
---------------------------------------
More important commands have a (*) by them.

Startup
% gdb -help         	print startup help, show switches
*% gdb object      	normal debug
*% gdb object core 	core debug (must specify core file)
%% gdb object pid  	attach to running process
% gdb        		use file command to load object
	import torch
	import torch.distributed as dist
	import os

	import torch.distributed

	def init_process_group(backend=None, init_method='env://', world_size=1, rank=0):
	"""
	Initialize the distributed process group.
	# from vllm;
	class WorkerMonitor(threading.Thread):
	"""Monitor worker status (in background thread)"""

	def __init__(self, workers: List['ProcessWorkerWrapper'],
	result_handler: ResultHandler):
	super().__init__(daemon=True)
	self.workers = workers
	self.result_handler = result_handler
	self._close = False
	#!/bin/bash

	ds_config=`mktemp --suffix ".json"`
	echo the deepspeed config is put at $ds_config
	cat << EOF > $ds_config
	{
	"fp16": {
	"enabled": true,
	"loss_scale": 0,
	"loss_scale_window": 1000,
	GDB commands by function - simple guide
	---------------------------------------
	More important commands have a (*) by them.

	Startup
	% gdb -help print startup help, show switches
	*% gdb object normal debug
	*% gdb object core core debug (must specify core file)
	%% gdb object pid attach to running process
	% gdb use file command to load object