Skip to content

Instantly share code, notes, and snippets.

View cli99's full-sized avatar
🐼

Cheng Li cli99

🐼
View GitHub Profile
@cli99
cli99 / fsdp_actckpt.py
Last active January 30, 2024 05:31
TE + FSDP with act ckpt
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import argparse
from functools import partial
import contextlib
import torch
import torch.distributed as dist
import torch
import torch.nn as nn
from typing import Optional, Dict, Union, List
import math
from llmfoundry.models.layers.attention import GroupedQueryAttention
from contextlib import nullcontext
from omegaconf import OmegaConf as om
from einops import rearrange
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
@cli99
cli99 / fp8_memory_mpt_test.py
Last active July 15, 2024 07:24
fp8_memory_mpt_test
import datetime
import gc
import pathlib
import torch
from composer.utils import get_device
from omegaconf import OmegaConf as om
from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
from composer.core import Precision
from composer import Trainer
@cli99
cli99 / memory_stats.py
Last active October 26, 2023 23:02
torch.cuda.memory_stats
import torch
import torch.nn as nn
from typing import Optional,Dict,Union,List
from contextlib import nullcontext
_MEMORY_KEYS = {
'allocated_bytes.all.current': 'current_allocated_mem',
'active_bytes.all.current': 'current_active_mem',
'inactive_split_bytes.all.current': 'current_inactive_mem',
'reserved_bytes.all.current': 'current_reserved_mem',
@cli99
cli99 / fsdp_mnist.py
Created September 26, 2023 18:08
fsdp mnist
# Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
@cli99
cli99 / nsight.sh
Created February 27, 2023 22:43 — forked from mcarilli/nsight.sh
Favorite nsight systems profiling commands for Pytorch scripts
# This isn't supposed to run as a bash script; I named it ".sh" for syntax highlighting.
# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html
# My preferred nsys (command line executable used to create profiles) commands
#
# In your script, write
# torch.cuda.nvtx.range_push("region name")
# ...
@cli99
cli99 / ipc_demo.py
Created September 30, 2022 18:41 — forked from lebedov/ipc_demo.py
Demonstrate how to pass IPC handles to GPU data between processes in Python
#!/usr/bin/env python
"""
Demonstrate how to pass IPC handles to GPU data between processes in Python.
"""
import ctypes
import numpy as np
import multiprocessing as mp
import zmq
from csv import QUOTE_NONE
from platform import libc_ver
from transformers import pipeline
import transformers
import deepspeed
import torch
import os
from transformers import BertLayer
from deepspeed.module_inject.replace_policy import HFBertLayerPolicy
from transformers import pipeline, AutoModel, AutoTokenizer, AutoConfig
import transformers
import deepspeed
import torch
import os
# Distributed-launch configuration from the environment: this process's rank
# on its node and the total number of processes, with safe single-process
# defaults ('0' and '1') when the launcher did not set them.
local_rank = int(os.environ.get('LOCAL_RANK', '0'))
world_size = int(os.environ.get('WORLD_SIZE', '1'))
def see_memory_usage():
@cli99
cli99 / gist:c95f9ae4a79adab64ebb9d8aedebd52b
Last active September 20, 2022 21:01
ds gpt2 test ar prior
import deepspeed
import transformers
import torch
import datetime
import os
import time
from transformers import pipeline, AutoModel, AutoTokenizer, AutoConfig
import sys
def run_model(name="gpt2", enable_cuda_graph=False, batch=2, seq=65):