Keith Hon Keith-Hon

## distillator.py
## Imports
from typing import Tuple
import torch
from torch import Module, Tensor
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaConfig, RobertaModel, RobertaEncoder
from torch.nn import CrossEntropyLoss, CosineEmbeddingLoss


## Function

## jserv.py
# Near Simplest Language model API, with room to expand!
# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
# change "seq" (which is the context size) to adjust footprint
#
# seq   vram usage
# 512   14.7G
# 900   15.3G

# uses FastAPI, so install that
# https://fastapi.tiangolo.com/tutorial/

## j6b_train_hf_ds.py
#  So now you want to finetune that GPT-J-6B on a 3090/TITAN GPU ... okay
#  More exploratory coding. It uses the Huggingface model port, deepspeed and reads all text/md files from a target directory
#  It is a fragment of a larger system with remote editing, but that's another story
#  This is the raw, training tester. Items to look out for:
#  - uses DeepSpeed and has a DS config
#  - to save space uses SGD instead of ADAM
#  - uses gradient checkpointing
#  - freezes 25% of the layers to fit

# Assumes you can already run https://gist.github.com/kinoc/2d636a68876cd3de7b6e9c9452b61089
	## Imports
	from typing import Tuple
	import torch
	from torch import Module, Tensor
	from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaConfig, RobertaModel, RobertaEncoder
	from torch.nn import CrossEntropyLoss, CosineEmbeddingLoss



	## Function
	# Near Simplest Language model API, with room to expand!
	# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
	# change "seq" (which is the context size) to adjust footprint
	#
	# seq vram usage
	# 512 14.7G
	# 900 15.3G

	# uses FastAPI, so install that
	# https://fastapi.tiangolo.com/tutorial/
	# So now you want to finetune that GPT-J-6B on a 3090/TITAN GPU ... okay
	# More exploratory coding. It uses the Huggingface model port, deepspeed and reads all text/md files from a target directory
	# It is a fragment of a larger system with remote editing, but that's another story
	# This is the raw, training tester. Items to look out for:
	# - uses DeepSpeed and has a DS config
	# - to save space uses SGD instead of ADAM
	# - uses gradient checkpointing
	# - freezes 25% of the layers to fit

	# Assumes you can already run https://gist.github.com/kinoc/2d636a68876cd3de7b6e9c9452b61089