kinoc

## llama4openai-api.py
# a simple Flask API to emulate OpenAI's using llama models and/or transformers
# runs on 3080

import sys
import time
import torch
import json
from peft import PeftModel

from flask import Flask, make_response, request, abort

## HF-GPT-J-6B-3080.py
# Running GPT-J-6B on a 3080 (semi-)direct from HuggingFace
#
# My laptop card has 15458 MiB free, but has problems loading the model directly from HF.
# This code fixes it using the "no_init" method by @kurumuz
# found in https://discord.com/channels/729741769192767510/851918317039255592/916463267264725083
# it both loads faster and fixes the intermediate out-of-memory error
# I now have 2564 MiB left over
#
# BEFORE LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]

## Contradiction_test1.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification # 4.0.1
import torch # 1.7

# use "contra activate dual" on laptop
# https://github.com/facebookresearch/ParlAI/issues/3391
# https://github.com/facebookresearch/ParlAI/issues/3665
# https://arxiv.org/abs/2012.13391
# https://huggingface.co/ynie/roberta-large_conv_contradiction_detector_v0

if __name__ == '__main__':

## j6b_train_hf_ds.py
#  So now you want to finetune that GPT-J-6B on a 3090/TITAN GPU ... okay
#  More exploratory coding. It uses the Huggingface model port, deepspeed and reads all text/md files from a target directory
#  It is a fragment of a larger system with remote editing, but that's another story
#  This is the raw, training tester. Items to look out for:
#  - uses DeepSpeed and has a DS config
#  - to save space uses SGD instead of ADAM
#  - uses gradient checkpointing
#  - freezes 25% of the layers to fit

# Assumes you can already run https://gist.github.com/kinoc/2d636a68876cd3de7b6e9c9452b61089

## jserv_hf_fast.py

# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
# Uses GDOWN to get the image
# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)

# Near Simplest Language model API, with room to expand!
# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
# change "seq" (which is the context size) to adjust footprint

## jserv.py
# Near Simplest Language model API, with room to expand!
# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
# change "seq" (which is the context size) to adjust footprint
#
# seq   vram usage
# 512   14.7G
# 900   15.3G

# uses FastAPI, so install that
# https://fastapi.tiangolo.com/tutorial/

## trainLMS_pub.py
#!/usr/bin/env python3
# Usage:
#  PYTHONPATH=src ./train --dataset <file|directory|glob>

# Got 1558M to train on a TITAN RTX using IBM Tensorflow_Large_Model_Support

# TFLMS can insert explicit swaps in the graph between GPU and CPU, to extend the memory
# But the graph has While_Loop, so you have to use the TFLMSv2 version (which works with TF 1.x)
#
# Download, expand, get the egg out and install.
	# a simple Flask API to emulate OpenAI's using llama models and/or transformers
	# runs on 3080

	import sys
	import time
	import torch
	import json
	from peft import PeftModel

	from flask import Flask, make_response, request, abort
	# Running GPT-J-6B on a 3080 (semi-)direct from HuggingFace
	#
	# My laptop card has 15458 MiB free, but has problems loading the model directly from HF.
	# This code fixes it using the "no_init" method by @kurumuz
	# found in https://discord.com/channels/729741769192767510/851918317039255592/916463267264725083
	# it both loads faster and fixes the intermediate out-of-memory error
	# I now have 2564 MiB left over
	#
	# BEFORE LOADING
	# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
	from transformers import AutoTokenizer, AutoModelForSequenceClassification # 4.0.1
	import torch # 1.7

	# use "contra activate dual" on laptop
	# https://github.com/facebookresearch/ParlAI/issues/3391
	# https://github.com/facebookresearch/ParlAI/issues/3665
	# https://arxiv.org/abs/2012.13391
	# https://huggingface.co/ynie/roberta-large_conv_contradiction_detector_v0

	if __name__ == '__main__':
	# So now you want to finetune that GPT-J-6B on a 3090/TITAN GPU ... okay
	# More exploratory coding. It uses the Huggingface model port, deepspeed and reads all text/md files from a target directory
	# It is a fragment of a larger system with remote editing, but that's another story
	# This is the raw, training tester. Items to look out for:
	# - uses DeepSpeed and has a DS config
	# - to save space uses SGD instead of ADAM
	# - uses gradient checkpointing
	# - freezes 25% of the layers to fit

	# Assumes you can already run https://gist.github.com/kinoc/2d636a68876cd3de7b6e9c9452b61089

	# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
	# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
	# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
	# Uses GDOWN to get the image
	# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)

	# Near Simplest Language model API, with room to expand!
	# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
	# change "seq" (which is the context size) to adjust footprint
	#!/usr/bin/env python3
	# Usage:
	# PYTHONPATH=src ./train --dataset <file|directory|glob>

	# Got 1558M to train on a TITAN RTX using IBM Tensorflow_Large_Model_Support

	# TFLMS can insert explicit swaps in the graph between GPU and CPU, to extend the memory
	# But the graph has While_Loop, so you have to use the TFLMSv2 version (which works with TF 1.x)
	#
	# Download, expand, get the egg out and install.