kinoc/jserv_hf_fast.py

## jserv_hf_fast.py

# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
# Uses GDOWN to get the image
# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)

# Near Simplest Language model API, with room to expand!
# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
# change "seq" (which is the context size) to adjust footprint
#
# JAX-based
# seq   vram usage
# 512   14.7G
# 900   15.3G
#
# HF-based
# seq   vram usage
# 512   15.6 G
# 900   --.- G
#

# uses FastAPI, so install that
# https://fastapi.tiangolo.com/tutorial/
#   pip install fastapi
#   pip install uvicorn[standard]
#   pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
#   pip install termcolor
#   #`pip install flask-ngrok
#   #`pip install flask_cloudflared
#   pip install pyngrok
#   pip install nest-asyncio

#   pip install gdown
#   gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar
#   (results: 12.6GB [18:19, 11.4MB/s])
#
# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231
# https://pytorch.org/get-started/previous-versions/
# for cuda 10.1
# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# for cuda 11.2
# pip install torch==1.8.1+cu112 torchvision==0.9.1+cu112 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

# conda install python-multipart

#--------------------------------------
# check pyngrok — https://github.com/alexdlaird/pyngrok
#install
#   pip install pyngrok
#
#    Set up your ngrok Authtoken
# ngrok authtoken xxxxxxxxxxxxx

# GO: local execution
# XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py

# When done try
# http://localhost:9995/docs#/default/read_completions_engines_completions_post   (the port is SERVER_PORT)

# now you are in FastAPI + EleutherAI land
# note: needs async on the read_completions otherwise jax gets upset
# REMEMBER: adjust the location of the checkpoint image TAR_PATH

#

# Using plain HF instead of Jax so can comment out JAX related for this install
# -----------------------------------------
# # uses https://github.com/kingoflolz/mesh-transformer-jax

# # so install jax on your system so recommend you get it working with your GPU first
# # !apt install zstd
#
# #
# # the "slim" version contain only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory
# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd

# # tar -I zstd -xf step_383500_slim.tar.zstd

# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git
# # pip install -r mesh-transformer-jax/requirements.txt

# # jax 0.2.12 is required due to a regression with xmap in 0.2.13
# # pip install mesh-transformer-jax/ jax==0.2.12
# # I have cuda 10.1 and python 3.9 so had to update
# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl"

# -----------------------------------------
#
# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT
#


from termcolor import colored

#from flask import Flask, redirect, url_for, request
import json
import torch
import requests
import subprocess
import tarfile
import os
import re
import time
from threading import Timer


from typing import Optional
from typing import Dict
from fastapi import FastAPI,Request,Body
import uvicorn
import nest_asyncio
from pyngrok import ngrok


import threading
import numpy as np
import transformers

from transformers import GPTNeoForCausalLM, AutoConfig,AutoTokenizer,GPT2Tokenizer


print(colored("Server Initialization ...", "magenta"))
connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"]

#if connect_method == "Cloudflare":
#   from flask_cloudflared import run_with_cloudflared
#elif connect_method == "Ngrok":
#   from flask_ngrok import run_with_ngrok

# Placeholders; both are assigned further down once the checkpoint and the
# tokenizer have been loaded.
model         = None
tokenizer     = None


#------------------------------------------
# REMEMBER: Change these settings to local values.
# The machine-specific values and the ngrok secret can also be supplied
# through the environment variables named below, so they do not have to be
# edited into (and committed with) this file. When a variable is unset the
# original hard-coded default is used.

active_model=''
runtime_gpu = os.environ.get("JSERV_RUNTIME_GPU", "cuda:0")
training_gpu="cuda:0"

TAR_PATH = os.environ.get("JSERV_TAR_PATH", "../")
check_point_dir = os.environ.get("JSERV_CHECKPOINT_DIR", "../j6b_ckpt")
SERVER_PORT = int(os.environ.get("JSERV_PORT", "9995"))
# The "xxxxxxxxx" placeholder is recognised at startup and means "no token".
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "xxxxxxxxx")

#-----------------------------------------
# Startup report of the CUDA state; printed in green when a GPU is usable
# and in red otherwise.
#https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu
cuda_ok = torch.cuda.is_available()
report_color = "green" if cuda_ok else "red"

print(colored("   torch.cuda.is_available() = "+str(cuda_ok), report_color))
if cuda_ok:
    print(colored("   torch.cuda.current_device() = "+str(torch.cuda.current_device()), report_color))
    print(colored("   torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color))
    print(colored("   torch.cuda.get_device_name(0) = "+str(torch.cuda.get_device_name()), report_color))
    print(colored("   Mem Allocated:{}GB".format(round(torch.cuda.memory_allocated(0)/1024**3,1)), report_color))
    print(colored("   Mem Cached: {}GB".format(round(torch.cuda.memory_reserved(0)/1024**3,1)), report_color))
else:
    # current_device() and get_device_name() raise when CUDA cannot be
    # initialised, which would abort the script before the red report is
    # shown; only the queries that are safe without a GPU are made here.
    print(colored("   torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color))
    print(colored("   CUDA is not available: skipping per-device details", report_color))

# Set path to tar file and unpack it
# os.path.join gives the same result as the old string concatenation for
# TAR_PATH="../" and also works when TAR_PATH has no trailing slash.
model_on_drive = os.path.join(TAR_PATH, "j6b_ckpt.tar")
print(colored("Checking j6b_ckpt ...", "magenta"))
print(colored("   TAR_PATH ={}".format(TAR_PATH),"green"))
print(colored("   check_point_dir ={}".format(check_point_dir),"green"))
print(colored("   model_on_drive ={}".format(model_on_drive),"green"))

if not os.path.isdir(check_point_dir):
    print(colored("Unpacking tar file, please wait...", "magenta"))
    # Unpack into the parent of check_point_dir so the expanded folder lands
    # where the check above (and the model loader) look for it. The old code
    # unpacked into the current working directory, which does not match the
    # default "../j6b_ckpt".
    # NOTE(review): assumes the archive's top-level folder is named like the
    # last component of check_point_dir - confirm against the tar.
    extract_root = os.path.dirname(os.path.normpath(check_point_dir)) or "."
    # NOTE: extractall() trusts member paths; only use archives you trust.
    with tarfile.open(model_on_drive, "r") as tar:
        tar.extractall(path=extract_root)
    if not os.path.isdir(check_point_dir):
        raise FileNotFoundError(
            "{} was unpacked into {} but {} still does not exist; "
            "check TAR_PATH / check_point_dir".format(model_on_drive, extract_root, check_point_dir)
        )
else:
    print( colored("Expanded Checkpoint directory found", "green") )

# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))
config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")

# The GPT-Neo 2.7B config is only the starting point: the fields below are
# overwritten one after another, in this order.
_layer_count = 28
_head_count = 16
for _field, _setting in (
    ("attention_layers", ["global"] * _layer_count),
    ("attention_types", [["global"], _layer_count]),
    ("num_layers", _layer_count),
    ("num_heads", _head_count),
    ("hidden_size", 256 * _head_count),
    ("vocab_size", 50400),
    ("rotary", True),
    ("rotary_dim", 64),
    ("jax", True),
):
    setattr(config, _field, _setting)

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping
from pathlib import Path

class Checkpoint(MutableMapping):
    """Read-only, lazily loaded view of a split checkpoint directory.

    ``m.pt`` inside ``chkpt_dir`` is loaded once and kept as an index whose
    values are paths; only the file name of each path is used, resolved
    relative to ``chkpt_dir``. The file behind a key is loaded each time the
    key is read, onto ``device``.

    Args:
        chkpt_dir: directory holding ``m.pt`` and the files it points to.
        device: ``map_location`` used when loading an individual entry.
    """

    def __init__(self, chkpt_dir, device="cpu"):
        self.device = device
        self.chkpt_dir = Path(chkpt_dir)
        self.checkpoint = torch.load(str(self.chkpt_dir / "m.pt"))

    def __len__(self):
        return len(self.checkpoint)

    def __getitem__(self, key):
        # Only the file name from the index is used, so the checkpoint folder
        # can be moved without rewriting m.pt.
        path = self.chkpt_dir / Path(self.checkpoint[key]).name
        return torch.load(str(path), map_location=self.device)

    def __setitem__(self, key, value):
        # Writes are deliberately ignored: the mapping is a read-only view.
        return

    def __delitem__(self, key):
        # Deletions are ignored as well. The previous signature took an extra
        # ``value`` argument, so ``del checkpoint[key]`` raised TypeError.
        return

    def keys(self):
        return self.checkpoint.keys()

    def __iter__(self):
        # NOTE(review): yields (key, tensor) pairs rather than keys, which is
        # not the usual Mapping protocol. Left unchanged because the loader
        # that consumes this object may rely on it - confirm before changing.
        for key in self.checkpoint:
            yield (key, self.__getitem__(key))

    def __copy__(self):
        return Checkpoint(self.chkpt_dir, device=self.device)

    def copy(self):
        return Checkpoint(self.chkpt_dir, device=self.device)

def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512,repetition_penalty=1):
    """Sample one completion of ``context`` with the global model.

    Args:
        context: prompt text.
        top_k: top-k sampling cut-off handed to ``generate``.
        top_p: nucleus sampling threshold.
        temp: sampling temperature.
        gen_len: passed as both ``min_length`` and ``max_length``; in
            Hugging Face ``generate`` these bound the whole sequence, prompt
            tokens included.
        repetition_penalty: repetition penalty handed to ``generate``.

    Returns:
        A list with one decoded string per generated sequence.
    """
    # Put the prompt on the device the model lives on, not on whatever
    # ``.cuda()`` picks as the current device.
    ids = tokenizer(context, return_tensors="pt").input_ids.to(model.device)

    start = time.time()
    #output = network.generate(batched_tokens, length, gen_len, {"top_p": np.ones(total_batch) * top_p, "temp": np.ones(total_batch) * temp})
    output = model.generate(
        ids,
        do_sample=True,
        min_length=gen_len,
        max_length=gen_len,
        temperature=temp,
        use_cache=True,
        # top_k and repetition_penalty used to be accepted and then ignored
        # (the penalty was hard-coded to 1.5); the caller's values now apply.
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=float(repetition_penalty),
        no_repeat_ngram_size=6,
        max_time=60,
    )

    samples = [tokenizer.decode(out_seq, skip_special_tokens=True) for out_seq in output]

    #for o in decoded_tokens[:, :, 0]:
    #    samples.append(tokenizer.decode(o))

    print(colored(f"completion done in {time.time() - start:06}s","green"))
    return samples

def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0, gen_len=256, depth=0, max_depth=5,recursive_refresh=0,repetition_penalty=1):
    """Chain several ``infer`` calls, feeding each continuation into the next.

    The first level prompts with ``initial_context``. Every later level
    prompts with the previous continuation, prefixed by ``initial_context``
    and a ``...`` separator when ``recursive_refresh`` is 1.

    Args:
        initial_context: the original prompt.
        current_context: continuation produced by the previous level, if any.
        top_k, top_p, temp, gen_len, repetition_penalty: forwarded to ``infer``.
        depth: current recursion level.
        max_depth: level at which the recursion stops.
        recursive_refresh: 1 to re-prepend ``initial_context`` at every level.

    Returns:
        The continuations of all levels below ``max_depth`` joined with a
        single space (the result therefore ends with a space), or ``''`` once
        ``depth`` has reached ``max_depth``.
    """
    lcc = len(current_context) if current_context else 0
    cc = current_context if current_context else ''
    print (colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
    print (colored("    in_cc:{}".format(cc),"cyan"))

    # Stop before generating: the old code ran a full generation at the last
    # level and then threw it away by returning ''. The returned value is
    # unchanged; only the wasted model call is gone.
    if depth >= max_depth:
        return ''

    if not current_context:
        c = initial_context
    elif recursive_refresh == 1:
        c = initial_context + "\r\n ... \r\n" + current_context
    else:
        c = current_context

    print (colored("loc_c:{}".format(c),"yellow"))
    i = infer( c, top_k, top_p, temp, gen_len,repetition_penalty)[0]
    # The prompt is cut off by character count.
    # NOTE(review): this assumes the decoded text starts with ``c`` verbatim.
    loc_ans = i[len(c):]
    print (colored("    loc_i:{}".format(i),"white"))
    print (colored("    loc_ans:{}".format(loc_ans),"white"))

    #yield from recursive_infer(initial_context, i[len(c):],top_k, top_p, temp, gen_len, depth+1, max_depth,recursive_refresh,repetition_penalty)
    recursive_ans = recursive_infer(initial_context, str(loc_ans),top_k, top_p, temp, gen_len, depth+1, max_depth,recursive_refresh,repetition_penalty)
    returned_ans =  str(loc_ans +' '+ recursive_ans)
    print (colored("    returned_ans:{}".format(returned_ans),"cyan"))
    print (colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
    return returned_ans

#model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint())
print(colored("loading GPTNeoForCausalLM.from_pretrained","magenta"))
print(colored("   loading from {}".format(check_point_dir),"green"))
# No model name is given: the weights come from the lazily-read Checkpoint
# mapping and the architecture from the patched config.
model = GPTNeoForCausalLM.from_pretrained(
    pretrained_model_name_or_path=None,
    config=config,
    state_dict=Checkpoint(check_point_dir),
)
print(colored("loading GPT2Tokenizer.from_pretrained","magenta"))
#tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")


# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
tokenizer     = AutoTokenizer.from_pretrained("gpt2")
vocab         = tokenizer.get_vocab()
vocab_keys    = vocab.keys()


def find_keys(char):
    """Return every vocabulary entry that contains the substring ``char``."""
    return [key for key in vocab_keys if char in key]


# Every vocabulary entry containing one of these substrings is banned; each
# banned entry becomes its own single-id list.
bad_words     = [
    key
    for needle in ("[", " [", "<|endoftext|>")
    for key in find_keys(needle)
]
bad_words_ids = [[vocab[key]] for key in bad_words]

print(colored("    move to GPU","magenta"))
model.to(runtime_gpu)

print(colored(" >>>> DONE! <<<<", "green"))

print(colored("PRETEST: warming up processing pipeline","magenta"))

#warms up the processing on startup
pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
print (colored("PROMPT:"+pre_prompt,"yellow"))
warmup_text = infer(pre_prompt)[0]
print(colored(warmup_text,"cyan"))

# app = Flask(__name__)
app = FastAPI()

#if connect_method == "Cloudflare":
#   run_with_cloudflared(app)
#elif connect_method == "Ngrok":
#   run_with_ngrok(app)

# ``app.route`` is the low-level Starlette decorator: it calls the endpoint
# with the request and expects a Response back, so the old zero-argument
# function returning a str failed on every hit. ``app.get`` registers a
# normal FastAPI path operation instead.
@app.get("/")
def home():
    """Landing page confirming that the service is up."""
    # Local import: the module header only imports FastAPI, Request and Body.
    from fastapi.responses import HTMLResponse
    return HTMLResponse("<h1>EleutherAI J6B Service Running!</h1>")


@app.post('/request')
async def koboldrequest(request: Request=None):
    """Generate text for a KoboldAI-style client.

    Reads a JSON body with the keys ``text``, ``min``, ``max``, ``rep_pen``,
    ``temperature`` and ``top_p``, plus the optional ``numseqs`` (default 1)
    and ``retfultxt`` (default True).

    Returns:
        ``{"data": {"text": ...}}`` (first sequence, prompt included) when
        ``retfultxt`` is true, ``{"data": {"seqs": [...]}}`` (prompt tokens
        stripped) otherwise, or a 400 response carrying an ``error`` object
        when generation fails or produces no text.

    This handler was written against Flask. Under FastAPI ``request.json``
    must be awaited, ``app.response_class`` does not exist, and the error
    responses were built but never returned; all three are fixed here.
    """
    # Local import: the module header only imports FastAPI, Request and Body.
    from fastapi.responses import JSONResponse

    def _error(message):
        # Same error envelope the original handler built for failures.
        return JSONResponse(
            status_code=400,
            content={"error": {"extensions": {"code": message}}},
        )

    try:
        js      = await request.json()
        txt     = js["text"]
        # Renamed locals so the builtins min()/max() are not shadowed.
        min_len = js["min"]
        max_len = js["max"]
        rep_pen = js["rep_pen"]
        temp    = js["temperature"]
        top_p   = js["top_p"]

        # Compatability with un-updated clients
        numseqs   = js.get("numseqs", 1)
        retfultxt = js.get("retfultxt", True)

        print(colored("Received Data: {0}".format(txt), "yellow"))

        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))

        tokens = tokenizer(txt, return_tensors="pt").input_ids
        prompt_len = tokens.shape[1]

        # NOTE: generate() blocks the event loop while it runs, exactly like
        # the completions endpoint does.
        gen_tokens = model.generate(
            tokens.long().to(model.device),
            do_sample=True,
            min_length=min_len,
            max_length=max_len,
            temperature=temp,
            top_p=top_p,
            repetition_penalty=rep_pen,
            use_cache=True,
            bad_words_ids=bad_words_ids,
            num_return_sequences=numseqs,
        )

        genout = []
        for tkns in gen_tokens.tolist():
            if not retfultxt:
                # Strip context tokens out of returned sequences. Slicing from
                # the prompt length also covers "nothing generated"; the old
                # negative index turned into [-0:] and returned the prompt.
                tkns = tkns[prompt_len:]
            # 50256 is presumably the <|endoftext|> id of the gpt2 tokenizer.
            tkns = [t for t in tkns if t != 50256]
            genout.append(tokenizer.decode(tkns))
        torch.cuda.empty_cache()

        if len(genout) > 0 and genout[0] != "":
            if retfultxt:
                # Outdated client, send old JSON format
                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
                return JSONResponse(status_code=200, content={"data": {"text": genout[0]}})

            # New client format with numseq support
            for i, seq in enumerate(genout):
                print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
            return JSONResponse(status_code=200, content={"data": {"seqs": genout}})

        print(colored("[ERROR] Something went wrong during generation!", "red"))
        return _error("Something went wrong during generation!")

    except Exception as e:
        # Endpoint boundary: report any failure to the client as a 400.
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        return _error("Something went wrong during generation! {0}".format(e))

@app.post("/engines/completions")
async def read_completions(
    #engine_id:str,
    prompt: Optional[str] = None,
    max_tokens: Optional[int] = 16,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: Optional[int] = 40,
    n: Optional[int] = 1,
    stream: Optional[bool] = False,
    logprobs: Optional[int] = None,
    echo: Optional[bool] = False,
    stop: Optional[list] = None,
    presence_penalty: Optional[float] = 0.0001,
    repetition_penalty: Optional[float] = 1.0000,
    best_of: Optional[int] = 1,
    recursive_depth: Optional[int] = 0,
    recursive_refresh: Optional[int] = 0,
    logit_bias: Optional[Dict[str, float]] = None,
    request: Request = None,
):
    """Completion endpoint returning an OpenAI-style ``text_completion`` object.

    ``prompt`` has every ``|`` replaced by ``\\r\\n`` before tokenisation. With
    ``recursive_depth == 0`` a single ``generate`` call produces ``n``
    sequences of ``max_tokens`` tokens beyond the prompt. Otherwise
    ``recursive_infer`` is run ``n`` times, one after another.

    ``stream``, ``logprobs``, ``echo``, ``stop``, ``presence_penalty``,
    ``best_of`` and ``logit_bias`` are accepted but not used by this
    implementation.

    Returns:
        A dict with ``params`` (the raw query parameters), ``id``, ``object``,
        ``created``, ``model`` and ``choices``. Each choice holds ``prompt``,
        ``text``, ``index``, ``logprobs`` and ``finish_reason``.
    """
    response = {'params': dict(request.query_params)}
    print(response)

    # NOTE(review): a missing prompt is stringified to the literal "None" and
    # completed as such; kept as-is - confirm whether a 4xx would be preferable.
    text = str(prompt).replace("|", "\r\n")
    prompt_len = len(text)
    ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu)
    # generate() lengths count the prompt tokens too.
    max_length = max_tokens + ids.shape[1]
    start = time.time()

    if recursive_depth == 0:
        gen_tokens = model.generate(
            ids,
            do_sample=True,
            min_length=max_length,
            max_length=max_length,
            temperature=temperature,
            use_cache=True,
            num_beams=n,
            num_return_sequences=n,
            #        num_beam_groups=num_beam_groups,
            #        early_stopping=True,
            # top_k is part of the endpoint's signature but was never
            # forwarded on this path; it now applies here as it does on the
            # recursive path.
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=6,
            max_time=60,
        )
        texts = [tokenizer.decode(out_seq, skip_special_tokens=True) for out_seq in gen_tokens]
        # The whole output sequence is decoded, so each text is expected to
        # start with the prompt; count only what comes after it.
        generated_chars = sum(len(t) - prompt_len for t in texts)
    else:
        # do it serial until we figure out parallel for recursive
        texts = []
        for _ in range(n):
            texts.append(recursive_infer(
                initial_context=text,
                current_context=None,
                top_p=top_p, top_k=top_k, temp=temperature,
                gen_len=max_length,
                depth=0,
                max_depth=recursive_depth,
                recursive_refresh=recursive_refresh,
                repetition_penalty=repetition_penalty,
            ))
        # recursive_infer already strips the prompt from what it returns.
        generated_chars = sum(len(t) for t in texts)

    choices = []
    for i, choice_text in enumerate(texts):
        choices.append({
            'prompt': text,
            'text': choice_text,
            'index': i,
            'logprobs': None,
            'finish_reason': 'length',
        })
        print("GenText[{}]:{}".format(i, choice_text))

    #gen_text = tokenizer.batch_decode(gen_tokens)[0]
    elapsed = time.time() - start
    # The old figure subtracted the prompt length once from the concatenation
    # of all choices, which was wrong for n > 1 and could go negative on the
    # recursive path.
    cps = generated_chars / elapsed if elapsed > 0 else 0.0
    total_len = sum(len(t) for t in texts)

    print("elapsed:{} len:{} cps:{}".format(elapsed, total_len, cps))

    response['id'] = ''
    response['object'] = 'text_completion'
    response['created'] = ''
    response['model'] = 'GPT-J-6B_HF' #args.model
    response['choices'] = choices

    return response

print(colored("Model startup complete! Starting web service....", "green"))
# An auth token lets ngrok keep several tunnels open at once. A value that
# is None or still contains the "xxxxxx" placeholder counts as "not set".
_token_is_set = NGROK_AUTH_TOKEN is not None and "xxxxxx" not in NGROK_AUTH_TOKEN
if _token_is_set:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open the public tunnel to the local port before the server starts.
public_url = ngrok.connect(SERVER_PORT)
print(colored(f"Public_URL = {public_url!s}", "cyan"))
nest_asyncio.apply()
#app.run()
#if __name__ == "__main__":
print(colored("Ready to Serve!", "green"))

# uvicorn.run() blocks; the farewell line is printed once it returns.
uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
print (colored("Happy Service!", "green"))

# http://localhost:9995/docs#/default/read_completions_engines_completions_post
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post

	# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
	# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
	# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
	# Uses GDOWN to get the image
	# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)

	# Near Simplest Language model API, with room to expand!
	# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
	# change "seq" (which is the context size) to adjust footprint
	#
	# JAX-based
	# seq vram usage
	# 512 14.7G
	# 900 15.3G
	#
	# HF-based
	# seq vram usage
	# 512 15.6 G
	# 900 --.- G
	#

	# uses FastAPI, so install that
	# https://fastapi.tiangolo.com/tutorial/
	# pip install fastapi
	# pip install uvicorn[standard]
	# pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
	# pip install termcolor
	# #`pip install flask-ngrok
	# #`pip install flask_cloudflared
	# pip install pyngrok
	# pip install nest-asyncio

	# pip install gdown
	# gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar
	# (results: 12.6GB [18:19, 11.4MB/s])
	#
	# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231
	# https://pytorch.org/get-started/previous-versions/
	# for cuda 10.1
	# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
	# for cuda 11.2
	# pip install torch==1.8.1+cu112 torchvision==0.9.1+cu112 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

	# conda install python-multipart

	#--------------------------------------
	# check pyngrok — https://github.com/alexdlaird/pyngrok
	#install
	# pip install pyngrok
	#
	# Set up your ngrok Authtoken
	# ngrok authtoken xxxxxxxxxxxxx

	# GO: local execution
	# XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py

	# When done try
	# http://localhost:9995/docs#/default/read_completions_engines_completions_post   (the port is SERVER_PORT)

	# now you are in FastAPI + EleutherAI land
	# note: needs async on the read_completions otherwise jax gets upset
	# REMEMBER: adjust the location of the checkpoint image TAR_PATH

	#

	# Using plain HF instead of Jax so can comment out JAX related for this install
	# -----------------------------------------
	# # uses https://github.com/kingoflolz/mesh-transformer-jax

	# # so install jax on your system so recommend you get it working with your GPU first
	# # !apt install zstd
	#
	# #
	# # the "slim" version contain only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory
	# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd

	# # tar -I zstd -xf step_383500_slim.tar.zstd

	# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git
	# # pip install -r mesh-transformer-jax/requirements.txt

	# # jax 0.2.12 is required due to a regression with xmap in 0.2.13
	# # pip install mesh-transformer-jax/ jax==0.2.12
	# # I have cuda 10.1 and python 3.9 so had to update
	# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl"

	# -----------------------------------------
	#
	# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT
	#


	from termcolor import colored

	#from flask import Flask, redirect, url_for, request
	import json
	import torch
	import requests
	import subprocess
	import tarfile
	import os
	import re
	import time
	from threading import Timer



	from typing import Optional
	from typing import Dict
	from fastapi import FastAPI,Request,Body
	import uvicorn
	import nest_asyncio
	from pyngrok import ngrok



	import threading
	import numpy as np
	import transformers

	from transformers import GPTNeoForCausalLM, AutoConfig,AutoTokenizer,GPT2Tokenizer



	print(colored("Server Initialization ...", "magenta"))
	connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"]

	#if connect_method == "Cloudflare":
	# from flask_cloudflared import run_with_cloudflared
	#elif connect_method == "Ngrok":
	# from flask_ngrok import run_with_ngrok

	# Placeholders; both are assigned further down once the checkpoint and
	# the tokenizer have been loaded.
	model = None
	tokenizer = None


	#------------------------------------------
	# REMEMBER: Change these settings to local values.
	# The machine-specific values and the ngrok secret can also be supplied
	# through the environment variables named below; when a variable is unset
	# the original hard-coded default is used.

	active_model=''
	runtime_gpu = os.environ.get("JSERV_RUNTIME_GPU", "cuda:0")
	training_gpu="cuda:0"

	TAR_PATH = os.environ.get("JSERV_TAR_PATH", "../")
	check_point_dir = os.environ.get("JSERV_CHECKPOINT_DIR", "../j6b_ckpt")
	SERVER_PORT = int(os.environ.get("JSERV_PORT", "9995"))
	# The "xxxxxxxxx" placeholder is recognised at startup and means "no token".
	NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "xxxxxxxxx")

	#-----------------------------------------
	# Startup report of the CUDA state; printed in green when a GPU is usable
	# and in red otherwise.
	#https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu
	cuda_ok = torch.cuda.is_available()
	report_color = "green" if cuda_ok else "red"

	print(colored(" torch.cuda.is_available() = "+str(cuda_ok), report_color))
	if cuda_ok:
	    print(colored(" torch.cuda.current_device() = "+str(torch.cuda.current_device()), report_color))
	    print(colored(" torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color))
	    print(colored(" torch.cuda.get_device_name(0) = "+str(torch.cuda.get_device_name()), report_color))
	    print(colored(" Mem Allocated:{}GB".format(round(torch.cuda.memory_allocated(0)/1024**3,1)), report_color))
	    print(colored(" Mem Cached: {}GB".format(round(torch.cuda.memory_reserved(0)/1024**3,1)), report_color))
	else:
	    # current_device() and get_device_name() raise when CUDA cannot be
	    # initialised; only the queries that are safe without a GPU run here.
	    print(colored(" torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color))
	    print(colored(" CUDA is not available: skipping per-device details", report_color))

	# Set path to tar file and unpack it
	# os.path.join gives the same result as the old string concatenation for
	# TAR_PATH="../" and also works when TAR_PATH has no trailing slash.
	model_on_drive = os.path.join(TAR_PATH, "j6b_ckpt.tar")
	print(colored("Checking j6b_ckpt ...", "magenta"))
	print(colored(" TAR_PATH ={}".format(TAR_PATH),"green"))
	print(colored(" check_point_dir ={}".format(check_point_dir),"green"))
	print(colored(" model_on_drive ={}".format(model_on_drive),"green"))

	if not os.path.isdir(check_point_dir):
	    print(colored("Unpacking tar file, please wait...", "magenta"))
	    # Unpack into the parent of check_point_dir so the expanded folder
	    # lands where the check above (and the model loader) look for it; the
	    # old code unpacked into the current working directory.
	    # NOTE(review): assumes the archive's top-level folder is named like
	    # the last component of check_point_dir - confirm against the tar.
	    extract_root = os.path.dirname(os.path.normpath(check_point_dir)) or "."
	    # NOTE: extractall() trusts member paths; only use archives you trust.
	    with tarfile.open(model_on_drive, "r") as tar:
	        tar.extractall(path=extract_root)
	    if not os.path.isdir(check_point_dir):
	        raise FileNotFoundError(
	            "{} was unpacked into {} but {} still does not exist; "
	            "check TAR_PATH / check_point_dir".format(model_on_drive, extract_root, check_point_dir)
	        )
	else:
	    print( colored("Expanded Checkpoint directory found", "green") )

	# Initialize the model
	print(colored("Initializing model, please wait...", "magenta"))
	config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")

	# The GPT-Neo 2.7B config is only the starting point: the fields below
	# are overwritten one after another, in this order.
	_layer_count = 28
	_head_count = 16
	for _field, _setting in (
	    ("attention_layers", ["global"] * _layer_count),
	    ("attention_types", [["global"], _layer_count]),
	    ("num_layers", _layer_count),
	    ("num_heads", _head_count),
	    ("hidden_size", 256 * _head_count),
	    ("vocab_size", 50400),
	    ("rotary", True),
	    ("rotary_dim", 64),
	    ("jax", True),
	):
	    setattr(config, _field, _setting)

	try:
	from collections.abc import MutableMapping
	except ImportError:
	from collections import MutableMapping
	from pathlib import Path

	class Checkpoint(MutableMapping):
	    """Read-only, lazily loaded view of a split checkpoint directory.

	    ``m.pt`` inside ``chkpt_dir`` is loaded once and kept as an index whose
	    values are paths; only the file name of each path is used, resolved
	    relative to ``chkpt_dir``. The file behind a key is loaded each time
	    the key is read, onto ``device``.
	    """

	    def __init__(self, chkpt_dir, device="cpu"):
	        self.device = device
	        self.chkpt_dir = Path(chkpt_dir)
	        self.checkpoint = torch.load(str(self.chkpt_dir / "m.pt"))

	    def __len__(self):
	        return len(self.checkpoint)

	    def __getitem__(self, key):
	        # Only the file name from the index is used.
	        path = self.chkpt_dir / Path(self.checkpoint[key]).name
	        return torch.load(str(path), map_location=self.device)

	    def __setitem__(self, key, value):
	        # Writes are deliberately ignored: the mapping is a read-only view.
	        return

	    def __delitem__(self, key):
	        # Deletions are ignored as well. The previous signature took an
	        # extra ``value`` argument, so ``del checkpoint[key]`` raised
	        # TypeError.
	        return

	    def keys(self):
	        return self.checkpoint.keys()

	    def __iter__(self):
	        # NOTE(review): yields (key, tensor) pairs rather than keys; left
	        # unchanged because the loader consuming this object may rely on it.
	        for key in self.checkpoint:
	            yield (key, self.__getitem__(key))

	    def __copy__(self):
	        return Checkpoint(self.chkpt_dir, device=self.device)

	    def copy(self):
	        return Checkpoint(self.chkpt_dir, device=self.device)

	def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512,repetition_penalty=1):
	    """Sample one completion of ``context`` with the global model.

	    ``gen_len`` is passed as both ``min_length`` and ``max_length``; in
	    Hugging Face ``generate`` these bound the whole sequence, prompt
	    tokens included.

	    Returns:
	        A list with one decoded string per generated sequence.
	    """
	    # Put the prompt on the device the model lives on, not on whatever
	    # ``.cuda()`` picks as the current device.
	    ids = tokenizer(context, return_tensors="pt").input_ids.to(model.device)

	    start = time.time()
	    #output = network.generate(batched_tokens, length, gen_len, {"top_p": np.ones(total_batch) * top_p, "temp": np.ones(total_batch) * temp})
	    output = model.generate(
	        ids,
	        do_sample=True,
	        min_length=gen_len,
	        max_length=gen_len,
	        temperature=temp,
	        use_cache=True,
	        # top_k and repetition_penalty used to be accepted and then
	        # ignored (the penalty was hard-coded to 1.5).
	        top_k=top_k,
	        top_p=top_p,
	        repetition_penalty=float(repetition_penalty),
	        no_repeat_ngram_size=6,
	        max_time=60,
	    )

	    samples = [tokenizer.decode(out_seq, skip_special_tokens=True) for out_seq in output]

	    #for o in decoded_tokens[:, :, 0]:
	    # samples.append(tokenizer.decode(o))

	    print(colored(f"completion done in {time.time() - start:06}s","green"))
	    return samples

	def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0, gen_len=256, depth=0, max_depth=5,recursive_refresh=0,repetition_penalty=1):
	    """Chain several ``infer`` calls, feeding each continuation into the next.

	    The first level prompts with ``initial_context``. Every later level
	    prompts with the previous continuation, prefixed by
	    ``initial_context`` and a ``...`` separator when ``recursive_refresh``
	    is 1.

	    Returns:
	        The continuations of all levels below ``max_depth`` joined with a
	        single space, or ``''`` once ``depth`` has reached ``max_depth``.
	    """
	    lcc = len(current_context) if current_context else 0
	    cc = current_context if current_context else ''
	    print (colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
	    print (colored(" in_cc:{}".format(cc),"cyan"))

	    # Stop before generating: the old code ran a full generation at the
	    # last level and then threw it away by returning ''.
	    if depth >= max_depth:
	        return ''

	    if not current_context:
	        c = initial_context
	    elif recursive_refresh == 1:
	        c = initial_context + "\r\n ... \r\n" + current_context
	    else:
	        c = current_context

	    print (colored("loc_c:{}".format(c),"yellow"))
	    i = infer( c, top_k, top_p, temp, gen_len,repetition_penalty)[0]
	    # The prompt is cut off by character count.
	    # NOTE(review): assumes the decoded text starts with ``c`` verbatim.
	    loc_ans = i[len(c):]
	    print (colored(" loc_i:{}".format(i),"white"))
	    print (colored(" loc_ans:{}".format(loc_ans),"white"))

	    #yield from recursive_infer(initial_context, i[len(c):],top_k, top_p, temp, gen_len, depth+1, max_depth,recursive_refresh,repetition_penalty)
	    recursive_ans = recursive_infer(initial_context, str(loc_ans),top_k, top_p, temp, gen_len, depth+1, max_depth,recursive_refresh,repetition_penalty)
	    returned_ans = str(loc_ans +' '+ recursive_ans)
	    print (colored(" returned_ans:{}".format(returned_ans),"cyan"))
	    print (colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
	    return returned_ans

	#model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint())
	print(colored("loading GPTNeoForCausalLM.from_pretrained","magenta"))
	print(colored(" loading from {}".format(check_point_dir),"green"))
	# No model name is given: the weights come from the lazily-read Checkpoint
	# mapping and the architecture from the patched config.
	model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint(check_point_dir))
	print(colored("loading GPT2Tokenizer.from_pretrained","magenta"))
	#tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")


	# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
	tokenizer = AutoTokenizer.from_pretrained("gpt2")
	vocab = tokenizer.get_vocab()
	vocab_keys = vocab.keys()
	find_keys = lambda char : [key for key in vocab_keys if key.find(char) != -1]
	bad_words = []
	bad_words_ids = []

	bad_words.extend(find_keys("["))
	bad_words.extend(find_keys(" ["))
	# The literal used to be "<\|endoftext\|>": the stray backslashes stay in
	# the string, so it could never match a vocabulary entry.
	bad_words.extend(find_keys("<|endoftext|>"))
	# Each banned vocabulary entry becomes its own single-id list.
	for key in bad_words:
	    bad_id = vocab[key]
	    bad_words_ids.append([bad_id])

	print(colored(" move to GPU","magenta"))
	model.to(runtime_gpu)

	print(colored(" >>>> DONE! <<<<", "green"))

	print(colored("PRETEST: warming up processing pipeline","magenta"))

	#warms up the processing on startup
	pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
	print (colored("PROMPT:"+pre_prompt,"yellow"))
	print(colored(infer(pre_prompt)[0],"cyan"))

	# app = Flask(__name__)
	app = FastAPI()

	#if connect_method == "Cloudflare":
	# run_with_cloudflared(app)
	#elif connect_method == "Ngrok":
	# run_with_ngrok(app)

	# ``app.route`` is the low-level Starlette decorator: it calls the endpoint
	# with the request and expects a Response back, so the old zero-argument
	# function returning a str failed on every hit.
	@app.get("/")
	def home():
	    """Landing page confirming that the service is up."""
	    # Local import: the module header only imports FastAPI, Request, Body.
	    from fastapi.responses import HTMLResponse
	    return HTMLResponse("<h1>EleutherAI J6B Service Running!</h1>")


	@app.post('/request')
	async def koboldrequest(request: Request=None):
	    """Generate text for a KoboldAI-style client.

	    Reads a JSON body with the keys ``text``, ``min``, ``max``,
	    ``rep_pen``, ``temperature`` and ``top_p``, plus the optional
	    ``numseqs`` (default 1) and ``retfultxt`` (default True).

	    Returns:
	        ``{"data": {"text": ...}}`` when ``retfultxt`` is true,
	        ``{"data": {"seqs": [...]}}`` (prompt tokens stripped) otherwise,
	        or a 400 response carrying an ``error`` object on failure.

	    This handler was written against Flask. Under FastAPI
	    ``request.json`` must be awaited, ``app.response_class`` does not
	    exist, and the error responses were built but never returned.
	    """
	    # Local import: the module header only imports FastAPI, Request, Body.
	    from fastapi.responses import JSONResponse

	    def _error(message):
	        # Same error envelope the original handler built for failures.
	        return JSONResponse(
	            status_code=400,
	            content={"error": {"extensions": {"code": message}}},
	        )

	    try:
	        js = await request.json()
	        txt = js["text"]
	        # Renamed locals so the builtins min()/max() are not shadowed.
	        min_len = js["min"]
	        max_len = js["max"]
	        rep_pen = js["rep_pen"]
	        temp = js["temperature"]
	        top_p = js["top_p"]

	        # Compatability with un-updated clients
	        numseqs = js.get("numseqs", 1)
	        retfultxt = js.get("retfultxt", True)

	        print(colored("Received Data: {0}".format(txt), "yellow"))

	        torch.cuda.empty_cache()
	        print(colored("Generating text, please wait...", "green"))

	        tokens = tokenizer(txt, return_tensors="pt").input_ids
	        prompt_len = tokens.shape[1]

	        gen_tokens = model.generate(
	            tokens.long().to(model.device),
	            do_sample=True,
	            min_length=min_len,
	            max_length=max_len,
	            temperature=temp,
	            top_p=top_p,
	            repetition_penalty=rep_pen,
	            use_cache=True,
	            bad_words_ids=bad_words_ids,
	            num_return_sequences=numseqs,
	        )

	        genout = []
	        for tkns in gen_tokens.tolist():
	            if not retfultxt:
	                # Strip context tokens out of returned sequences. The old
	                # negative index became [-0:] when nothing was generated
	                # and returned the prompt.
	                tkns = tkns[prompt_len:]
	            # 50256 is presumably the <|endoftext|> id of the gpt2 tokenizer.
	            tkns = [t for t in tkns if t != 50256]
	            genout.append(tokenizer.decode(tkns))
	        torch.cuda.empty_cache()

	        if len(genout) > 0 and genout[0] != "":
	            if retfultxt:
	                # Outdated client, send old JSON format
	                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
	                return JSONResponse(status_code=200, content={"data": {"text": genout[0]}})

	            # New client format with numseq support
	            for i, seq in enumerate(genout):
	                print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
	            return JSONResponse(status_code=200, content={"data": {"seqs": genout}})

	        print(colored("[ERROR] Something went wrong during generation!", "red"))
	        return _error("Something went wrong during generation!")

	    except Exception as e:
	        # Endpoint boundary: report any failure to the client as a 400.
	        print(colored("[ERROR] Something went wrong during generation!", "red"))
	        print(colored("{0}".format(e), "red"))
	        return _error("Something went wrong during generation! {0}".format(e))

	@app.post("/engines/completions")
	async def read_completions(
	    #engine_id:str,
	    prompt:Optional[str] = None,
	    max_tokens: Optional[int]=16,
	    temperature: Optional[float]=1.0,
	    top_p:Optional[float]=1.0,
	    top_k:Optional[int]=40,
	    n:Optional[int]=1,
	    stream:Optional[bool]=False,
	    logprobs:Optional[int]=None,
	    echo:Optional[bool]=False,
	    stop:Optional[list]=None,
	    presence_penalty:Optional[float]=0.0001,
	    repetition_penalty:Optional[float]=1.0000,
	    best_of:Optional[int]=1,
	    recursive_depth:Optional[int]=0,
	    recursive_refresh:Optional[int]=0,
	    logit_bias:Optional[Dict[str,float]]=None,
	    request: Request=None
	    ):
	    """Completion endpoint returning an OpenAI-like ``text_completion`` dict.

	    Parameters read by this handler:
	      prompt: text to continue; the two-character sequence backslash + ``|``
	        is replaced by CRLF before tokenization.
	      max_tokens: number of tokens to generate on top of the prompt tokens.
	      temperature, top_p, repetition_penalty: forwarded to generation.
	      top_k: forwarded only on the recursive path.
	      n: number of returned sequences (also used as the beam count).
	      recursive_depth: 0 runs a single ``model.generate`` call; any other
	        value calls ``recursive_infer`` once per requested sequence.
	      recursive_refresh: forwarded to ``recursive_infer``.
	      request: incoming request; its query parameters are echoed back.
	    The remaining parameters (stream, logprobs, echo, stop,
	    presence_penalty, best_of, logit_bias) are accepted but not read here.

	    Returns:
	      dict with ``params``, ``id``, ``object``, ``created``, ``model`` and
	      ``choices``; each choice holds ``prompt``, ``text``, ``index``,
	      ``logprobs`` (always None) and ``finish_reason`` (always 'length').
	    """
	    global active_model,model,tokenizer
	    response = {'params': dict(request.query_params)}
	    print(response)

	    # "\\|" is backslash + pipe; written with an escaped backslash because
	    # "\|" is an invalid escape sequence that newer Pythons warn about.
	    text = str(prompt).replace("\\|", "\r\n")
	    prompt_len = len(text)
	    ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu)
	    # Total length budget: prompt tokens plus the requested new tokens.
	    max_length = max_tokens + ids.shape[1]
	    start = time.time()
	    recursive = recursive_depth != 0

	    if not recursive:
	        gen_tokens = model.generate(
	            ids,
	            do_sample=True,
	            # min == max, so every sequence runs to the full length budget.
	            min_length=max_length,
	            max_length=max_length,
	            temperature=temperature,
	            use_cache=True,
	            num_beams=n,
	            num_return_sequences=n,
	            top_p=top_p,
	            repetition_penalty=repetition_penalty,
	            no_repeat_ngram_size=6,
	            max_time=60
	        )
	    else:
	        # do it serial until we figure out parallel for recursive
	        gen_tokens = [
	            recursive_infer(
	                initial_context=text,
	                current_context=None,
	                top_p=top_p, top_k=top_k, temp=temperature,
	                gen_len=max_length,
	                depth=0,
	                max_depth=recursive_depth,
	                recursive_refresh=recursive_refresh,
	                repetition_penalty=repetition_penalty
	            )
	            for _ in range(n)
	        ]

	    choices = []
	    for i, out_seq in enumerate(gen_tokens):
	        # generate() yields token ids that need decoding; the recursive path
	        # is used as-is, i.e. recursive_infer is taken to return text.
	        if recursive:
	            choice_text = out_seq
	        else:
	            choice_text = tokenizer.decode(out_seq, skip_special_tokens=True)
	        choices.append({
	            'prompt': text,
	            'text': choice_text,
	            'index': i,
	            'logprobs': None,
	            'finish_reason': 'length',
	        })
	        print("GenText[{}]:{}".format(i, choice_text))

	    # Throughput report: characters across all choices, minus one prompt
	    # length, per second of wall-clock time.
	    gen_text = ''.join(choice['text'] for choice in choices)
	    elapsed = time.time() - start
	    cps = (len(gen_text) - prompt_len) / elapsed

	    print("elapsed:{} len:{} cps:{}".format(elapsed, len(gen_text), cps))

	    response['id'] = ''
	    response['object'] = 'text_completion'
	    response['created'] = ''
	    response['model'] = 'GPT-J-6B_HF' #args.model
	    response['choices'] = choices

	    return response

	# Routes are registered; open the ngrok tunnel and start the web server.
	print(colored("Model startup complete! Starting web service....", "green"))

	# Setting an auth token allows several tunnels to be open at the same
	# time. A token still containing "xxxxxx" is skipped (presumably the
	# unedited placeholder value -- confirm against the config section).
	ngrok_token_usable = NGROK_AUTH_TOKEN is not None and "xxxxxx" not in NGROK_AUTH_TOKEN
	if ngrok_token_usable:
	    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

	public_url = ngrok.connect(SERVER_PORT)
	print(colored(f"Public_URL = {public_url}", "cyan"))
	nest_asyncio.apply()
	print(colored("Ready to Serve!", "green"))

	# The closing message is printed once uvicorn.run() returns.
	uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
	print(colored("Happy Service!", "green"))

	# http://localhost:9995/docs#/default/read_completions_engines_completions_post
	# http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post
	# http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post