Install Huggingface TGI v1.0 without Docker
# sent to me by tju01, thx
# install base tools
apt update
apt install protobuf-compiler libssl-dev gcc pkg-config g++ make
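# Note: depending on your base image, Python 3.10 and its venv module may not be
# present by default; on Ubuntu they can be added with (assumption about your distro):
#   apt install python3.10 python3.10-venv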
# install rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
source "$HOME/.cargo/env"
git clone --depth 1 https://github.com/huggingface/text-generation-inference.git
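# The clone above tracks the current main branch. To pin the v1.0 release this
# guide targets, clone an explicit tag instead (tag name assumed from the repo's
# release naming):
#   git clone --depth 1 --branch v1.0.0 https://github.com/huggingface/text-generation-inference.git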
cd text-generation-inference
# We use a separate virtual environment specifically for `text-generation-inference` to avoid package conflicts with other Python setups.
python3.10 -m venv .venv
. .venv/bin/activate
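# Optional check: the venv's interpreter should now shadow the system Python.
which python   # expected: .../text-generation-inference/.venv/bin/python
python --version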
# ninja enables parallel builds, which are significantly faster.
pip install ninja
# set the parallel compile job count (e.g. to the number of CPU cores)
export MAX_JOBS=128
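# If you don't know the core count up front, you can derive it instead:
#   export MAX_JOBS=$(nproc)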
BUILD_EXTENSIONS=True make install
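# Sanity check: the launcher binary should now be installed and on PATH.
text-generation-launcher --help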
cd server
make install-flash-attention
make install-flash-attention-v2
make install-vllm # Installs only the parts of vLLM that text-generation-inference uses, not the full standalone vLLM package.
# launch
text-generation-launcher --model-id OpenAssistant/falcon-40b-sft-mix-1226 -p 8080 --quantize bitsandbytes --max-input-length 1024 --max-total-tokens 2048 --max-batch-prefill-tokens 1024
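# Once the launcher reports the model is ready, the /info endpoint returns the
# loaded model and serving parameters (run this from a second shell):
curl 127.0.0.1:8080/info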
# test
curl 127.0.0.1:8080/generate_stream -X POST -d '{"inputs":"<|prompter|>What is the capital of France?</s><|assistant|>","parameters":{"max_new_tokens":100}}' -H 'Content-Type: application/json'
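# Non-streaming variant of the same request, using the /generate endpoint:
curl 127.0.0.1:8080/generate -X POST -d '{"inputs":"<|prompter|>What is the capital of France?</s><|assistant|>","parameters":{"max_new_tokens":100}}' -H 'Content-Type: application/json'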