@GeorgeDittmar · Created March 2, 2024 22:53
local-llm docker compose
version: '3'

services:
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:latest
    container_name: tgi
    ports:
      - 8080:80
    volumes:
      # cache model weights on the host so restarts skip the download
      - ${LOCAL_MODEL_CACHE_DIR}:/model_cache
    environment:
      - HUGGING_FACE_HUB_TOKEN=${LLAMA_TOKEN}
    # needed so the container can access the GPU (requires the NVIDIA Container Toolkit)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command:
      - '--huggingface-hub-cache'
      - '/model_cache'
      - '--model-id'
      - '${MODEL_ID}'
      - '--max-batch-prefill-tokens'
      - '${MAX_PREFILL_TOKENS}'
      - '--quantize'
      - '${QUANT}'
      - '--max-total-tokens'
      - '${MAX_TOTAL_TOKENS}'
      - '--max-input-length'
      - '${MAX_INPUT_LENGTH}'
    shm_size: 1gb

  ui:
    image: localllm-ui:latest
    container_name: ui
    build:
      context: ./chat_ui/
    ports:
      - 7000:7000

  # index:
  #   image:
  #   ports:
  #     - 8088:80

  # api:
  #   image:
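
The compose file reads its settings from the environment, typically a .env file placed next to docker-compose.yml. A minimal sketch of such a file; the paths, token, model id, and token limits below are illustrative placeholders, not part of the gist:

# .env - illustrative values only; adjust for your model and hardware
LOCAL_MODEL_CACHE_DIR=/home/me/model_cache   # host directory mounted at /model_cache
LLAMA_TOKEN=hf_xxxxxxxxxxxxxxxx              # Hugging Face token, needed for gated models
MODEL_ID=meta-llama/Llama-2-7b-chat-hf       # any TGI-compatible model id
QUANT=bitsandbytes                           # passed to TGI's --quantize flag
MAX_INPUT_LENGTH=1024                        # must be smaller than MAX_TOTAL_TOKENS
MAX_TOTAL_TOKENS=2048
MAX_PREFILL_TOKENS=2048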
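With the variables set, the stack can be started and smoke-tested from the host. The prompt and parameters below are just an example; TGI serves on container port 80 (mapped to 8080 here) and exposes a /generate endpoint, documented at https://huggingface.co/docs/text-generation-inference:

# start both services (TGI downloads the model into the cache on first run)
docker compose up -d

# quick sanity check against TGI's generate endpoint
curl http://localhost:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Docker Compose?", "parameters": {"max_new_tokens": 64}}'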