# Compile llama-server from source code (instead of using binaries).
# Official NVIDIA image: CUDA 12.8 runtime + cuDNN on Ubuntu 24.04.
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04
# The base OS already has a user/group "ubuntu" occupying ID 1000 → move them so ID 1000 is free.
RUN groupmod -g 1001 ubuntu && usermod -u 1001 -g 1001 ubuntu
# Set up a new user named "user" with user ID 1000.
RUN useradd -m -u 1000 user
USER user
WORKDIR /LLM-App
### Switch back to root for the remaining installation steps:
USER root
# Refresh the package index, install wget, and clean up the apt lists.
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
# For llama-server.
RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1
# llama-server → build it from source code.
COPY --chown=user llama.cpp.tar.gz .
RUN tar -xvzf llama.cpp.tar.gz
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
RUN apt-get -y install apt-transport-https ca-certificates gnupg software-properties-common wget
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
# Use the "noble" suite to match the Ubuntu 24.04 base image ("bionic" targets 18.04).
RUN apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ noble main'
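# Note: apt-key is deprecated on recent Ubuntu releases. If it is unavailable, a commonly
# documented alternative (not used in the original build) is a dedicated keyring:
# RUN wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg
# RUN echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' > /etc/apt/sources.list.d/kitware.list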
RUN apt-get update
RUN apt-get -y install cmake
RUN cmake --version
RUN apt-get update && apt-get upgrade -y
RUN apt-get autoremove -y
RUN apt-get -y install gcc
RUN gcc --version
RUN apt-get update
RUN apt-get -y install g++
RUN g++ --version
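# Note: Ubuntu's nvidia-cuda-toolkit package ships its own CUDA version, which may lag
# behind the 12.8 runtime in the base image. An alternative (assumption, not used here)
# is to build from an nvidia/cuda:*-devel image, which already includes nvcc.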
RUN apt-get -y install nvidia-cuda-toolkit
RUN apt-get -y install git
RUN git --version
WORKDIR /LLM-App/llama.cpp
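# Build the CUDA-enabled binaries (GGML_CUDA=ON).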
RUN cmake -B build -DGGML_CUDA=ON
RUN cmake --build build --config Release
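# Optional tweak (assumption, not part of the original build): restricting the CUDA
# architectures to the target GPU and building in parallel can cut compile time, e.g.
# RUN cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86"
# RUN cmake --build build --config Release -j"$(nproc)"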
RUN mkdir -p /LLM-App/binaries/bin_gpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_gpu/
RUN rm -rf /LLM-App/llama.cpp/build
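# Rebuild without CUDA to produce a CPU-only fallback.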
RUN cmake -B build -DGGML_CUDA=OFF
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_cpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_cpu/
WORKDIR /LLM-App
# Install Miniconda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -O /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
RUN bash /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -b -p /home/user/miniconda3
ENV PATH="/home/user/miniconda3/bin:$PATH"
RUN rm /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
# Python dependencies: the agentic flow (back end of the chat web UI), Pandas, Streamlit (the HTTP server), and huggingface-cli for downloading GGUF files from HuggingFace.
COPY --chown=user requirements.txt .
RUN pip install -r requirements.txt
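# requirements.txt is not reproduced here; judging from the comment above and the imports
# in the cache-warm step below, it presumably pins packages along the lines of streamlit,
# pandas, huggingface_hub (for huggingface-cli) and llama-index (illustrative, not verified).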
# The back-end system.
COPY --chown=user run_local_llm_server.py .
COPY --chown=user utils.py .
COPY --chown=user prompt_parsing.py .
COPY --chown=user server_ui.py .
COPY --chown=user fast-backward.gif .
# Vector store index (for RAG).
COPY --chown=user vector_store_index ./vector_store_index
# The entry-point (the server).
COPY --chown=user total_run.sh .
RUN chmod +x total_run.sh
# Clean up: drop the apt cache and the llama.cpp sources. (Files removed in later layers still occupy space in the layers that created them, so the size saving here is limited.)
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm llama.cpp.tar.gz
RUN rm -rf llama.cpp
# Switch to the "user" user
USER user
# Build arguments: HF_TOKEN for huggingface-cli, plus the API key for the local LLM server.
ARG HF_TOKEN
ARG LOCAL_LLM_API_KEY
# Set home to the user's home directory
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH
ENV PATH="/home/user/miniconda3/bin:$PATH"
ENV HF_TOKEN=$HF_TOKEN
ENV LOCAL_LLM_API_KEY=$LOCAL_LLM_API_KEY
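# Note: ENV values are stored in the image configuration, so these tokens are readable by
# anyone with access to the image; passing them at run time instead would avoid that.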
# Warm the caches at build time: download the BAAI/bge-small-en-v1.5 embedding model, set the tiktoken cache directory, and verify that the vector store index loads.
RUN python -c "import os; from llama_index.embeddings.huggingface import HuggingFaceEmbedding; from llama_index.core import Settings, StorageContext, load_index_from_storage; from llama_index.core.llms.mock import MockLLM; Settings.llm = MockLLM(); Settings.embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', device='cpu'); os.environ['TIKTOKEN_CACHE_DIR'] = f'{os.getcwd()}/tiktoken_cache'; storage_context = StorageContext.from_defaults(persist_dir='vector_store_index'); index = load_index_from_storage(storage_context)"
# The entry point: run the start-up script.
CMD ["/usr/bin/bash", "./total_run.sh"]
# CMD tail -f /dev/null