# Compile llama-server from source code (instead of using binaries).
# Official NVIDIA image: CUDA 12.8 runtime + cuDNN on Ubuntu 24.04.
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04
# The base OS already has a user/group "ubuntu" occupying ID 1000 → move them so ID 1000 is free.
RUN groupmod -g 1001 ubuntu && usermod -u 1001 -g 1001 ubuntu
# Set up a new user named "user" with user ID 1000.
RUN useradd -m -u 1000 user
USER user
WORKDIR /LLM-App
### Switch back to root for the remaining installation steps:
USER root
# Refresh the package index, install wget, and clean up the apt lists.
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
# For llama-server.
RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1
# llama-server → build it from source code.
COPY --chown=user llama.cpp.tar.gz .
RUN tar -xvzf llama.cpp.tar.gz
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
RUN apt-get -y install apt-transport-https ca-certificates gnupg software-properties-common wget
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
# Use the "noble" suite to match the Ubuntu 24.04 base image ("bionic" targets 18.04).
RUN apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ noble main'
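# Note: apt-key is deprecated on recent Ubuntu releases. If it is unavailable, a commonly
# documented alternative (not used in the original build) is a dedicated keyring:
# RUN wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg
# RUN echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' > /etc/apt/sources.list.d/kitware.list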
RUN apt-get update
RUN apt-get -y install cmake
RUN cmake --version
RUN apt-get update && apt-get upgrade -y
RUN apt-get autoremove -y
RUN apt-get -y install gcc
RUN gcc --version
RUN apt-get update
RUN apt-get -y install g++
RUN g++ --version
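# Note: Ubuntu's nvidia-cuda-toolkit package ships its own CUDA version, which may lag
# behind the 12.8 runtime in the base image. An alternative (assumption, not used here)
# is to build from an nvidia/cuda:*-devel image, which already includes nvcc.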
RUN apt-get -y install nvidia-cuda-toolkit
RUN apt-get -y install git
RUN git --version
WORKDIR /LLM-App/llama.cpp
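# Build the CUDA-enabled binaries (GGML_CUDA=ON).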
RUN cmake -B build -DGGML_CUDA=ON
RUN cmake --build build --config Release
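# Optional tweak (assumption, not part of the original build): restricting the CUDA
# architectures to the target GPU and building in parallel can cut compile time, e.g.
# RUN cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86"
# RUN cmake --build build --config Release -j"$(nproc)"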
RUN mkdir -p /LLM-App/binaries/bin_gpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_gpu/
RUN rm -rf /LLM-App/llama.cpp/build
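# Rebuild without CUDA to produce a CPU-only fallback.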
RUN cmake -B build -DGGML_CUDA=OFF
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_cpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_cpu/
WORKDIR /LLM-App
# Install Miniconda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -O /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
RUN bash /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -b -p /home/user/miniconda3
ENV PATH="/home/user/miniconda3/bin:$PATH"
RUN rm /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
# Python dependencies: the agentic flow (back end of the chat web UI), Pandas, Streamlit (the HTTP server), and huggingface-cli for downloading GGUF files from HuggingFace.
COPY --chown=user requirements.txt .
RUN pip install -r requirements.txt
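# requirements.txt is not reproduced here; judging from the comment above and the imports
# in the cache-warm step below, it presumably pins packages along the lines of streamlit,
# pandas, huggingface_hub (for huggingface-cli) and llama-index (illustrative, not verified).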
# The back-end system.
COPY --chown=user run_local_llm_server.py .
COPY --chown=user utils.py .
COPY --chown=user prompt_parsing.py .
COPY --chown=user server_ui.py .
COPY --chown=user fast-backward.gif .
# Vector store index (for RAG).
COPY --chown=user vector_store_index ./vector_store_index
# The entry-point (the server).
COPY --chown=user total_run.sh .
RUN chmod +x total_run.sh
# Clean up: drop the apt cache and the llama.cpp sources. (Files removed in later layers still occupy space in the layers that created them, so the size saving here is limited.)
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm llama.cpp.tar.gz
RUN rm -rf llama.cpp
# Switch to the "user" user
USER user
# Build arguments: HF_TOKEN for huggingface-cli, plus the API key for the local LLM server.
ARG HF_TOKEN
ARG LOCAL_LLM_API_KEY
# Set home to the user's home directory
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH
ENV PATH="/home/user/miniconda3/bin:$PATH"
ENV HF_TOKEN=$HF_TOKEN
ENV LOCAL_LLM_API_KEY=$LOCAL_LLM_API_KEY
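# Note: ENV values are stored in the image configuration, so these tokens are readable by
# anyone with access to the image; passing them at run time instead would avoid that.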
# Warm the caches at build time: download the BAAI/bge-small-en-v1.5 embedding model, set the tiktoken cache directory, and verify that the vector store index loads.
RUN python -c "import os; from llama_index.embeddings.huggingface import HuggingFaceEmbedding; from llama_index.core import Settings, StorageContext, load_index_from_storage; from llama_index.core.llms.mock import MockLLM; Settings.llm = MockLLM(); Settings.embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', device='cpu'); os.environ['TIKTOKEN_CACHE_DIR'] = f'{os.getcwd()}/tiktoken_cache'; storage_context = StorageContext.from_defaults(persist_dir='vector_store_index'); index = load_index_from_storage(storage_context)"
# The entry point: run the start-up script.
CMD ["/usr/bin/bash", "./total_run.sh"]
# CMD tail -f /dev/null