# Official NVIDIA image: CUDA 12.8 + cuDNN runtime on Ubuntu 24.04.
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04

# The base OS already ships a user/group "ubuntu" occupying ID 1000 ➔ move them out of the way!
RUN groupmod -g 1001 ubuntu && usermod -u 1001 -g 1001 ubuntu

# Set up a new user named "user" with user ID 1000.
RUN useradd -m -u 1000 user
USER user
WORKDIR /LLM-App

### Root still runs the installation steps from here:
USER root

# Prepare for further installs; install wget and clean the apt cache first.
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*

# Runtime dependencies for llama-server.
RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1

# The llama-server ➔ build llama.cpp from source.
COPY --chown=user llama.cpp.tar.gz .
RUN tar -xvzf llama.cpp.tar.gz

# Build toolchain: CMake (from the Kitware repository), GCC/G++, the CUDA toolkit, and Git.
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
RUN apt-get -y install apt-transport-https ca-certificates gnupg software-properties-common wget
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ bionic main'
RUN apt-get update
RUN apt-get -y install cmake
RUN cmake --version
RUN apt-get update && apt-get upgrade -y
RUN apt-get autoremove -y
RUN apt-get -y install gcc
RUN gcc --version
RUN apt-get update
RUN apt-get -y install g++
RUN g++ --version
RUN apt-get -y install nvidia-cuda-toolkit
RUN apt-get -y install git
RUN git --version

# Build llama.cpp twice: once with CUDA (GPU binaries), once without (CPU fallback binaries).
WORKDIR /LLM-App/llama.cpp
RUN cmake -B build -DGGML_CUDA=ON
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_gpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_gpu/
RUN rm -rf /LLM-App/llama.cpp/build
RUN cmake -B build -DGGML_CUDA=OFF
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_cpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_cpu/
WORKDIR /LLM-App

# Install Miniconda.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -O /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
RUN bash /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -b -p /home/user/miniconda3
ENV PATH="/home/user/miniconda3/bin:$PATH"
RUN rm /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh

# Python dependencies: the agentic flow (back-end of the chat web UI), Pandas,
# Streamlit (the HTTP server), and huggingface-cli for downloading GGUF models from HuggingFace.
COPY --chown=user requirements.txt .
RUN pip install -r requirements.txt

# The back-end system.
COPY --chown=user run_local_llm_server.py .
COPY --chown=user utils.py .
COPY --chown=user prompt_parsing.py .
COPY --chown=user server_ui.py .
COPY --chown=user fast-backward.gif .

# Vector store index (for RAG).
COPY --chown=user vector_store_index ./vector_store_index

# The entry-point (the server).
COPY --chown=user total_run.sh .
RUN chmod +x total_run.sh

# Reduce the image size.
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm llama.cpp.tar.gz
RUN rm -rf llama.cpp

# Switch to the non-root "user" account.
USER user

# Build arguments: HF_TOKEN for huggingface-cli, LOCAL_LLM_API_KEY for the local LLM API.
ARG HF_TOKEN
ARG LOCAL_LLM_API_KEY

# Set home to the user's home directory and expose the build arguments to the runtime environment.
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH
ENV PATH="/home/user/miniconda3/bin:$PATH"
ENV HF_TOKEN=$HF_TOKEN
ENV LOCAL_LLM_API_KEY=$LOCAL_LLM_API_KEY

# Warm the HuggingFace caches at build time: download the embedding model and load the vector store index once.
RUN python -c "import os; \
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding; \
    from llama_index.core import Settings, StorageContext, load_index_from_storage; \
    from llama_index.core.llms.mock import MockLLM; \
    Settings.llm = MockLLM(); \
    Settings.embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', device='cpu'); \
    os.environ['TIKTOKEN_CACHE_DIR'] = f'{os.getcwd()}/tiktoken_cache'; \
    storage_context = StorageContext.from_defaults(persist_dir='vector_store_index'); \
    index = load_index_from_storage(storage_context)"

# The entry point ➔ run the server loop.
CMD ["/usr/bin/bash", "./total_run.sh"]

# Debug alternative (keeps the container alive without starting the server):
# CMD tail -f /dev/null
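
# total_run.sh itself is not shown in this Dockerfile. A minimal sketch of what such an
# entry-point script could look like, given the pieces built above (hypothetical ➔ the model
# repository, GGUF file name, ports and flags below are placeholders, not the actual script):
#
#   #!/usr/bin/env bash
#   # Fetch the GGUF model with huggingface-cli (authenticated via HF_TOKEN).
#   huggingface-cli download <repo_id> <model>.gguf --local-dir ./models
#   # Pick the CUDA build if a GPU is visible, otherwise fall back to the CPU build.
#   if nvidia-smi > /dev/null 2>&1; then BIN=./binaries/bin_gpu; else BIN=./binaries/bin_cpu; fi
#   # Serve the model, then start the Streamlit front-end on top of it.
#   "$BIN"/llama-server -m ./models/<model>.gguf --port 8080 --api-key "$LOCAL_LLM_API_KEY" &
#   streamlit run server_ui.py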
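
# Example build and run commands (a sketch; the image tag and port mapping are assumptions ➔
# 8501 is Streamlit's default port, and --gpus all requires the NVIDIA Container Toolkit):
#   docker build --build-arg HF_TOKEN=<your_hf_token> --build-arg LOCAL_LLM_API_KEY=<your_api_key> -t llm-app .
#   docker run --gpus all -p 8501:8501 llm-app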