# Official NVIDIA image: CUDA 12.8.1 + cuDNN runtime on Ubuntu 24.04.
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04
# The base OS already has a user/group "ubuntu" occupying ID 1000, so move them out of the way!
RUN groupmod -g 1001 ubuntu && usermod -u 1001 -g 1001 ubuntu
# Set up a new user named "user" with user ID 1000.
RUN useradd -m -u 1000 user
USER user
WORKDIR /LLM-App
### Root still needs to run the steps from here:
USER root
# Refresh the package index, install wget, and clear the apt cache afterwards.
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
# Runtime libraries needed by llama-server.
RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1
# The llama-server: build it from source code.
COPY --chown=user llama.cpp.tar.gz .
RUN tar -xvzf llama.cpp.tar.gz
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
RUN apt-get -y install apt-transport-https ca-certificates gnupg software-properties-common wget
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ bionic main'
RUN apt-get update
RUN apt-get -y install cmake
RUN cmake --version
RUN apt-get update && apt-get upgrade -y
RUN apt autoremove -y
RUN apt-get -y install gcc
RUN gcc --version
RUN apt-get update
RUN apt-get -y install g++
RUN g++ --version
RUN apt-get -y install nvidia-cuda-toolkit
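# The 'runtime' base image ships no CUDA compiler; nvcc comes from Ubuntu's
# nvidia-cuda-toolkit package so the GGML_CUDA build below can succeed.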
RUN apt-get -y install git
RUN git --version
WORKDIR /LLM-App/llama.cpp
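# llama.cpp is built twice: first with CUDA for GPU hosts (kept in binaries/bin_gpu),
# then CPU-only as a fallback (binaries/bin_cpu).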
RUN cmake -B build -DGGML_CUDA=ON
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_gpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_gpu/
RUN rm -rf /LLM-App/llama.cpp/build
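# Second pass: a clean rebuild without CUDA, so the app can still run on hardware without a GPU.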
RUN cmake -B build -DGGML_CUDA=OFF
RUN cmake --build build --config Release
RUN mkdir -p /LLM-App/binaries/bin_cpu
RUN cp -a /LLM-App/llama.cpp/build/bin/. /LLM-App/binaries/bin_cpu/
WORKDIR /LLM-App
# Install Miniconda.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -O /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
RUN bash /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh -b -p /home/user/miniconda3
ENV PATH="/home/user/miniconda3/bin:$PATH"
RUN rm /home/user/Miniconda3-py312_25.1.1-2-Linux-x86_64.sh
# Python dependencies: the agentic flow (back-end of the chat web UI), Pandas,
# Streamlit (the HTTP server), and huggingface-cli for downloading GGUF models
# from Hugging Face.
COPY --chown=user requirements.txt .
RUN pip install -r requirements.txt
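# requirements.txt is not reproduced here; judging from the comment above and the
# warm-up step below, it presumably pins something like streamlit, pandas,
# llama-index (plus llama-index-embeddings-huggingface), and huggingface_hub[cli].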
# The back-end system.
COPY --chown=user run_local_llm_server.py .
COPY --chown=user utils.py .
COPY --chown=user prompt_parsing.py .
COPY --chown=user server_ui.py .
COPY --chown=user fast-backward.gif .
# Vector store index (for RAG).
COPY --chown=user vector_store_index ./vector_store_index
# The entry point (the server).
COPY --chown=user total_run.sh .
RUN chmod +x total_run.sh
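# total_run.sh itself is not shown here; a sketch of what a script like it would do,
# given the pieces above (names and flags below are assumptions, not the actual script):
#   huggingface-cli download <repo> <model>.gguf          # fetch the GGUF weights
#   ./binaries/bin_gpu/llama-server -m <model>.gguf &     # or bin_cpu if no GPU is present
#   streamlit run server_ui.py                            # the chat web UI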
# Reduce the size of the image.
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm llama.cpp.tar.gz
RUN rm -rf llama.cpp
# Switch to the "user" user.
USER user
# For huggingface-cli.
ARG HF_TOKEN
ARG LOCAL_LLM_API_KEY
# Set home to the user's home directory.
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH
ENV PATH="/home/user/miniconda3/bin:$PATH"
ENV HF_TOKEN=$HF_TOKEN
ENV LOCAL_LLM_API_KEY=$LOCAL_LLM_API_KEY
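# HF_TOKEN and LOCAL_LLM_API_KEY must be supplied at build time, e.g. (hypothetical values):
#   docker build --build-arg HF_TOKEN=hf_xxx --build-arg LOCAL_LLM_API_KEY=some-key -t llm-app .
# Caveat: baking them into ENV records them in the image history, so keep the image private.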
# Warm the Hugging Face caches: download the embedding model and verify the RAG index loads.
RUN python -c "import os; from llama_index.embeddings.huggingface import HuggingFaceEmbedding; from llama_index.core import Settings, StorageContext, load_index_from_storage; from llama_index.core.llms.mock import MockLLM; Settings.llm = MockLLM(); Settings.embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', device='cpu'); os.environ['TIKTOKEN_CACHE_DIR'] = f'{os.getcwd()}/tiktoken_cache'; storage_context = StorageContext.from_defaults(persist_dir='vector_store_index'); index = load_index_from_storage(storage_context)"
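# The one-liner above, spelled out for readability (this is exactly what it runs):
#   Settings.llm = MockLLM()                       # no real LLM needed just to warm caches
#   Settings.embed_model = HuggingFaceEmbedding(
#       model_name='BAAI/bge-small-en-v1.5', device='cpu')      # downloads the embedding model
#   os.environ['TIKTOKEN_CACHE_DIR'] = f'{os.getcwd()}/tiktoken_cache'
#   storage_context = StorageContext.from_defaults(persist_dir='vector_store_index')
#   index = load_index_from_storage(storage_context)  # fails the build if the index is broken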
# The main serving loop.
CMD ["/usr/bin/bash", "./total_run.sh"]
# CMD tail -f /dev/null  # (debugging alternative: just keep the container alive)
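# To try the image locally (sketch; the exposed port depends on how Streamlit is
# configured in total_run.sh: 8501 is Streamlit's default, 7860 the Spaces convention):
#   docker run --gpus all -p 8501:8501 llm-app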