From ab178246423324aa467a20d0ae894ae54a255498 Mon Sep 17 00:00:00 2001
From: Brian
Date: Fri, 1 Nov 2024 19:55:10 -0400
Subject: [PATCH] Reduce the size of the NVIDIA CUDA Containerfile

Split the Containerfile into a multi-stage build: compile llama.cpp and
whisper.cpp in the CUDA -devel image, then copy only the resulting
binaries and shared libraries into a CUDA -runtime image, so the
compiler toolchain, build dependencies, and source trees are no longer
shipped in the final image.

Signed-off-by: Brian
---
 container-images/cuda/Containerfile | 94 +++++++++++++----------------
 1 file changed, 42 insertions(+), 52 deletions(-)

diff --git a/container-images/cuda/Containerfile b/container-images/cuda/Containerfile
index fecc744b..244093f2 100644
--- a/container-images/cuda/Containerfile
+++ b/container-images/cuda/Containerfile
@@ -1,4 +1,5 @@
-FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9
+# Base image with CUDA for compilation
+FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder
 
 # renovate: datasource=github-releases depName=huggingface/huggingface_hub extractVersion=^v(?<version>.*)
 ARG HUGGINGFACE_HUB_VERSION=0.26.2
@@ -8,64 +9,53 @@ ARG LLAMA_CPP_SHA=3f1ae2e32cde00c39b96be6d01c2997c29bae555
 # renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
 ARG WHISPER_CPP_SHA=4e10afb5a94469c605aae4eceb4021fb0e68c8f5
 
-# vulkan-headers vulkan-loader-devel vulkan-tools glslc glslang python3-pip mesa-libOpenCL-$MESA_VER.aarch64
-RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-    crb enable && \
-    dnf install -y epel-release && \
-    dnf --enablerepo=ubi-9-appstream-rpms install -y git procps-ng vim \
-        dnf-plugins-core python3-dnf-plugin-versionlock cmake gcc-c++ \
-        python3-pip && \
-    dnf clean all && \
-    rm -rf /var/cache/*dnf*
-
-RUN /usr/bin/python3 --version
-RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}"
-RUN pip install "omlmd==${OMLMD_VERSION}"
-
-# CUDA_DOCKER_ARCH =
-# Hopper GPUs (e.g., H100): Use 90
-# Ampere GPUs (e.g., RTX 30 Series, A100): Use 80
-# Turing GPUs (e.g., RTX 20 Series, GTX 16 Series): Use 75
-# Volta GPUs (e.g., V100): Use 70
-# Pascal GPUs (e.g., GTX 10 Series): Use 61
-# Maxwell GPUs (e.g., GTX 900 Series): Use 52
-# Kepler GPUs (e.g., GTX 600 and 700 Series): Use 35
-
-# Change to your gpu architecture (Optional)
 ARG CUDA_DOCKER_ARCH=default
 
-# Followed https://github.com/ggerganov/llama.cpp/blob/master/.devops/full-cuda.Dockerfile
-# for reference to build llama.cpp with cuda using cmake
+# Install dependencies only needed for building
+RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    dnf install -y git cmake gcc-c++ python3-pip && \
+    dnf clean all && rm -rf /var/cache/*dnf*
 
-RUN git clone https://github.com/ggerganov/llama.cpp && \
-    cd llama.cpp && \
+# Install Python packages
+RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}" "omlmd==${OMLMD_VERSION}"
+
+# Build llama.cpp
+RUN git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && \
     git reset --hard ${LLAMA_CPP_SHA} && \
     cmake -B build -DGGML_CUDA=ON -DCUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
-    # Move llama-cli and llama-server to /usr/bin
-    mv build/bin/llama-cli /usr/bin/llama-cli && \
-    mv build/bin/llama-server /usr/bin/llama-server && \
-    # Move shared libraries to a standard library directory
-    mv build/ggml/src/libggml.so /usr/lib/libggml.so && \
-    mv build/src/libllama.so /usr/lib/libllama.so && \
-    # Update the dynamic linker cache
-    ldconfig && \
-    # Clean up
-    cd / && \
-    rm -rf llama.cpp
+    mv build/bin/llama-cli /usr/bin/ && mv build/bin/llama-server /usr/bin/ && \
+    mv build/ggml/src/libggml.so /usr/lib/ && mv build/src/libllama.so /usr/lib/ && \
+    cd / && rm -rf llama.cpp
 
-RUN git clone https://github.com/ggerganov/whisper.cpp.git && \
-    cd whisper.cpp && \
+# Build whisper.cpp
+RUN git clone https://github.com/ggerganov/whisper.cpp && cd whisper.cpp && \
     git reset --hard ${WHISPER_CPP_SHA} && \
     cmake -B build -DGGML_CUDA=ON -DCUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
-    # Move whisper binaries to /usr/bin
-    mv build/bin/main /usr/bin/whisper-main && \
-    mv build/bin/server /usr/bin/whisper-server && \
-    # Move any shared libraries to a standard library directory if needed
-    if [ -f build/lib/libwhisper.so ]; then mv build/lib/libwhisper.so /usr/lib/libwhisper.so; fi && \
-    # Update the dynamic linker cache if any shared libraries were moved
-    ldconfig || true && \
-    # Clean up
-    cd / && \
-    rm -rf whisper.cpp
\ No newline at end of file
+    mv build/bin/main /usr/bin/whisper-main && mv build/bin/server /usr/bin/whisper-server && \
+    if [ -f build/lib/libwhisper.so ]; then mv build/lib/libwhisper.so /usr/lib/; fi && \
+    cd / && rm -rf whisper.cpp
+
+# Final runtime image
+FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
+
+# renovate: datasource=github-releases depName=huggingface/huggingface_hub extractVersion=^v(?<version>.*)
+ARG HUGGINGFACE_HUB_VERSION=0.26.2
+# renovate: datasource=github-releases depName=containers/omlmd extractVersion=^v(?<version>.*)
+ARG OMLMD_VERSION=0.1.6
+
+# Install minimal runtime dependencies
+RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    dnf install -y python3 python3-pip && dnf clean all && rm -rf /var/cache/*dnf*
+
+# Install Python packages in the runtime image
+RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}" "omlmd==${OMLMD_VERSION}"
+
+# Copy only necessary files from the build stage
+COPY --from=builder /usr/bin/llama-cli /usr/bin/llama-server /usr/bin/
+COPY --from=builder /usr/bin/whisper-main /usr/bin/whisper-server /usr/bin/
+COPY --from=builder /usr/lib/libggml.so /usr/lib/libllama.so /usr/lib/
+
+# Update dynamic linker cache
+RUN ldconfig || true
\ No newline at end of file
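
Not part of the patch: a minimal sketch of how to sanity-check the change,
assuming podman is available and the commands run from the repository root;
the ramalama-cuda image tag is illustrative, not something the patch defines:

    # Build with the new multi-stage Containerfile, then compare the
    # reported SIZE against an image built from the pre-patch file
    podman build -t ramalama-cuda -f container-images/cuda/Containerfile .
    podman images ramalama-cuda

    # Verify the binaries copied out of the builder stage still resolve
    # their shared libraries (libggml.so and libllama.so now sit in
    # /usr/lib of the runtime image, and ldconfig was re-run there)
    podman run --rm ramalama-cuda ldd /usr/bin/llama-server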