reduced the size of the nvidia containerfile #407

Merged 1 commit on Nov 2, 2024
container-images/cuda/Containerfile (94 changes: 42 additions & 52 deletions)
@@ -1,4 +1,5 @@
-FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9
+# Base image with CUDA for compilation
+FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder

# renovate: datasource=github-releases depName=huggingface/huggingface_hub extractVersion=^v(?<version>.*)
ARG HUGGINGFACE_HUB_VERSION=0.26.2
@@ -8,64 +9,53 @@ ARG LLAMA_CPP_SHA=3f1ae2e32cde00c39b96be6d01c2997c29bae555
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=4e10afb5a94469c605aae4eceb4021fb0e68c8f5

-# vulkan-headers vulkan-loader-devel vulkan-tools glslc glslang python3-pip mesa-libOpenCL-$MESA_VER.aarch64
-RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-crb enable && \
-dnf install -y epel-release && \
-dnf --enablerepo=ubi-9-appstream-rpms install -y git procps-ng vim \
-dnf-plugins-core python3-dnf-plugin-versionlock cmake gcc-c++ \
-python3-pip && \
-dnf clean all && \
-rm -rf /var/cache/*dnf*
-
-RUN /usr/bin/python3 --version
-RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}"
-RUN pip install "omlmd==${OMLMD_VERSION}"

-# CUDA_DOCKER_ARCH =
-# Hopper GPUs (e.g., H100): Use 90
-# Ampere GPUs (e.g., RTX 30 Series, A100): Use 80
-# Turing GPUs (e.g., RTX 20 Series, GTX 16 Series): Use 75
-# Volta GPUs (e.g., V100): Use 70
-# Pascal GPUs (e.g., GTX 10 Series): Use 61
-# Maxwell GPUs (e.g., GTX 900 Series): Use 52
-# Kepler GPUs (e.g., GTX 600 and 700 Series): Use 35
-
-# Change to your gpu architecture (Optional)
ARG CUDA_DOCKER_ARCH=default

-# Followed https://github.com/ggerganov/llama.cpp/blob/master/.devops/full-cuda.Dockerfile
-# for reference to build llama.cpp with cuda using cmake
+# Install dependencies only needed for building
+RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+dnf install -y git cmake gcc-c++ python3-pip && \
+dnf clean all && rm -rf /var/cache/*dnf*

-RUN git clone https://github.com/ggerganov/llama.cpp && \
-cd llama.cpp && \
+# Install Python packages
+RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}" "omlmd==${OMLMD_VERSION}"
+
+# Build llama.cpp
+RUN git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && \
git reset --hard ${LLAMA_CPP_SHA} && \
cmake -B build -DGGML_CUDA=ON -DCUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
-# Move llama-cli and llama-server to /usr/bin
-mv build/bin/llama-cli /usr/bin/llama-cli && \
-mv build/bin/llama-server /usr/bin/llama-server && \
-# Move shared libraries to a standard library directory
-mv build/ggml/src/libggml.so /usr/lib/libggml.so && \
-mv build/src/libllama.so /usr/lib/libllama.so && \
-# Update the dynamic linker cache
-ldconfig && \
-# Clean up
-cd / && \
-rm -rf llama.cpp
+mv build/bin/llama-cli /usr/bin/ && mv build/bin/llama-server /usr/bin/ && \
+mv build/ggml/src/libggml.so /usr/lib/ && mv build/src/libllama.so /usr/lib/ && \
+cd / && rm -rf llama.cpp

-RUN git clone https://github.com/ggerganov/whisper.cpp.git && \
-cd whisper.cpp && \
+# Build whisper.cpp
+RUN git clone https://github.com/ggerganov/whisper.cpp && cd whisper.cpp && \
git reset --hard ${WHISPER_CPP_SHA} && \
cmake -B build -DGGML_CUDA=ON -DCUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
-# Move whisper binaries to /usr/bin
-mv build/bin/main /usr/bin/whisper-main && \
-mv build/bin/server /usr/bin/whisper-server && \
-# Move any shared libraries to a standard library directory if needed
-if [ -f build/lib/libwhisper.so ]; then mv build/lib/libwhisper.so /usr/lib/libwhisper.so; fi && \
-# Update the dynamic linker cache if any shared libraries were moved
-ldconfig || true && \
-# Clean up
-cd / && \
-rm -rf whisper.cpp
+mv build/bin/main /usr/bin/whisper-main && mv build/bin/server /usr/bin/whisper-server && \
+if [ -f build/lib/libwhisper.so ]; then mv build/lib/libwhisper.so /usr/lib/; fi && \
+cd / && rm -rf whisper.cpp

+# Final runtime image
+FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
+
+# renovate: datasource=github-releases depName=huggingface/huggingface_hub extractVersion=^v(?<version>.*)
+ARG HUGGINGFACE_HUB_VERSION=0.26.2
+# renovate: datasource=github-releases depName=containers/omlmd extractVersion=^v(?<version>.*)
+ARG OMLMD_VERSION=0.1.6
+
+# Install minimal runtime dependencies
+RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+dnf install -y python3 python3-pip && dnf clean all && rm -rf /var/cache/*dnf*
+
+# Install Python packages in the runtime image
+RUN pip install "huggingface_hub==${HUGGINGFACE_HUB_VERSION}" "omlmd==${OMLMD_VERSION}"
+
+# Copy only necessary files from the build stage
+COPY --from=builder /usr/bin/llama-cli /usr/bin/llama-server /usr/bin/
+COPY --from=builder /usr/bin/whisper-main /usr/bin/whisper-server /usr/bin/
+COPY --from=builder /usr/lib/libggml.so /usr/lib/libllama.so /usr/lib/
+
+# Update dynamic linker cache
+RUN ldconfig || true
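Since the point of this change is a smaller image, one way to verify it locally is to build from the updated Containerfile and compare sizes. A minimal sketch, assuming podman and the repository root as the build context; the ramalama-cuda tag is only an illustrative name, not something defined by this PR:

# Build the image from the updated Containerfile (optionally add --build-arg CUDA_DOCKER_ARCH=<value> to target a specific GPU generation)
podman build -t ramalama-cuda -f container-images/cuda/Containerfile .

# Compare the reported SIZE against an image built from the previous single-stage Containerfile
podman images ramalama-cuda

# Spot-check that the binaries copied from the builder stage still resolve libggml.so and libllama.so
podman run --rm ramalama-cuda ldd /usr/bin/llama-server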