From d7d2615b92522df065c82055633b648d0ed25453 Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 26 Nov 2023 23:18:19 -0800 Subject: [PATCH 01/15] Pin pip version to 23.3 --- docker/dockerfile.merlin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index d4d95c4da..fcd670343 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -95,7 +95,7 @@ RUN ln -s /usr/bin/python3 /usr/bin/python # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859 # 2023-02-22: pynvml==11.5.0 is currently incompatible with our version of dask/distributed # 2023-10-06: onnxruntime==1.15.1 the latest version changed api which is not compatible with hugectr -RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ +RUN pip install --no-cache-dir --upgrade pip==23.3; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ fastrlock nvidia-pyindex pybind11 pytest \ transformers==4.27.1 tensorflow-metadata betterproto \ cachetools graphviz nvtx scipy "scikit-learn<1.2" \ From a100ee3bba91a619242a556f8915347156efad0c Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 27 Nov 2023 02:12:36 -0800 Subject: [PATCH 02/15] Update installation for lightfm --- docker/dockerfile.merlin | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index fcd670343..b7fe68516 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -95,14 +95,14 @@ RUN ln -s /usr/bin/python3 /usr/bin/python # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859 # 2023-02-22: pynvml==11.5.0 is currently incompatible with our version of dask/distributed # 2023-10-06: onnxruntime==1.15.1 the latest version changed api which is not compatible with hugectr -RUN pip install --no-cache-dir --upgrade pip==23.3; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build 
pandas==1.5.2 \ +RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ fastrlock nvidia-pyindex pybind11 pytest \ transformers==4.27.1 tensorflow-metadata betterproto \ cachetools graphviz nvtx scipy "scikit-learn<1.2" \ tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \ git+https://github.com/rapidsai/asvdb.git@main \ xgboost==1.6.2 lightgbm \ - lightfm implicit \ + implicit \ numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 @@ -299,7 +299,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ -RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 lightfm ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter ENV JUPYTER_DATA_DIR=/tmp/.jupyter From 1e575b245b0b6273e25f0fe345e2f7289e9ad602 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 27 Nov 2023 21:59:41 -0800 Subject: [PATCH 03/15] Try to install without fast_math --- docker/dockerfile.merlin | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index b7fe68516..7b90e6d0c 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -299,7 +299,9 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ -RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 lightfm +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 +ENV LIGHTFM_NO_CFLAGS=1 +RUN pip install --no-cache-dir -v --debug lightfm ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter ENV JUPYTER_DATA_DIR=/tmp/.jupyter From 1bf627bbfdec2a5939e49a76c2773aa4df14983b Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 3 Dec 2023 23:27:11 -0800 Subject: [PATCH 04/15] Update version of libdcgm.so for new base --- docker/dockerfile.merlin | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 7b90e6d0c..72054cf3c 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -241,11 +241,11 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. 
-COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp +COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.3 /tmp RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ - mv /tmp/libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ - chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ - ln -s libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so + mv /tmp/libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ + chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ + ln -s libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so ENV PATH=/opt/tritonserver/bin:${PATH}: From 33abc3f556d410baadfa7a6a166d2e3bec6d3f68 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 4 Dec 2023 22:41:46 -0800 Subject: [PATCH 05/15] Try a newer numpy 1.24 --- docker/dockerfile.merlin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 72054cf3c..8f8d99b40 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -106,7 +106,7 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 -RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda +RUN pip install --no-cache-dir numpy==1.24.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com @@ -299,7 +299,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ -RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.24.0 ENV LIGHTFM_NO_CFLAGS=1 RUN pip install --no-cache-dir -v --debug lightfm From 9fbdd8ad9256d1d667e1bde9c708316f723e8e99 Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 10 Dec 2023 19:16:06 -0800 Subject: [PATCH 06/15] Create merlin-base min for hugectr release --- docker/dockerfile.ctr | 3 +- docker/dockerfile.merlin.min | 374 +++++++++++++++++++++++++++++++++++ docker/dockerfile.tf | 8 +- 3 files changed, 379 insertions(+), 6 deletions(-) create mode 100644 docker/dockerfile.merlin.min diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr index b67e766b9..862faa737 100644 --- a/docker/dockerfile.ctr +++ b/docker/dockerfile.ctr @@ -1,6 +1,5 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=23.06 -ARG TRITON_VERSION=23.06 +ARG MERLIN_VERSION=23.11 ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION} diff --git a/docker/dockerfile.merlin.min b/docker/dockerfile.merlin.min new file mode 100644 index 000000000..4871f50b5 --- /dev/null +++ b/docker/dockerfile.merlin.min @@ -0,0 +1,374 @@ +# syntax=docker/dockerfile:1.2 +ARG TRITON_VERSION=23.11 +ARG DLFW_VERSION=23.11 + +ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 +ARG SDK_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-sdk +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min +ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${TRITON_VERSION}-tf2-py3 + +FROM ${FULL_IMAGE} as triton +FROM ${SDK_IMAGE} as sdk +FROM ${DLFW_IMAGE} as dlfw +FROM ${BASE_IMAGE} as build + +# Args +ARG TARGETOS +ARG TARGETARCH + +ARG DASK_VER=2023.1.1 + +# Envs +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_PATH=$CUDA_HOME +ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs +ENV 
DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib +ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin + +# Set up NVIDIA package repository +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ + apt clean && apt update -y --fix-missing && \ + apt install -y --no-install-recommends software-properties-common && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \ + apt install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + ca-certificates \ + clang-format \ + curl \ + datacenter-gpu-manager \ + git \ + libarchive-dev \ + libb64-dev \ + libboost-serialization-dev \ + libcurl4-openssl-dev \ + libexpat1-dev \ + libopenblas-dev \ + libre2-dev \ + libsasl2-2 \ + libssl-dev \ + libtbb-dev \ + openssl \ + pkg-config \ + policykit-1 \ + protobuf-compiler \ + python3 \ + python3-pip \ + python3-dev \ + swig \ + rapidjson-dev \ + nlohmann-json3-dev \ + wget \ + zlib1g-dev && \ + apt autoremove -y && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python + +# Install multiple packages + +# cmake 3.25.0 broke find_package(CUDAToolkit), which breaks the FAISS build: +# https://gitlab.kitware.com/cmake/cmake/-/issues/24119 +# A fix has already been merged but not yet released: +# https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859 +# 2023-02-22: pynvml==11.5.0 is currently incompatible with our version of dask/distributed +# 2023-10-06: onnxruntime==1.15.1 the latest version changed api which is not 
compatible with hugectr +RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ + fastrlock nvidia-pyindex pybind11 pytest \ + transformers==4.27.1 tensorflow-metadata betterproto \ + cachetools graphviz nvtx scipy "scikit-learn<1.2" \ + tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \ + git+https://github.com/rapidsai/asvdb.git@main \ + xgboost==1.6.2 lightgbm \ + implicit \ + numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ + pynvml==11.4.1 +RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 +RUN pip install --no-cache-dir numpy==1.24.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda +RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} +RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com + +# Triton Server +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/TRITON_VERSION . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ +# NOTE 2023-09: fil-backend is not available on ARM. Some docker versions flag an error if there is +# not a single source file to copy. To avoid this, we als specify a small dummy file. +COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ +COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. 
+ +ENV PATH=/opt/tritonserver/bin:${PATH}: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib + +# Install faiss (with sm80 support since the faiss-gpu wheels +# don't include it https://github.com/kyamagu/faiss-wheels/issues/54) +RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git build-env && \ + pushd build-env && \ + cmake -B build . -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80;90" && \ + make -C build -j $(nproc) faiss swigfaiss && \ + pushd build/faiss/python && \ + python setup.py install && \ + popd && \ + popd && \ + rm -rf build-env + +# Clean up +RUN rm -rf /repos + +HEALTHCHECK NONE +CMD ["/bin/bash"] + +FROM ${BASE_IMAGE} as base + +# Args +ARG TARGETOS +ARG TARGETARCH + +# Envs +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_PATH=$CUDA_HOME +ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib +ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin + +# Set up NVIDIA package repository +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ + apt update -y --fix-missing && \ + apt install -y --no-install-recommends software-properties-common && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \ + apt install -y --no-install-recommends \ + ca-certificates \ + clang-format \ + curl \ + libcurl4-openssl-dev \ + git \ + graphviz \ + libarchive-dev \ + libb64-dev \ + libboost-serialization-dev \ + 
libexpat1-dev \ + libopenblas-dev \ + libre2-dev \ + libsasl2-2 \ + libssl-dev \ + libtbb-dev \ + openssl \ + policykit-1 \ + protobuf-compiler \ + python3 \ + python3-pip \ + python3-dev \ + python3-libnvinfer \ + rapidjson-dev \ + tree \ + wget \ + zlib1g-dev \ + # Required to build RocksDB and RdKafka. + libgflags-dev \ + libbz2-dev \ + libsnappy-dev \ + liblz4-dev \ + libzstd-dev \ + libsasl2-dev \ + # Required to build Protocol Buffers. + autoconf automake libtool \ + # Required to build Hadoop. + pkg-config \ + libpmem-dev \ + libsnappy-dev \ + # Required to run Hadoop. + openssh-server \ + # [ HugeCTR ] + libaio-dev && \ + apt autoremove -y && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python + +ENV JAVA_HOME=/usr/lib/jvm/default-java +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server + +# Binaries +COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/ +COPY --chown=1000:1000 --from=build /usr/local/bin/pytest /usr/local/bin/ +COPY --chown=1000:1000 --from=sdk /usr/local/bin/perf_* /usr/local/bin/ + +# Triton Server +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/TRITON_VERSION . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf . +COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/ +# NOTE 2023-09: fil-backend is not available on ARM. Some docker versions flag an error if there is +# not a single source file to copy. To avoid this, we als specify a small dummy file. 
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ +COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. +COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.3 /tmp +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ + mv /tmp/libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ + chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ + ln -s libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so + + +ENV PATH=/opt/tritonserver/bin:${PATH}: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib + +# python --version | sed -e 's/[A-Za-z ]*//g' | awk -F'.' '{print $1"."$2}' +ENV PYTHON_VERSION=3.10 + +# Python Packages +COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/ +ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/ + + +# rapids components from the DLFW image +COPY --chown=1000:1000 --from=dlfw /usr/lib/libcudf* /usr/lib/ +COPY --chown=1000:1000 --from=dlfw /usr/lib/libarrow* /usr/lib/ +COPY --chown=1000:1000 --from=dlfw /usr/lib/libparquet* /usr/lib/ +COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Arrow /usr/lib/cmake/Arrow/ +COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Parquet /usr/lib/cmake/Parquet/ +COPY --chown=1000:1000 --from=dlfw /usr/lib/libnvcomp* /usr/lib/ + +COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/ +COPY --chown=1000:1000 --from=dlfw /usr/include/spdlog /usr/include/spdlog/ +COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/ +COPY --chown=1000:1000 --from=dlfw /usr/include/parquet /usr/include/parquet/ +COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/ +COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/ + +# ptx 
compiler required by cubinlinker +RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop; + +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker + + +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf-*.dist-info 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ + +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.24.0 +ENV LIGHTFM_NO_CFLAGS=1 +RUN pip install --no-cache-dir -v --debug lightfm + +ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter +ENV JUPYTER_DATA_DIR=/tmp/.jupyter +ENV JUPYTER_RUNTIME_DIR=/tmp/.jupyter +ENV MERLIN_BASE_MIN=true + +# Optional dependency: Build and install protocol buffers and Hadoop/HDFS. 
+ARG INSTALL_HDFS=false +# Env for HDFS +ENV HADOOP_HOME=/opt/hadoop +ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \ + HDFS_NAMENODE_USER=root \ + HDFS_SECONDARYNAMENODE_USER=root \ + HDFS_DATANODE_USER=root \ + YARN_RESOURCEMANAGER_USER=root \ + YARN_NODEMANAGER_USER=root \ + # Tackles with ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057 + LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \ + # Tackles with JVM setting error signals that the UCX library checks (GitLab issue #425). + UCX_ERROR_SIGNALS='' \ + CLASSPATH=${CLASSPATH}:\ +${HADOOP_HOME}/etc/hadoop/*:\ +${HADOOP_HOME}/share/hadoop/common/*:\ +${HADOOP_HOME}/share/hadoop/common/lib/*:\ +${HADOOP_HOME}/share/hadoop/hdfs/*:\ +${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\ +${HADOOP_HOME}/share/hadoop/mapreduce/*:\ +${HADOOP_HOME}/share/hadoop/yarn/*:\ +${HADOOP_HOME}/share/hadoop/yarn/lib/* + +# Install Inference and HPS Backend +ARG HUGECTR_DEV_MODE=false +ARG HUGECTR_VER=main +ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git" +ARG HUGECTR_BACKEND_VER=main +ARG _CI_JOB_TOKEN="" +ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git" +ARG HUGECTR_HOME=/usr/local/hugectr +ARG TRITON_VERSION + +ENV PATH=$PATH:${HUGECTR_HOME}/bin \ + CPATH=$CPATH:${HUGECTR_HOME}/include \ + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib + +RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \ + # Install HugeCTR inference which is dependency for hps_backend + git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ + cd /hugectr && \ + git submodule update --init --recursive && \ + mkdir build && \ + cd build && \ + if [[ "${INSTALL_HDFS}" == "false" ]]; then \ + cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \ + ; else \ + cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. 
\ + ; fi && \ + make -j$(nproc) && \ + make install && \ + # Install HPS trt pugin + cd ../hps_trt && \ + mkdir build && \ + cd build && \ + cmake -DSM="70;75;80;90" .. && \ + make -j$(nproc) && \ + make install && \ + cd / && rm -rf /hugectr && \ + # Install hps_backend + git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \ + mkdir /repos/hugectr_triton_backend/hps_backend/build && \ + cd /repos/hugectr_triton_backend/hps_backend/build && \ + cmake \ + -DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \ + -DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \ + -DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \ + -DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \ + make -j$(nproc) && \ + make install && \ + cd ../../.. && \ + rm -rf hugectr_triton_backend && \ + chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so \ + ; fi +RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps + +HEALTHCHECK NONE +CMD ["/bin/bash"] +ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index b61adf156..7e4418cc0 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=23.06 -ARG TRITON_VERSION=23.06 -ARG TENSORFLOW_VERSION=23.06 +ARG MERLIN_VERSION=23.11 +ARG TRITON_VERSION=23.11 +ARG TENSORFLOW_VERSION=23.11 ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${TENSORFLOW_VERSION}-tf2-py3 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 @@ -17,7 +17,7 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorflow backe # Tensorflow dependencies (only) # Pinning to pass hugectr sok tests # wrapt 1.5.0 introduce hugectr test failures, so downgrade to 1.14.0 -RUN pip install --no-cache-dir tensorflow==2.12.0 protobuf==3.20.3 wrapt==1.14.0 \ +RUN pip install --no-cache-dir tensorflow==2.14.0 protobuf==3.20.3 wrapt==1.14.0 \ && pip uninstall 
tensorflow keras -y # DLFW Tensorflow packages From 49ceb04ddecd13102eb1fa59ec7d1b47053d106f Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 10 Dec 2023 19:21:08 -0800 Subject: [PATCH 07/15] Skip other merlin components tests --- ci/test_container.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index d41b59fe2..d5420d570 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -17,6 +17,8 @@ if [ $container != 'merlin-ci-runner' ]; then fi ${ci_script_dir}container_software.sh $container $devices -${ci_script_dir}container_integration.sh $container $devices $suppress_failures -${ci_script_dir}container_unit.sh $container $devices +if [ "${MERLIN_BASE_MIN:-}" != "true" ]; then + ${ci_script_dir}container_integration.sh $container $devices $suppress_failures + ${ci_script_dir}container_unit.sh $container $devices +fi From 91d43e6913f1fe9589df3bf402177bb35be2383b Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 10 Dec 2023 21:59:43 -0800 Subject: [PATCH 08/15] Add merlin repo which has test scripts --- docker/dockerfile.merlin.min | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/dockerfile.merlin.min b/docker/dockerfile.merlin.min index 4871f50b5..66cc3e05b 100644 --- a/docker/dockerfile.merlin.min +++ b/docker/dockerfile.merlin.min @@ -292,6 +292,13 @@ ENV JUPYTER_DATA_DIR=/tmp/.jupyter ENV JUPYTER_RUNTIME_DIR=/tmp/.jupyter ENV MERLIN_BASE_MIN=true +ARG MERLIN_VER=main +ENV MERLIN_VER=${MERLIN_VER} + +# Add Merlin Repo +RUN git clone --branch ${MERLIN_VER} --depth 1 https://github.com/NVIDIA-Merlin/Merlin/ /Merlin && \ + cd /Merlin/ && pip install . --no-deps + # Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
ARG INSTALL_HDFS=false # Env for HDFS From 422092f1a9ee1745b83b5e01d342973ada1117dd Mon Sep 17 00:00:00 2001 From: qqiao Date: Sun, 10 Dec 2023 22:55:03 -0800 Subject: [PATCH 09/15] Use the personal repo to test --- docker/dockerfile.merlin.min | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/dockerfile.merlin.min b/docker/dockerfile.merlin.min index 66cc3e05b..9d2f01fd9 100644 --- a/docker/dockerfile.merlin.min +++ b/docker/dockerfile.merlin.min @@ -296,7 +296,7 @@ ARG MERLIN_VER=main ENV MERLIN_VER=${MERLIN_VER} # Add Merlin Repo -RUN git clone --branch ${MERLIN_VER} --depth 1 https://github.com/NVIDIA-Merlin/Merlin/ /Merlin && \ +RUN git clone --branch emma/pin_pip_version --depth 1 https://github.com/EmmaQiaoCh/Merlin.git /Merlin && \ cd /Merlin/ && pip install . --no-deps # Optional dependency: Build and install protocol buffers and Hadoop/HDFS. From 81c40ee8b1a23a0193dc0a8a89635742e745cbeb Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 11 Dec 2023 01:27:11 -0800 Subject: [PATCH 10/15] Add some git log info to debug --- docker/dockerfile.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 7e4418cc0..2bc13d082 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -59,6 +59,7 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \ git clone --branch ${HUGECTR_VER} --depth 1 --recurse-submodules --shallow-submodules https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ pushd /hugectr && \ + git log -n 3 && \ rm -rf .git/modules && \ pip --no-cache-dir install ninja tf2onnx && \ # Install SOK From 11ec71fd6c4d3dfa706e47c55a3cba35b4103de8 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 11 Dec 2023 01:48:53 -0800 Subject: [PATCH 11/15] Update torch path in latest image --- docker/dockerfile.torch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docker/dockerfile.torch b/docker/dockerfile.torch index 4eff5a1b1..49efc4229 100644 --- a/docker/dockerfile.torch +++ b/docker/dockerfile.torch @@ -36,7 +36,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy.dist-info/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-*.egg-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch.egg-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch.dist-info/ # Argumeints "_XXXX" are only valid when $HUGECTR_DEV_MODE==false # Install hps_torch in merlin-pytorch From 51c5b2bda516925f3b7f2f00b3dfe41c138ad276 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 11 Dec 2023 23:35:39 -0800 Subject: [PATCH 12/15] Copy the boost lib and headers --- docker/dockerfile.merlin.min | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/dockerfile.merlin.min b/docker/dockerfile.merlin.min index 9d2f01fd9..d0e4da50d 100644 --- a/docker/dockerfile.merlin.min +++ b/docker/dockerfile.merlin.min @@ -225,6 +225,9 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. 
+COPY --chown=1000:1000 --from=triton /usr/lib/libboost_* /usr/lib/ +COPY --chown=1000:1000 --from=triton /usr/include/boost /usr/include/boost/ +COPY --chown=1000:1000 --from=triton /usr/lib/cmake/boost_* /usr/lib/cmake/ COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.3 /tmp RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ mv /tmp/libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ From 8a6cd9f32e920f01435ab5ebe173dbf9a4ecd0f3 Mon Sep 17 00:00:00 2001 From: qqiao Date: Tue, 12 Dec 2023 00:40:42 -0800 Subject: [PATCH 13/15] Remove faiss and dask --- docker/dockerfile.merlin.min | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/docker/dockerfile.merlin.min b/docker/dockerfile.merlin.min index d0e4da50d..3fb8a8c91 100644 --- a/docker/dockerfile.merlin.min +++ b/docker/dockerfile.merlin.min @@ -16,8 +16,6 @@ FROM ${BASE_IMAGE} as build ARG TARGETOS ARG TARGETARCH -ARG DASK_VER=2023.1.1 - # Envs ENV CUDA_HOME=/usr/local/cuda ENV CUDA_PATH=$CUDA_HOME @@ -91,7 +89,6 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< pynvml==11.4.1 RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 RUN pip install --no-cache-dir numpy==1.24.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda -RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com # Triton Server @@ -112,18 +109,6 @@ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. ENV PATH=/opt/tritonserver/bin:${PATH}: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib -# Install faiss (with sm80 support since the faiss-gpu wheels -# don't include it https://github.com/kyamagu/faiss-wheels/issues/54) -RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git build-env && \ - pushd build-env && \ - cmake -B build . 
-DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80;90" && \ - make -C build -j $(nproc) faiss swigfaiss && \ - pushd build/faiss/python && \ - python setup.py install && \ - popd && \ - popd && \ - rm -rf build-env - # Clean up RUN rm -rf /repos @@ -268,8 +253,6 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends @@ -278,8 +261,6 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf.dist-info/ -COPY --chown=1000:1000 --from=dlfw 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cudf.dist-info/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/dask_cuda.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy.dist-info/ From 89e514616ef50864b5d765263a43c5b52743833a Mon Sep 17 00:00:00 2001 From: qqiao Date: Thu, 14 Dec 2023 03:12:31 -0800 Subject: [PATCH 14/15] Change back merlin repo and dockerfile.merlin --- docker/dockerfile.merlin | 16 +++++++--------- docker/dockerfile.merlin.min | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 8f8d99b40..eb8bdbdfe 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -102,11 +102,11 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \ git+https://github.com/rapidsai/asvdb.git@main \ xgboost==1.6.2 lightgbm \ - implicit \ + lightfm implicit \ numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 -RUN pip install --no-cache-dir numpy==1.24.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda +RUN pip install --no-cache-dir numpy==1.22.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda RUN pip install --no-cache-dir 
dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com @@ -241,11 +241,11 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. -COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.3 /tmp +COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ - mv /tmp/libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ - chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \ - ln -s libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so + mv /tmp/libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ + chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ + ln -s libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so ENV PATH=/opt/tritonserver/bin:${PATH}: @@ -299,9 +299,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ -RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.24.0 -ENV LIGHTFM_NO_CFLAGS=1 -RUN pip install --no-cache-dir -v --debug lightfm +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.0 ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter ENV JUPYTER_DATA_DIR=/tmp/.jupyter diff --git a/docker/dockerfile.merlin.min 
b/docker/dockerfile.merlin.min index 3fb8a8c91..e7926747d 100644 --- a/docker/dockerfile.merlin.min +++ b/docker/dockerfile.merlin.min @@ -269,7 +269,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.24.0 ENV LIGHTFM_NO_CFLAGS=1 -RUN pip install --no-cache-dir -v --debug lightfm +RUN pip install --no-cache-dir lightfm ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter ENV JUPYTER_DATA_DIR=/tmp/.jupyter @@ -280,7 +280,7 @@ ARG MERLIN_VER=main ENV MERLIN_VER=${MERLIN_VER} # Add Merlin Repo -RUN git clone --branch emma/pin_pip_version --depth 1 https://github.com/EmmaQiaoCh/Merlin.git /Merlin && \ +RUN git clone --branch ${MERLIN_VER} --depth 1 https://github.com/NVIDIA-Merlin/Merlin/ /Merlin && \ cd /Merlin/ && pip install . --no-deps # Optional dependency: Build and install protocol buffers and Hadoop/HDFS. From 87216f7e1cadc8df928a75a7c22e9833eb7c1f37 Mon Sep 17 00:00:00 2001 From: qqiao Date: Thu, 14 Dec 2023 03:19:40 -0800 Subject: [PATCH 15/15] Remove other debug info --- docker/dockerfile.merlin | 4 ++-- docker/dockerfile.tf | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index eb8bdbdfe..d4d95c4da 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -106,7 +106,7 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 -RUN pip install --no-cache-dir numpy==1.22.0 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda +RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com @@ -299,7 
+299,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ -RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.0 +RUN pip install --no-cache-dir jupyterlab notebook pydot testbook numpy==1.22.4 ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter ENV JUPYTER_DATA_DIR=/tmp/.jupyter diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 2bc13d082..7e4418cc0 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -59,7 +59,6 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \ git clone --branch ${HUGECTR_VER} --depth 1 --recurse-submodules --shallow-submodules https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ pushd /hugectr && \ - git log -n 3 && \ rm -rf .git/modules && \ pip --no-cache-dir install ninja tf2onnx && \ # Install SOK