Merge branch 'main' into examples/sagemaker-23.06

NVIDIA-Merlin · Oct 14, 2023 · 07e3a28 · 07e3a28
2 parents 1ca4014 + 71d8f44
commit 07e3a28
Show file tree

Hide file tree

Showing 43 changed files with 8,612 additions and 1,251 deletions.
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
@@ -3,30 +3,31 @@ name: GPU CI
 on:
   workflow_dispatch:
   push:
-    branches: [ main ]
+    # NOTE(review): pull-request/<number> branches are presumably pushed by
+    # copy-pr-bot (enabled in .github/copy-pr-bot.yaml) - confirm.
+    branches:
+      - main
+      - "pull-request/[0-9]+"
     tags:
       - "v[0-9]+.[0-9]+.[0-9]+"
-  pull_request:
-    branches: [ main ]
-    types: [opened, synchronize, reopened]
 
 jobs:
   gpu-ci:
-    runs-on: 1GPU
+    runs-on: linux-amd64-gpu-p100-latest-1
+    container:
+      image: nvcr.io/nvidia/merlin/merlin-tensorflow:nightly
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+      options: --shm-size=1G
     steps:
     - uses: actions/checkout@v3
       with:
         fetch-depth: 0
     - name: Run tests
       run: |
-        ref_type=${{ github.ref_type }}
-        branch=main
-        if [[ $ref_type == "tag"* ]]
-        then
-          raw=$(git branch -r --contains ${{ github.ref_name }})
-          branch=${raw/origin\/}
-        fi
-        cd ${{ github.workspace }}; tox -e test-gpu -- $branch
+        pip install tox
+        # Nothing in this step sets $branch, so fall back to main explicitly.
+        tox -e test-gpu -- "${branch:-main}"
 
   gpu-ci-multigpu:
     runs-on: 2GPU

diff --git a/README.md b/README.md
@@ -143,7 +143,7 @@ real-world use cases.
 
 ## Merlin Is Built On
 
-**[cuDF](https://github.com/rapidsai/cudf)**<br> Merlin relies on cuDF for
+**[RAPIDS cuDF](https://github.com/rapidsai/cudf)**<br> Merlin relies on cuDF for
 GPU-accelerated DataFrame operations used in feature engineering.
 
 **[Dask](https://www.dask.org/)**<br> Merlin relies on Dask to distribute and scale

diff --git a/ci/container_integration.sh b/ci/container_integration.sh
@@ -12,7 +12,9 @@ exit_code=0
 
 ## Test Merlin
 echo "Run integration tests for Merlin"
-/Merlin/ci/test_integration.sh $container $devices || exit_code=1
+if [ "$container" != "merlin-tensorflow" ]; then
+    /Merlin/ci/test_integration.sh $container $devices || exit_code=1
+fi
 
 # Test NVTabular 
 ## Not shared storage in blossom yet, inference testing cannot be run

diff --git a/ci/dockerfile.ci b/ci/dockerfile.ci
@@ -162,11 +162,11 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
 
 # Set up NVIDIA package repository
 RUN apt update -y --fix-missing && \
-    apt install -y --no-install-recommends software-properties-common && \
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
-    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
-    add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
+    apt install -y --no-install-recommends software-properties-common
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+    mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
+    add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
     apt install -y --no-install-recommends \
         ca-certificates \
         clang-format \
@@ -268,7 +268,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/
 # ptx compiler required by cubinlinker
 COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a
 COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h
-RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop;
+RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && pip install .;
 
 ARG PYTHON_VERSION=3.10
 # Python Packages

diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr
@@ -1,6 +1,6 @@
 # syntax=docker/dockerfile:1.2
-ARG MERLIN_VERSION=22.12
-ARG TRITON_VERSION=22.11
+ARG MERLIN_VERSION=23.06
+ARG TRITON_VERSION=23.06
 
 ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
 
@@ -9,16 +9,6 @@ FROM ${BASE_IMAGE} as base
 ARG HUGECTR_VER=main
 ARG HUGECTR_BACKEND_VER=main
 
-# Envs
-ENV CUDA_SHORT_VERSION=11.6
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
-ENV CUDA_HOME=/usr/local/cuda
-ENV CUDA_PATH=$CUDA_HOME
-ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
-ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
-ENV PATH=$PATH:/usr/lib/x86_64-linux-gnu/
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-
 RUN pip install --no-cache-dir --upgrade notebook ipython 
 RUN pip install --no-cache-dir mpi4py
 
@@ -29,12 +19,11 @@ RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc20
 RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \
     mkdir -p /var/tmp && tar -x -f /var/tmp/hwloc-${HWLOC_VER}.tar.gz -C /var/tmp && \
     cd /var/tmp/hwloc-${HWLOC_VER} && \
-    ./configure CPPFLAGS="-I/usr/local/cuda/include/ -L/usr/local/cuda/lib64/" LDFLAGS="-L/usr/local/cuda/lib64" --enable-cuda && \
+    ./configure CPPFLAGS="-I${CUDA_HOME}/include/ -L${CUDA_HOME}/lib64/" LDFLAGS="-L${CUDA_HOME}/lib64" --enable-cuda && \
     make -j$(nproc) && make install && \
     rm -rf /var/tmp/hwloc-${HWLOC_VER} /var/tmp/hwloc-${HWLOC_VER}.tar.gz
 
 
-
 # -----------------------------------------------------------------------------
 #    HugeCTR + Dependencies
 
@@ -60,23 +49,20 @@ ENV HCOLL_ENABLE_MCAST=0
 # link sub modules expected by hugectr cmake
 RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
 RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
-RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so
-
-RUN rm -rf /usr/lib/x86_64-linux-gnu/libibverbs.so && \
-    ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1.14.36.0 /usr/lib/x86_64-linux-gnu/libibverbs.so
+RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
 
 # Install HugeCTR
 ARG HUGECTR_HOME=/usr/local/hugectr
 RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
-        rm -rf /usr/local/hugectr/lib/libgmock* /usr/local/hugectr/lib/pkgconfig/gmock* /usr/local/hugectr/include/gmock && \
-        rm -rf /usr/local/hugectr/lib/libgtest* /usr/local/hugectr/lib/pkgconfig/gtest* /usr/local/hugectr/include/gtest && \
+        rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
+        rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
         git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
         cd /hugectr && \
         git submodule update --init --recursive && \
         mkdir build && \
         cd build && \
         LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
-        export PATH=$PATH:/usr/local/cuda-${CUDA_SHORT_VERSION}/compat && \
+        export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
         if [[ "${INSTALL_HDFS}" == "false" ]]; then \
             cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80;90" -DENABLE_MULTINODES=ON .. \
         ; else \
@@ -91,36 +77,7 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
         mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
     ; fi
 
-
-ENV PATH=$PATH:${HUGECTR_HOME}/bin \
-    CPATH=$CPATH:${HUGECTR_HOME}/include \
-    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib \
-    PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
-
-
-ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
-ARG TRITON_VERSION
-# Install Triton inference backend.
-RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
-        git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
-        mkdir /repos/hugectr_triton_backend/build && \
-        cd /repos/hugectr_triton_backend/build && \
-        cmake \
-            -DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
-            -DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
-            -DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
-            -DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
-        make -j$(nproc) && \
-        make install && \
-        cd ../.. && \
-        rm -rf hugectr_triton_backend && \
-        chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hugectr/*.so && \
-        rm -rf /repos \
-    ; fi
-RUN ln -s ${HUGECTR_HOME}/backends/hugectr /opt/tritonserver/backends/hugectr
-
-# Remove fake lib
-RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1
+ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
 
 # Clean up
 RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked