Skip to content

Commit

Permalink
Merge branch 'main' into examples/sagemaker-23.06
Browse files Browse the repository at this point in the history
  • Loading branch information
edknv authored Oct 14, 2023
2 parents 1ca4014 + 71d8f44 commit 07e3a28
Show file tree
Hide file tree
Showing 43 changed files with 8,612 additions and 1,251 deletions.
4 changes: 4 additions & 0 deletions .github/copy-pr-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
4 changes: 0 additions & 4 deletions .github/ops-bot.yaml

This file was deleted.

24 changes: 11 additions & 13 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,28 @@ name: GPU CI
on:
workflow_dispatch:
push:
branches: [ main ]
branches:
- main
- "pull-request/[0-9]+"
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]

jobs:
gpu-ci:
runs-on: 1GPU
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvidia/merlin/merlin-tensorflow:nightly
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Run tests
run: |
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
raw=$(git branch -r --contains ${{ github.ref_name }})
branch=${raw/origin\/}
fi
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
pip install tox
tox -e test-gpu -- $branch
gpu-ci-multigpu:
runs-on: 2GPU
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ real-world use cases.

## Merlin Is Built On

**[cuDF](https://github.com/rapidsai/cudf)**<br> Merlin relies on cuDF for
**[RAPIDS cuDF](https://github.com/rapidsai/cudf)**<br> Merlin relies on cuDF for
GPU-accelerated DataFrame operations used in feature engineering.

**[Dask](https://www.dask.org/)**<br> Merlin relies on Dask to distribute and scale
Expand Down
4 changes: 3 additions & 1 deletion ci/container_integration.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ exit_code=0

## Test Merlin
echo "Run integration tests for Merlin"
/Merlin/ci/test_integration.sh $container $devices || exit_code=1
if [ "$container" != "merlin-tensorflow" ]; then
/Merlin/ci/test_integration.sh $container $devices || exit_code=1
fi

# Test NVTabular
## Not shared storage in blossom yet, inference testing cannot be run
Expand Down
12 changes: 6 additions & 6 deletions ci/dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin

# Set up NVIDIA package repository
RUN apt update -y --fix-missing && \
apt install -y --no-install-recommends software-properties-common && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
apt install -y --no-install-recommends software-properties-common
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
apt install -y --no-install-recommends \
ca-certificates \
clang-format \
Expand Down Expand Up @@ -268,7 +268,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/
# ptx compiler required by cubinlinker
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h
RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop;
RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && pip install .;

ARG PYTHON_VERSION=3.10
# Python Packages
Expand Down
59 changes: 8 additions & 51 deletions docker/dockerfile.ctr
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1.2
ARG MERLIN_VERSION=22.12
ARG TRITON_VERSION=22.11
ARG MERLIN_VERSION=23.06
ARG TRITON_VERSION=23.06

ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}

Expand All @@ -9,16 +9,6 @@ FROM ${BASE_IMAGE} as base
ARG HUGECTR_VER=main
ARG HUGECTR_BACKEND_VER=main

# Envs
ENV CUDA_SHORT_VERSION=11.6
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_PATH=$CUDA_HOME
ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
ENV PATH=$PATH:/usr/lib/x86_64-linux-gnu/
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1

RUN pip install --no-cache-dir --upgrade notebook ipython
RUN pip install --no-cache-dir mpi4py

Expand All @@ -29,12 +19,11 @@ RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc20
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \
mkdir -p /var/tmp && tar -x -f /var/tmp/hwloc-${HWLOC_VER}.tar.gz -C /var/tmp && \
cd /var/tmp/hwloc-${HWLOC_VER} && \
./configure CPPFLAGS="-I/usr/local/cuda/include/ -L/usr/local/cuda/lib64/" LDFLAGS="-L/usr/local/cuda/lib64" --enable-cuda && \
./configure CPPFLAGS="-I${CUDA_HOME}/include/ -L${CUDA_HOME}/lib64/" LDFLAGS="-L${CUDA_HOME}/lib64" --enable-cuda && \
make -j$(nproc) && make install && \
rm -rf /var/tmp/hwloc-${HWLOC_VER} /var/tmp/hwloc-${HWLOC_VER}.tar.gz



# -----------------------------------------------------------------------------
# HugeCTR + Dependencies

Expand All @@ -60,23 +49,20 @@ ENV HCOLL_ENABLE_MCAST=0
# link sub modules expected by hugectr cmake
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so

RUN rm -rf /usr/lib/x86_64-linux-gnu/libibverbs.so && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1.14.36.0 /usr/lib/x86_64-linux-gnu/libibverbs.so
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')

# Install HugeCTR
ARG HUGECTR_HOME=/usr/local/hugectr
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
rm -rf /usr/local/hugectr/lib/libgmock* /usr/local/hugectr/lib/pkgconfig/gmock* /usr/local/hugectr/include/gmock && \
rm -rf /usr/local/hugectr/lib/libgtest* /usr/local/hugectr/lib/pkgconfig/gtest* /usr/local/hugectr/include/gtest && \
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
cd /hugectr && \
git submodule update --init --recursive && \
mkdir build && \
cd build && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
export PATH=$PATH:/usr/local/cuda-${CUDA_SHORT_VERSION}/compat && \
export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80;90" -DENABLE_MULTINODES=ON .. \
; else \
Expand All @@ -91,36 +77,7 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
; fi


ENV PATH=$PATH:${HUGECTR_HOME}/bin \
CPATH=$CPATH:${HUGECTR_HOME}/include \
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib \
PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib


ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
ARG TRITON_VERSION
# Install Triton inference backend.
RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
mkdir /repos/hugectr_triton_backend/build && \
cd /repos/hugectr_triton_backend/build && \
cmake \
-DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
-DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
make -j$(nproc) && \
make install && \
cd ../.. && \
rm -rf hugectr_triton_backend && \
chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hugectr/*.so && \
rm -rf /repos \
; fi
RUN ln -s ${HUGECTR_HOME}/backends/hugectr /opt/tritonserver/backends/hugectr

# Remove fake lib
RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1
ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib

# Clean up
RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked
Expand Down
Loading

0 comments on commit 07e3a28

Please sign in to comment.