Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HugeCTR]Add a new base for hugectr #1098

Merged
merged 16 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions ci/container_hugectr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

container=$1
devices=$2

echo "##############"
echo "# Unit tests #"
echo "##############"

exit_code=0

## Test HugeCTR
if [ "$container" == "merlin-hugectr" ]; then
echo "Run unit tests for HugeCTR"
/hugectr/ci/test_unit.sh $container $devices || exit_code=1
echo "Run unit tests for merlin-sok"
/hugectr/ci/test_unit.sh "merlin-tensorflow" $devices || exit_code=1
fi

exit $exit_code
9 changes: 7 additions & 2 deletions ci/test_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [ $container != 'merlin-ci-runner' ]; then
fi

${ci_script_dir}container_software.sh $container $devices
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
${ci_script_dir}container_unit.sh $container $devices

if [ $container == 'merlin-hugectr' ]; then
${ci_script_dir}container_hugectr.sh $container $devices
elif [ $container != 'ctr-base' ]; then
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
${ci_script_dir}container_unit.sh $container $devices
fi

108 changes: 96 additions & 12 deletions docker/dockerfile.ctr
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# syntax=docker/dockerfile:1.2
ARG MERLIN_VERSION=23.06
ARG TRITON_VERSION=23.06
ARG MERLIN_VERSION=24.06
ARG TRITON_VERSION=24.03

ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/ctr-base:${MERLIN_VERSION}

FROM ${BASE_IMAGE} as base

ARG HUGECTR_VER=main
ARG HUGECTR_BACKEND_VER=main

RUN pip install --no-cache-dir --upgrade notebook ipython
RUN pip install --no-cache-dir mpi4py
RUN pip install --no-cache-dir --upgrade notebook ipython mpi4py

# Install CUDA-Aware hwloc
ARG HWLOC_VER=2.4.1
Expand Down Expand Up @@ -45,22 +44,86 @@ ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
ENV SHARP_COLL_LOG_LEVEL=3
ENV HCOLL_ENABLE_MCAST=0
ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
SOK_COMPILE_UNIT_TEST=ON

# link sub modules expected by hugectr cmake
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')

# Install HugeCTR
# Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
ARG INSTALL_HDFS=false
# Env for HDFS
ENV HADOOP_HOME=/opt/hadoop
ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \
HDFS_NAMENODE_USER=root \
HDFS_SECONDARYNAMENODE_USER=root \
HDFS_DATANODE_USER=root \
YARN_RESOURCEMANAGER_USER=root \
YARN_NODEMANAGER_USER=root \
# Tackles with ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057
LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \
# Tackles with JVM setting error signals that the UCX library checks (GitLab issue #425).
UCX_ERROR_SIGNALS='' \
CLASSPATH=${CLASSPATH}:\
${HADOOP_HOME}/etc/hadoop/*:\
${HADOOP_HOME}/share/hadoop/common/*:\
${HADOOP_HOME}/share/hadoop/common/lib/*:\
${HADOOP_HOME}/share/hadoop/hdfs/*:\
${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\
${HADOOP_HOME}/share/hadoop/mapreduce/*:\
${HADOOP_HOME}/share/hadoop/yarn/*:\
${HADOOP_HOME}/share/hadoop/yarn/lib/*

# Install Inference and HPS Backend
ARG HUGECTR_DEV_MODE=false
ARG HUGECTR_VER=main
ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
ARG HUGECTR_BACKEND_VER=main
ARG _CI_JOB_TOKEN=""
ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
ARG HUGECTR_HOME=/usr/local/hugectr
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
ARG TRITON_VERSION

ENV PATH=$PATH:${HUGECTR_HOME}/bin \
CPATH=$CPATH:${HUGECTR_HOME}/include \
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib

RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
# Install HugeCTR inference which is dependency for hps_backend
git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
cd /hugectr && \
git submodule update --init --recursive && \
mkdir build && \
cd build && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \
; else \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. \
; fi && \
make -j$(nproc) && \
make install && \
rm -rf ./* && \
# Install hps_backend
git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
mkdir /repos/hugectr_triton_backend/hps_backend/build && \
cd /repos/hugectr_triton_backend/hps_backend/build && \
cmake \
-DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
-DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
make -j$(nproc) && \
make install && \
chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so && \
cd ../../.. && \
rm -rf hugectr_triton_backend && \
# Remove the incompatible gmock and gtest installed by hps_backend
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
# Install HugeCTR multinode
cd /hugectr/build && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
Expand All @@ -70,13 +133,34 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
; fi && \
make -j$(nproc) && \
make install && \
rm -rf ./* && \
chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
cd ../onnx_converter && \
# Install HPS trt pugin
cd ../hps_trt && \
mkdir build && \
cd build && \
cmake -DSM="70;75;80;90" .. && \
make -j$(nproc) && \
make install && \
cd ../../onnx_converter && \
python setup.py install && \
mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
pip --no-cache-dir install ninja tf2onnx && \
# Install SOK
cd ../sparse_operation_kit && \
python setup.py install && \
# Install HPS TF plugin
cd ../hps_tf && \
python setup.py install && \
# Install hps_torch
cd ../hps_torch/ && \
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 9.0" python setup.py install && \
mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
rm -rf /hugectr && mkdir -p /hugectr /hugectr/sparse_operation_kit && \
mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit/sparse_operation_kit && \
chmod +x /hugectr/ci/* /hugectr/sparse_operation_kit/sparse_operation_kit/* \
; fi

RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps

ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib

# Clean up
Expand Down
Loading
Loading