From 95687ef285ba62efa911ce628110103d5416a832 Mon Sep 17 00:00:00 2001 From: Beatriz Navidad Vilches <61422851+Beanavil@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:40:21 +0200 Subject: [PATCH] Develop Stream 2024-03-21 general fixes (part I) (#97) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bump the required cmake version to 3.21.3 * Fix device_globals example name * Fix hip_streams timeout on AMD windows debug build type * Update templates * Update cuda container to ROCm 5.4 * Change std::bind into lambda * HIP 5.5 fixes * fix tests not being executed * Make the reference to the identity and transpose op uniform * Fix NVCC CI * Resolve "Increase timeout for CI" * Update fixed size arrays to C++ standards * Add missing include in hip_texture_management * Remove void** cast from hipMalloc * Fix hip-libraries-cuda-ubuntu Dockerfile * Make the windows builds less verbose * Rework Windows CI * Skip failing rocsparse tests * Fix cooperative groups example * ci: Make skipped examples more prominent in windows VS test runner * Enable rocsparse examples in CI * Update .gitlab/issue_templates/example.md Fix small typo --------- Co-authored-by: Balint Soproni Co-authored-by: Robin Voetter Co-authored-by: Nara Prasetya Co-authored-by: Nol Moonen Co-authored-by: Mátyás Aradi Co-authored-by: Gergely Mészáros Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com> --- .gitlab-ci.yml | 343 +++++++++--------- .gitlab/issue_templates/example.md | 20 +- .gitlab/merge_request_templates/example.md | 16 + Applications/floyd_warshall/main.hip | 4 +- CMakeLists.txt | 4 +- Common/example_utils.hpp | 14 +- .../hip-libraries-cuda-ubuntu.Dockerfile | 75 ++-- HIP-Basic/cooperative_groups/CMakeLists.txt | 5 + HIP-Basic/device_globals/CMakeLists.txt | 2 +- HIP-Basic/device_query/main.cpp | 6 +- HIP-Basic/occupancy/main.hip | 4 +- HIP-Basic/texture_management/main.hip | 5 +- .../hipBLAS/gemm_strided_batched/README.md | 35 +- 
.../hipBLAS/gemm_strided_batched/main.hip | 10 +- Libraries/hipCUB/device_radix_sort/main.hip | 8 +- Libraries/hipCUB/device_sum/main.hip | 6 +- Libraries/hipSOLVER/syevj/main.cpp | 2 +- Libraries/rocBLAS/level_3/gemm/README.md | 28 +- Libraries/rocBLAS/level_3/gemm/main.cpp | 10 +- .../level_3/gemm_strided_batched/README.md | 27 +- .../level_3/gemm_strided_batched/main.cpp | 10 +- Libraries/rocPRIM/block_sum/main.hip | 4 +- Libraries/rocPRIM/device_sum/main.hip | 6 +- Libraries/rocThrust/device_ptr/main.hip | 6 +- Libraries/rocThrust/norm/main.hip | 6 +- Libraries/rocThrust/reduce_sum/main.hip | 6 +- Libraries/rocThrust/remove_points/main.hip | 6 +- Libraries/rocThrust/saxpy/main.hip | 6 +- Libraries/rocThrust/vectors/main.hip | 6 +- Scripts/WindowsRunner.ps1 | 108 ++++++ 30 files changed, 463 insertions(+), 325 deletions(-) create mode 100644 .gitlab/merge_request_templates/example.md create mode 100644 Scripts/WindowsRunner.ps1 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 475c1f685..80d038fee 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -31,8 +31,10 @@ include: variables: CUDA_FLAGS: "-Xcompiler -Wall,-Wextra,-Werror --Werror all-warnings" - CXX_FLAGS: "-Wall -Wextra -Werror" - HIP_FLAGS: "-Wall -Wextra -Werror" + # We require '-Wno-unused-command-line-argument' due to the followiwng warning: + # argument unused during compilation: '--rtlib=compiler-rt' + CXX_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror" + HIP_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror" stages: - lint @@ -56,7 +58,7 @@ clang-format: - Scripts/CodeFormat/check_format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" .build:dockerfiles: - timeout: 20m + timeout: 60m image: name: gcr.io/kaniko-project/executor:debug entrypoint: [""] @@ -94,6 +96,10 @@ build:cuda-ubuntu-dockerfile: variables: TAG: cuda-ubuntu +######################## +# Ubuntu make # +######################## + build:make-rocm: image: $DOCKER_TAG_PREFIX:rocm-ubuntu 
stage: build @@ -116,6 +122,10 @@ build:make-cuda: script: - cd $CI_PROJECT_DIR && make CXXFLAGS="$CUDA_FLAGS" GPU_RUNTIME=CUDA -j $(nproc) +######################## +# Ubuntu cmake # +######################## + .build:cmake: stage: build extends: @@ -150,28 +160,32 @@ build:cmake-rocm: - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install build:cmake-cuda: - image: $DOCKER_TAG_PREFIX:cuda-ubuntu - extends: - - .build:cmake - tags: - - nvcc-build - script: - - cmake - -S $CI_PROJECT_DIR - -B $CI_PROJECT_DIR/build - -D GPU_RUNTIME=CUDA - -D CMAKE_CXX_FLAGS="$CXX_FLAGS" - -D CMAKE_CUDA_FLAGS="$CUDA_FLAGS" - -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake - 2>&1 | tee cmake_log.txt - # check if all dependencies were found - - |- - if grep -qi "could not find" cmake_log.txt; then - echo "Some CMake libraries could not be found" - exit 1 - fi - - cmake --build $CI_PROJECT_DIR/build - - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install + image: $DOCKER_TAG_PREFIX:cuda-ubuntu + extends: + - .build:cmake + tags: + - nvcc-build + script: + - cmake + -S $CI_PROJECT_DIR + -B $CI_PROJECT_DIR/build + -D GPU_RUNTIME=CUDA + -D CMAKE_CXX_FLAGS="$CXX_FLAGS" + -D CMAKE_CUDA_FLAGS="$CUDA_FLAGS" + -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake + 2>&1 | tee cmake_log.txt + # check if all dependencies were found + - |- + if grep -qi "could not find" cmake_log.txt; then + echo "Some CMake libraries could not be found" + exit 1 + fi + - cmake --build $CI_PROJECT_DIR/build + - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install + +######################## +# Ubuntu Tests # +######################## .test: stage: test @@ -196,122 +210,104 @@ test:cuda: needs: - build:cmake-cuda -.test:windows: - extends: - - .rules:test - stage: test - needs: [] - parallel: - matrix: - - BUILD_TYPE: [Debug, Release] +######################## +# Windows VisualStudio # +######################## -.test:rocm-windows: - extends: - - .test:windows +.test:windows-rocm: 
tags: - windows - shell - rx6900 -.test:windows-vs: - script: - # MSBuild cannot properly resolve the ` - & $MSBUILD - /maxCpuCount - "/p:Configuration=$BUILD_TYPE" - /warnAsError - /warnAsMessage:MSB3026 - $MSBUILD_EXTRA_OPTIONS - "$CI_PROJECT_DIR/$SOLUTION" +.test:windows-nvcc: + tags: + - nvcc-windows -test:rocm-windows-vs2019: +.test:windows-vs: + stage: test + timeout: 30m extends: - - .test:rocm-windows - - .test:windows-vs + - .rules:test + parallel: + matrix: + - VS_VERSION: + - 2017 + - 2019 + - 2022 + BUILD_TYPE: + - Debug + - Release variables: - MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe" - SOLUTION: "ROCm-Examples-VS2019.sln" + Timeout: 30 + Filter: "*_vs$VS_VERSION.exe" script: - - !reference [".test:windows-vs", script] - - |- - $SkippedExamples = @( - "hip_vulkan_interop_vs2019.exe" # Graphical - "hip_texture_management_vs2019.exe" # Hangs sometimes - "hip_hello_world_vs2019.exe" # Crashes (known driver issue) + - | # Find MSBuild.exe of the associated version. + $MSBUILD = ( + & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -find MSBuild\**\Bin\MSBuild.exe + | Select-String -Pattern $VS_VERSION + )[0] + Write-Output ("MSBuild: $MSBUILD" -f $MSBUILD) + - | # Fixes error MSB8036: The Windows SDK version 8.1 was not found + if ($VS_VERSION -eq 2017) { + $MSBUILD_EXTRA_OPTIONS = "/p:WindowsTargetPlatformVersion=10.0.20348.0" + } + - | # Build! 
+ & $MSBUILD @( + "/clp:Summary;ShowEventId;ShowTimestamp" + "/p:Configuration=$BUILD_TYPE" + "/p:Verbose=false" + "/maxCpuCount:8" + "/p:CL_MPCount=8" + "/verbosity:minimal" + "/validate" + "/warnAsError" + # MSBuild cannot properly resolve the ` + hip_vulkan_interop_*.exe, + hip_texture_management_*.exe, -test:rocm-windows-vs2022: +test:windows-nvcc-vs: extends: - - .test:rocm-windows + - .test:windows-nvcc - .test:windows-vs - variables: - MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe" - SOLUTION: "ROCm-Examples-VS2022.sln" - -test:rocm-windows-cmake: - extends: - - .test:rocm-windows - script: - - Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - - Enter-VsDevShell -InstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo' - - cmake - -S "$CI_PROJECT_DIR" - -B "$CI_PROJECT_DIR/build" - -G Ninja - -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" - -D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" - -D CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe" - -D CMAKE_HIP_FLAGS="-fuse-ld=lld" - -D CMAKE_CXX_FLAGS="$CXX_FLAGS" - -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" - -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}" - -D CMAKE_HIP_FLAGS="$HIP_FLAGS" - -D CMAKE_BUILD_TYPE="$BUILD_TYPE" - -D CMAKE_HIP_ARCHITECTURES=gfx1030 - -D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake" - 2>&1 | Tee-Object -filepath cmake_log.txt - - |- - if (Select-String -Path cmake_log.txt -Pattern "could not find") { - throw "Some cmake libraries are missing" - } - - cmake --build "$CI_PROJECT_DIR/build" - # CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it. - # So for now, just add the library path here. 
- - $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH - - cd "$CI_PROJECT_DIR/build" && ctest --output-on-failure --timeout 10 - - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install" - -.test:nvcc-windows: - extends: - - .test:windows tags: - nvcc-windows + variables: + SOLUTION_PREFIX: ROCm-Examples-Portable-VS + # hip_runtime_compilation: fails on VS2017 + SkippedExamples: > + hip_runtime_compilation_vs2017.exe before_script: + - | # Release builds are currently broken! + $SKIP_TESTS = ($BUILD_TYPE -eq "Release") # To test for NVIDIA, we need to set the platform toolset to HIP_nvcc. This cannot be done with /p:PlatformToolset # though, as some examples use the regular msvc toolchain. - | @@ -321,76 +317,75 @@ test:rocm-windows-cmake: Set-Content $f } -test:nvcc-windows-vs2019: +######################## +# Windows cmake # +######################## + +.test:windows-cmake: extends: - - .test:nvcc-windows - - .test:windows-vs + - .rules:test variables: - MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe" - SOLUTION: "ROCm-Examples-Portable-VS2019.sln" + VS_VERSION: 2022 + BUILD_TYPE: Release + before_script: + - | # Find VS installation + $VS_PATH = ( + & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -property InstallationPath + | Select-String -Pattern $VS_VERSION + )[0] + - | # Find DevShell.dll + $VS_DEV_SHELL = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -path "$VS_PATH" -find "**\Tools\Microsoft.VisualStudio.DevShell.dll" + - Import-Module "$VS_DEV_SHELL" + - Enter-VsDevShell -InstallPath "$VS_PATH" -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo' script: - - !reference [".test:windows-vs", script] - - |- - $SkippedExamples = @( - "hip_vulkan_interop_vs2019.exe" # Graphical - "hip_opengl_interop_vs2019.exe" # Graphical - ) - Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" | - 
ForEach-Object { - if ($SkippedExamples -NotContains $_.Name) { - echo "--" $_.Name - & "$CI_PROJECT_DIR/$BUILD_TYPE/$_" - if (!$?) { - throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE - } - } else { - echo "-- SKIPPING " $_.Name - } + - | # Ensure no libraries are missing during compilation! + if (Select-String -Path cmake_log.txt -Pattern "could not find") { + throw "Some cmake libraries are missing" } + - cmake --build "$CI_PROJECT_DIR/build" + # CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it. + # So for now, just add the library path here. + - $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH + - cd "$CI_PROJECT_DIR/build" + - ctest --output-on-failure --timeout 15 -E "rocsparse_bsrsv|rocsparse_csrsv|rocsparse_spsv|rocsparse_bsrsm|rocsparse_csrsm|rocsparse_bsric0|rocsparse_bsrilu0|rocsparse_csric0|rocsparse_csrilu0" + - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install" + needs: [] -test:nvcc-windows-vs2017: - extends: - - .test:nvcc-windows - - .test:windows-vs - variables: - MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/MSBuild/15.0/Bin/MSBuild.exe" - SOLUTION: "ROCm-Examples-Portable-VS2017.sln" - # See https://developercommunity.visualstudio.com/t/windowstargetplatformversion-makes-it-impossible-t/140294 - MSBUILD_EXTRA_OPTIONS: "/p:WindowsTargetPlatformVersion=10.0.20348.0" - -test:nvcc-windows-vs2022: +test:windows-rocm-cmake: extends: - - .test:nvcc-windows - - .test:windows-vs - variables: - MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe" - SOLUTION: "ROCm-Examples-Portable-VS2022.sln" + - .test:windows-rocm + - .test:windows-cmake + script: + - cmake + -S "$CI_PROJECT_DIR" + -B "$CI_PROJECT_DIR/build" + -G Ninja + -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" + -D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" + -D 
CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe" + -D CMAKE_CXX_FLAGS="$CXX_FLAGS" + -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}" + -D CMAKE_HIP_FLAGS="$HIP_FLAGS" + -D CMAKE_BUILD_TYPE="$BUILD_TYPE" + -D CMAKE_HIP_ARCHITECTURES=gfx1030 + -D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake" + 2>&1 | Tee-Object -filepath cmake_log.txt + - !reference [.test:windows-cmake, script] -test:nvcc-windows-cmake: +test:windows-nvcc-cmake: extends: - - .test:nvcc-windows + - .test:windows-nvcc + - .test:windows-cmake script: - # Import the VisualStudio 2022 development environment - - |- - $vs = &"C:/Program Files (x86)/Microsoft Visual Studio/Installer/vswhere.exe" -version 17.0 -property InstallationPath - Import-Module (Join-Path $vs "Common7/Tools/Microsoft.VisualStudio.DevShell.dll") - Enter-VsDevShell -VsInstallPath $vs -SkipAutomaticLocation -DevCmdArguments "/arch=x64 /host_arch=x64 /no_logo" # Note: The current version of the HIP SDK does not ship with CMake config files for Nvidia, so we can only test # the HIP-Basic and Applications examples. It is expected that some dependencies will not be found for this. 
- cmake -S "$CI_PROJECT_DIR" -B "$CI_PROJECT_DIR/build" -G Ninja - -D CMAKE_CXX_COMPILER="cl.exe" - -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D CMAKE_TOOLCHAIN_FILE="C:/Tools/Microsoft/vcpkg/scripts/buildsystems/vcpkg.cmake" + -D CMAKE_BUILD_TYPE="$BUILD_TYPE" + -D CMAKE_CXX_COMPILER="cl.exe" -D GPU_RUNTIME=CUDA 2>&1 | Tee-Object -filepath cmake_log.txt - - |- - if (Select-String -Path cmake_log.txt -Pattern "could not find") { - throw "Some cmake libraries are missing" - } - - cmake --build "$CI_PROJECT_DIR/build" - - cd "$CI_PROJECT_DIR/build" - - ctest --output-on-failure --timeout 10 - - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install" + - !reference [.test:windows-cmake, script] diff --git a/.gitlab/issue_templates/example.md b/.gitlab/issue_templates/example.md index 2aea591ed..8d32b8fab 100644 --- a/.gitlab/issue_templates/example.md +++ b/.gitlab/issue_templates/example.md @@ -1,22 +1,12 @@ # Example checklist - Elaboration - - [ ] Example concept is described and agreed on + - [ ] Example concept is described and agreed upon - Implementation - - [ ] Example is implemented - - CMake support is added - - [ ] Linux - - [ ] Windows - - [ ] GNU Make support is added (Linux) - - [ ] Visual Studio project is added (Windows) - - [ ] Project is added to the root solution - - [ ] Inline code documentation is added - - [ ] README is added according to template - - [ ] Related READMEs, ToC are updated - - [ ] Internal CI passes + - [ ] Example is implemented - Internal review - - [ ] Internal code review is done + - [ ] Internal code review is done - External review - - [ ] Upstreaming PR is opened, external code review is done + - [ ] Upstreaming PR is opened, external review is done - Done - - [ ] Example merged to upstream + - [ ] Example merged to upstream diff --git a/.gitlab/merge_request_templates/example.md b/.gitlab/merge_request_templates/example.md new file mode 100644 index 000000000..1221d3020 --- /dev/null +++ 
b/.gitlab/merge_request_templates/example.md @@ -0,0 +1,16 @@ +## Notes for the reviewer +_The reviewer should acknowledge all these topics._ + + +## Checklist before merge +- [ ] CMake support is added + - [ ] Dependencies are copied via `IMPORTED_RUNTIME_ARTIFACTS` if applicable +- [ ] GNU Make support is added (Linux) +- [ ] Visual Studio project is added for VS2017, 2019, 2022 (Windows) (use [the script](https://projects.streamhpc.com/departments/knowledge/employee-handbook/-/wikis/Projects/AMD/Libraries/examples/Adding-Visual-Studio-Projects-to-new-examples#scripts)) + - [ ] DLL dependencies are copied via ` #include #include diff --git a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile index d529c14f4..8a27fc502 100644 --- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile +++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile @@ -27,7 +27,7 @@ RUN export DEBIAN_FRONTEND=noninteractive; \ # Install HIP using the installer script RUN export DEBIAN_FRONTEND=noninteractive; \ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \ - && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \ + && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.4/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \ && apt-get update -qq \ && apt-get install -y hip-base hipify-clang \ && apt-get download hip-runtime-nvidia hip-dev \ @@ -45,64 +45,71 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \ && ldconfig # Install rocRAND -RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.3.0.tar.gz \ - && tar -xf ./rocm-5.3.0.tar.gz \ - && rm ./rocm-5.3.0.tar.gz \ - && cmake -S ./rocRAND-rocm-5.3.0 -B ./rocRAND-rocm-5.3.0/build \ +RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.4.0.tar.gz \ + && tar -xf ./rocm-5.4.0.tar.gz \ + && rm ./rocm-5.4.0.tar.gz \ + && cmake -S 
./rocRAND-rocm-5.4.0 -B ./rocRAND-rocm-5.4.0/build \ -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D BUILD_HIPRAND=OFF \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ - && cmake --build ./rocRAND-rocm-5.3.0/build --target install \ - && rm -rf ./rocRAND-rocm-5.3.0 + && cmake --build ./rocRAND-rocm-5.4.0/build --target install \ + && rm -rf ./rocRAND-rocm-5.4.0 # Install hipCUB -RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.3.0.tar.gz \ - && tar -xf ./rocm-5.3.0.tar.gz \ - && rm ./rocm-5.3.0.tar.gz \ - && cmake -S ./hipCUB-rocm-5.3.0 -B ./hipCUB-rocm-5.3.0/build \ +RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.4.0.tar.gz \ + && tar -xf ./rocm-5.4.0.tar.gz \ + && rm ./rocm-5.4.0.tar.gz \ + && cmake -S ./hipCUB-rocm-5.4.0 -B ./hipCUB-rocm-5.4.0/build \ -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ - && cmake --build ./hipCUB-rocm-5.3.0/build --target install \ - && rm -rf ./hipCUB-rocm-5.3.0 + && cmake --build ./hipCUB-rocm-5.4.0/build --target install \ + && rm -rf ./hipCUB-rocm-5.4.0 # Install hipBLAS -RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.3.0.tar.gz \ - && tar -xf ./rocm-5.3.0.tar.gz \ - && rm ./rocm-5.3.0.tar.gz \ - && cmake -S ./hipBLAS-rocm-5.3.0 -B ./hipBLAS-rocm-5.3.0/build \ +RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.4.0.tar.gz \ + && tar -xf ./rocm-5.4.0.tar.gz \ + && rm ./rocm-5.4.0.tar.gz \ + && cmake -S ./hipBLAS-rocm-5.4.0 -B ./hipBLAS-rocm-5.4.0/build \ -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ -D USE_CUDA=ON \ - && cmake --build ./hipBLAS-rocm-5.3.0/build --target install \ - && rm -rf ./hipBLAS-rocm-5.3.0 + && cmake --build ./hipBLAS-rocm-5.4.0/build --target install \ + && rm -rf ./hipBLAS-rocm-5.4.0 # Install hipSOLVER -RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.3.0.tar.gz \ - && tar -xf 
./rocm-5.3.0.tar.gz \ - && rm ./rocm-5.3.0.tar.gz \ - && cmake -S ./hipSOLVER-rocm-5.3.0 -B ./hipSOLVER-rocm-5.3.0/build \ +RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.4.0.tar.gz \ + && tar -xf ./rocm-5.4.0.tar.gz \ + && rm ./rocm-5.4.0.tar.gz \ + && cmake -S ./hipSOLVER-rocm-5.4.0 -B ./hipSOLVER-rocm-5.4.0/build \ -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ -D USE_CUDA=ON \ - && cmake --build ./hipSOLVER-rocm-5.3.0/build --target install \ - && rm -rf ./hipSOLVER-rocm-5.3.0 + && cmake --build ./hipSOLVER-rocm-5.4.0/build --target install \ + && rm -rf ./hipSOLVER-rocm-5.4.0 # Install hipRAND -RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.3.0.tar.gz \ - && tar -xf ./rocm-5.3.0.tar.gz \ - && rm ./rocm-5.3.0.tar.gz \ - && cmake -S ./hipRAND-rocm-5.3.0 -B ./hipRAND-rocm-5.3.0/build \ +RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.4.0.tar.gz \ + && tar -xf ./rocm-5.4.0.tar.gz \ + && rm ./rocm-5.4.0.tar.gz \ + && cmake -S ./hipRAND-rocm-5.4.0 -B ./hipRAND-rocm-5.4.0/build \ -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ - && cmake --build ./hipRAND-rocm-5.3.0/build --target install \ - && rm -rf ./hipRAND-rocm-5.3.0 + -D BUILD_WITH_LIB=CUDA \ + && cmake --build ./hipRAND-rocm-5.4.0/build --target install \ + && rm -rf ./hipRAND-rocm-5.4.0 # Use render group as an argument from user ARG GID=109 -# Add the render group and a user with sudo permissions for the container -RUN groupadd --system --gid ${GID} render \ - && useradd -Um -G sudo,video,render developer \ +# Add the render group or change id if already exists +RUN if [ $(getent group render) ]; then \ + groupmod --gid ${GID} render; \ + else \ + groupadd --system --gid ${GID} render; \ + fi + +# Add a user with sudo permissions for the container +RUN useradd -Um -G sudo,video,render developer \ && echo developer ALL=\(root\) NOPASSWD:ALL > 
/etc/sudoers.d/developer \ && chmod 0440 /etc/sudoers.d/developer diff --git a/HIP-Basic/cooperative_groups/CMakeLists.txt b/HIP-Basic/cooperative_groups/CMakeLists.txt index e6dac1e51..fd7bc4d62 100644 --- a/HIP-Basic/cooperative_groups/CMakeLists.txt +++ b/HIP-Basic/cooperative_groups/CMakeLists.txt @@ -54,6 +54,11 @@ add_test(${example_name} ${example_name}) set(include_dirs "../../Common") if(GPU_RUNTIME STREQUAL "CUDA") list(APPEND include_dirs "${ROCM_ROOT}/include") +else() + # Add NDEBUG for HIP version >= 5.5 and < 6.0 due to a known bug in the cooperative groups header + if( ${hip-lang_VERSION} VERSION_GREATER_EQUAL 5.5 AND ${hip-lang_VERSION} VERSION_LESS 6 ) + add_compile_definitions(NDEBUG) + endif() endif() target_include_directories(${example_name} PRIVATE ${include_dirs}) diff --git a/HIP-Basic/device_globals/CMakeLists.txt b/HIP-Basic/device_globals/CMakeLists.txt index eceb522de..c4031a9e5 100644 --- a/HIP-Basic/device_globals/CMakeLists.txt +++ b/HIP-Basic/device_globals/CMakeLists.txt @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -set(example_name device_globals) +set(example_name hip_device_globals) cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(${example_name} LANGUAGES CXX) diff --git a/HIP-Basic/device_query/main.cpp b/HIP-Basic/device_query/main.cpp index 0e5f449ea..1c9247886 100644 --- a/HIP-Basic/device_query/main.cpp +++ b/HIP-Basic/device_query/main.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include #include -#include "example_utils.hpp" - namespace { /// Number of characters in the first column. diff --git a/HIP-Basic/occupancy/main.hip b/HIP-Basic/occupancy/main.hip index 528c7ee64..eb6afefa4 100644 --- a/HIP-Basic/occupancy/main.hip +++ b/HIP-Basic/occupancy/main.hip @@ -165,8 +165,8 @@ int main() // Initialize the input data for(int i = 0; i < size; i++) { - h_A[i] = (float)i; - h_B[i] = (float)i; + h_A[i] = static_cast(i); + h_B[i] = static_cast(i); } float* d_A = nullptr; diff --git a/HIP-Basic/texture_management/main.hip b/HIP-Basic/texture_management/main.hip index e82b980c7..a0dc53181 100644 --- a/HIP-Basic/texture_management/main.hip +++ b/HIP-Basic/texture_management/main.hip @@ -24,6 +24,7 @@ #include +#include #include #include @@ -147,8 +148,8 @@ int main() HIP_CHECK(hipGetLastError()); // Copy data from device back to host. - unsigned int h_histogram[hist_bin_count]; - HIP_CHECK(hipMemcpy(h_histogram, d_histogram, hist_bytes, hipMemcpyDeviceToHost)); + std::array h_histogram; + HIP_CHECK(hipMemcpy(h_histogram.data(), d_histogram, hist_bytes, hipMemcpyDeviceToHost)); // Print out results. std::cout << "Equal-width histogram with " << hist_bin_count << " bins of values [0, " << size diff --git a/Libraries/hipBLAS/gemm_strided_batched/README.md b/Libraries/hipBLAS/gemm_strided_batched/README.md index 33b0ba81c..b7bdc8eef 100644 --- a/Libraries/hipBLAS/gemm_strided_batched/README.md +++ b/Libraries/hipBLAS/gemm_strided_batched/README.md @@ -3,20 +3,21 @@ ## Description This example illustrates the use of the hipBLAS Level 3 Strided Batched General Matrix Multiplication. 
The hipBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as: -$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$ +$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$ -for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following: -- $f(X) = X$ or -- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or -- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $). +for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $X'$ is one of the following: +- $X' = X$ or +- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or +- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $). +In this example the identity is used. $\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that -$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix. +$A_i'$ is an $m \times k$ matrix, $B_i'$ a $k \times n$ matrix and $C_i$ an $m \times n$ matrix. ### Application flow 1. Read in command-line parameters. -2. Set $f$ operation, set sizes of matrices and get batch count. +2. Set dimension variables of the matrices and get the batch count. 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix. 4. Initialize gold standard matrix. 5. Compute CPU reference result with strided batched subvectors. @@ -33,19 +34,19 @@ The application provides the following optional command line arguments: - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1. - `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1. - `-c` or `--count`. Batch count. Its default value is 3. -- `-m` or `--m`. 
The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5. -- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5. -- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5. +- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5. +- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5. +- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5. ## Key APIs and Concepts - The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], thereby most of the hipBLAS functions have a`_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDP Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extensions.
We can apply the same multiplication operator for several matrices if we combine them into batched matrices. Batched matrix multiplication has a performance improvement for a large number of small matrices. For a constant stride between matrices, further acceleration is available by strided batched GEMM. - hipBLAS is initialized by calling `hipblasCreate(hipblasHandle*)` and it is terminated by calling `hipblasDestroy(hipblasHandle)`. - The _pointer mode_ controls whether scalar parameters must be allocated on the host (`HIPBLAS_POINTER_MODE_HOST`) or on the device (`HIPBLAS_POINTER_MODE_DEVICE`). It is controlled by `hipblasSetPointerMode`. -- The $f$ operator -- defined in Description section -- can be - - `HIPBLAS_OP_N`: identity operator ($f(X) = X$), - - `HIPBLAS_OP_T`: transpose operator ($f(X) = X^T$) or - - `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($f(X) = X^H$). +- The symbol $X'$ denotes the following operations, as defined in the Description section: + - `HIPBLAS_OP_N`: identity operator ($X' = X$), + - `HIPBLAS_OP_T`: transpose operator ($X' = X^T$) or + - `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($X' = X^H$). - `hipblasStride` strides between matrices or vectors in strided_batched functions. 
- `hipblas[HSDCZ]gemmStridedBatched` @@ -60,9 +61,9 @@ We can apply the same multiplication operator for several matrices if we combine - `hipblasHandle_t handle` - `hipblasOperation_t trans_a`: transformation operator on each $A_i$ matrix - `hipblasOperation_t trans_b`: transformation operator on each $B_i$ matrix - - `int m`: number of rows in each $f(A_i)$ and $C$ matrices - - `int n`: number of columns in each $f(B_i)$ and $C$ matrices - - `int k`: number of columns in each $f(A_i)$ matrix and number of rows in each $f(B_i)$ matrix + - `int m`: number of rows in each $A_i'$ and $C$ matrices + - `int n`: number of columns in each $B_i'$ and $C$ matrices + - `int k`: number of columns in each $A_i'$ matrix and number of rows in each $B_i'$ matrix - `const float *alpha`: scalar multiplier of each $C_i$ matrix addition - `const float *A`: pointer to the each $A_i$ matrix - `int lda`: leading dimension of each $A_i$ matrix diff --git a/Libraries/hipBLAS/gemm_strided_batched/main.hip b/Libraries/hipBLAS/gemm_strided_batched/main.hip index 94d0a718b..95b2d0d53 100644 --- a/Libraries/hipBLAS/gemm_strided_batched/main.hip +++ b/Libraries/hipBLAS/gemm_strided_batched/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -42,9 +42,9 @@ int main(const int argc, const char** argv) parser.set_optional("a", "alpha", 1.f, "Alpha scalar"); parser.set_optional("b", "beta", 1.f, "Beta scalar"); parser.set_optional("c", "count", 3, "Batch count"); - parser.set_optional("m", "m", 5, "Number of rows of matrices f(A_i) and C_i"); - parser.set_optional("n", "n", 5, "Number of columns of matrices f(B_i) and C_i"); - parser.set_optional("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)"); + parser.set_optional("m", "m", 5, "Number of rows of matrices A_i and C_i"); + parser.set_optional("n", "n", 5, "Number of columns of matrices B_i and C_i"); + parser.set_optional("k", "k", 5, "Number of columns of matrix A_i and rows of B_i"); parser.run_and_exit_if_error(); // Set sizes of matrices. @@ -84,7 +84,7 @@ int main(const int argc, const char** argv) const float h_alpha = parser.get("a"); const float h_beta = parser.get("b"); - // Set GEMM operation as identity operation: $f(X) = X$ + // Set GEMM operation as identity operation: $X' = X$ const hipblasOperation_t trans_a = HIPBLAS_OP_N; const hipblasOperation_t trans_b = HIPBLAS_OP_N; diff --git a/Libraries/hipCUB/device_radix_sort/main.hip b/Libraries/hipCUB/device_radix_sort/main.hip index a67609041..b11867d46 100644 --- a/Libraries/hipCUB/device_radix_sort/main.hip +++ b/Libraries/hipCUB/device_radix_sort/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include #include @@ -27,12 +29,10 @@ #include #include -#include "example_utils.hpp" - int main() { // Allocate and initialize data on the host - const std::vector h_keys{9.3, 2.1, 7.3, 4, 2.2, 5, 3.6, 2.7, 1.1, 0}; + const std::vector h_keys{9.3f, 2.1f, 7.3f, 4.0f, 2.2f, 5.0f, 3.6f, 2.7f, 1.1f, 0.0f}; const std::vector h_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; assert(h_keys.size() == h_values.size()); const int num_elements = h_keys.size(); diff --git a/Libraries/hipCUB/device_sum/main.hip b/Libraries/hipCUB/device_sum/main.hip index d400c55b1..b1450ad0e 100644 --- a/Libraries/hipCUB/device_sum/main.hip +++ b/Libraries/hipCUB/device_sum/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "example_utils.hpp" + #include #include #include #include -#include "example_utils.hpp" - int main() { // Allocate and initialize data on the host diff --git a/Libraries/hipSOLVER/syevj/main.cpp b/Libraries/hipSOLVER/syevj/main.cpp index ccf03accb..e1afd94aa 100644 --- a/Libraries/hipSOLVER/syevj/main.cpp +++ b/Libraries/hipSOLVER/syevj/main.cpp @@ -55,7 +55,7 @@ int main(const int argc, char* argv[]) // 3. 
Generate a random symmetric matrix std::default_random_engine generator; std::uniform_real_distribution distribution(0., 2.); - auto random_number = std::bind(distribution, generator); + auto random_number = [&]() { return distribution(generator); }; for(int i = 0; i < n; i++) { diff --git a/Libraries/rocBLAS/level_3/gemm/README.md b/Libraries/rocBLAS/level_3/gemm/README.md index eeb5c2741..99e2bce0e 100644 --- a/Libraries/rocBLAS/level_3/gemm/README.md +++ b/Libraries/rocBLAS/level_3/gemm/README.md @@ -2,17 +2,19 @@ ## Description This example illustrates the use of the rocBLAS Level 3 General Matrix Multiplication. The rocBLAS GEMM performs a matrix--matrix operation as: -$C = \alpha \cdot f(A) \cdot f(B) + \beta \cdot C$, -where $f(X)$ is one of the following: -- $f(X) = X$ or -- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or -- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $), +$C = \alpha \cdot A' \cdot B' + \beta \cdot C$, +where $X'$ is one of the following: +- $X' = X$ or +- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or +- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $), +In this example the identity is used. + $\alpha and $\beta$ are scalars, and $A$, $B$ and $C$ are matrices, with -$f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times n$ matrix. +$A'$ an $m \times k$ matrix, $B'$ a $k \times n$ matrix and $C$ an $m \times n$ matrix. ### Application flow 1. Read in command-line parameters. -2. Set $f$ operation and set sizes of matrices. +2. Set dimension variables of the matrices. 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix. 4. Initialize gold standard matrix. 5. Compute CPU reference result. @@ -28,9 +30,9 @@ $f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times The application provides the following optional command line arguments: - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1. 
- `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1. -- `-m` or `--m`. The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5. -- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5. -- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5. +- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5. +- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5. +- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5. ## Key APIs and Concepts - rocBLAS is initialized by calling `rocblas_create_handle(rocblas_handle*)` and it is terminated by calling `rocblas_destroy_handle(rocblas_handle)`. 
@@ -47,9 +49,9 @@ The application provides the following optional command line arguments: - `rocblas_handle handle` - `rocblas_operation transA`: transformation operator on $A$ matrix - `rocblas_operation transB`: transformation operator on $B$ matrix - - `rocblas_int m`: number of rows in $f(A)$ and $C$ matrices - - `rocblas_int n`: number of columns in $f(B)$ and $C$ matrices - - `rocblas_int k`: number of columns in $f(A)$ matrix and number of rows in $f(B)$ matrix + - `rocblas_int m`: number of rows in $A'$ and $C$ matrices + - `rocblas_int n`: number of columns in $B'$ and $C$ matrices + - `rocblas_int k`: number of columns in $A'$ matrix and number of rows in $B'$ matrix - `const float *alpha`: scalar multiplier of $C$ matrix addition - `const float *A`: pointer to the $A$ matrix - `rocblas_int lda`: leading dimension of $A$ matrix diff --git a/Libraries/rocBLAS/level_3/gemm/main.cpp b/Libraries/rocBLAS/level_3/gemm/main.cpp index 650b07573..8190a7b87 100644 --- a/Libraries/rocBLAS/level_3/gemm/main.cpp +++ b/Libraries/rocBLAS/level_3/gemm/main.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -40,9 +40,9 @@ int main(const int argc, const char** argv) cli::Parser parser(argc, argv); parser.set_optional("a", "alpha", 1.f, "Alpha scalar"); parser.set_optional("b", "beta", 1.f, "Beta scalar"); - parser.set_optional("m", "m", 5, "Number of rows of matrices f(A) and C"); - parser.set_optional("n", "n", 5, "Number of columns of matrices f(B) and C"); - parser.set_optional("k", "k", 5, "Number of columns of matrix f(A) and rows of f(B)"); + parser.set_optional("m", "m", 5, "Number of rows of matrices A and C"); + parser.set_optional("n", "n", 5, "Number of columns of matrices B and C"); + parser.set_optional("k", "k", 5, "Number of columns of matrix A and rows of B"); parser.run_and_exit_if_error(); // Set sizes of matrices. @@ -73,7 +73,7 @@ int main(const int argc, const char** argv) const rocblas_float h_alpha = parser.get("a"); const rocblas_float h_beta = parser.get("b"); - // Set GEMM operation as identity operation: $f(X) = X$ + // Set GEMM operation as identity operation: $X' = X$ const rocblas_operation trans_a = rocblas_operation_none; const rocblas_operation trans_b = rocblas_operation_none; diff --git a/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md b/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md index 057e0e599..f025c7490 100644 --- a/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md +++ b/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md @@ -3,20 +3,21 @@ ## Description This example illustrates the use of the rocBLAS Level 3 Strided Batched General Matrix Multiplication. 
The rocBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as: -$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$ +$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$ -for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following: -- $f(X) = X$ or -- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or -- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $). +for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $X'$ is one of the following: +- $X' = X$ or +- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or +- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $). +In this example the identity is used. $\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that -$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix. +$A_i'$ is an $m \times k$ matrix, $B_i'$ a $k \times n$ matrix and $C_i$ an $m \times n$ matrix. ### Application flow 1. Read in command-line parameters. -2. Set $f$ operation, set sizes of matrices and get batch count. +2. Set dimension variables of the matrices and get batch count and stride. 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix. 4. Initialize gold standard matrix. 5. Compute CPU reference result with strided batched subvectors. @@ -33,9 +34,9 @@ The application provides the following optional command line arguments: - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1. - `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1. - `-c` or `--count`. Batch count. Its default value is 3. -- `-m` or `--m`. 
The number of rows of matrices $f(A_i)$ and $C_i$, which must be greater than 0. Its default value is 5. -- `-n` or `--n`. The number of columns of matrices $f(B_i)$ and $C_i$, which must be greater than 0. Its default value is 5. -- `-k` or `--k`. The number of columns of columns of matrix f(A_i) and rows of f(B_i) +- `-m` or `--m`. The number of rows of matrices $A_i$ and $C_i$, which must be greater than 0. Its default value is 5. +- `-n` or `--n`. The number of columns of matrices $B_i$ and $C_i$, which must be greater than 0. Its default value is 5. +- `-k` or `--k`. The number of columns of matrix $A_i$ and rows of matrix $B_i$, which must be greater than 0. Its default value is 5. ## Key APIs and Concepts - The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], thereby most of the rocBLAS functions have a`_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDP Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extensions.
@@ -57,9 +58,9 @@ We can apply the same multiplication operator for several matrices if we combine - `rocblas_handle handle` - `rocblas_operation transA`: transformation operator on $A_i$ matrix - `rocblas_operation transB`: transformation operator on $B_i$ matrix - - `rocblas_int m`: number of rows in $f(A_i)$ and $C_i$ matrices - - `rocblas_int n`: number of columns in $f(B_i)$ and $C_i$ matrices - - `rocblas_int k`: number of columns in $f(A_i)$ matrix and number of rows in $f(B_i)$ matrix + - `rocblas_int m`: number of rows in $A_i'$ and $C_i$ matrices + - `rocblas_int n`: number of columns in $B_i'$ and $C_i$ matrices + - `rocblas_int k`: number of columns in $A_i'$ matrix and number of rows in $B_i'$ matrix - `const float *alpha`: scalar multiplier of $C_i$ matrix addition - `const float *A`: pointer to each $A_i$ matrix - `rocblas_int lda`: leading dimension of each $A_i$ matrix diff --git a/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp b/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp index 32750eac2..471df167b 100644 --- a/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp +++ b/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -42,9 +42,9 @@ int main(const int argc, const char** argv) parser.set_optional("a", "alpha", 1.f, "Alpha scalar"); parser.set_optional("b", "beta", 1.f, "Beta scalar"); parser.set_optional("c", "count", 3, "Batch count"); - parser.set_optional("m", "m", 5, "Number of rows of matrices f(A_i) and C_i"); - parser.set_optional("n", "n", 5, "Number of columns of matrices f(B_i) and C_i"); - parser.set_optional("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)"); + parser.set_optional("m", "m", 5, "Number of rows of matrices A_i and C_i"); + parser.set_optional("n", "n", 5, "Number of columns of matrices B_i and C_i"); + parser.set_optional("k", "k", 5, "Number of columns of matrix A_i and rows of B_i"); parser.run_and_exit_if_error(); // Set sizes of matrices. @@ -84,7 +84,7 @@ int main(const int argc, const char** argv) const rocblas_float h_alpha = parser.get("a"); const rocblas_float h_beta = parser.get("b"); - // Set GEMM operation as identity operation: $f(X) = X$. + // Set GEMM operation as identity operation: $X' = X$. const rocblas_operation trans_a = rocblas_operation_none; const rocblas_operation trans_b = rocblas_operation_none; diff --git a/Libraries/rocPRIM/block_sum/main.hip b/Libraries/rocPRIM/block_sum/main.hip index deb70fd81..01e5d6d43 100644 --- a/Libraries/rocPRIM/block_sum/main.hip +++ b/Libraries/rocPRIM/block_sum/main.hip @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include @@ -28,8 +30,6 @@ #include #include -#include "example_utils.hpp" - /// \brief Compute the sum of an array on the host CPU std::vector reduce_sum_host(const std::vector& data, const unsigned int run_size, diff --git a/Libraries/rocPRIM/device_sum/main.hip b/Libraries/rocPRIM/device_sum/main.hip index 0d5851766..e67824a05 100644 --- a/Libraries/rocPRIM/device_sum/main.hip +++ b/Libraries/rocPRIM/device_sum/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "example_utils.hpp" + #include #include #include #include -#include "example_utils.hpp" - int main() { // Allocate and initialize data on the host diff --git a/Libraries/rocThrust/device_ptr/main.hip b/Libraries/rocThrust/device_ptr/main.hip index f799098ec..9cb90a314 100644 --- a/Libraries/rocThrust/device_ptr/main.hip +++ b/Libraries/rocThrust/device_ptr/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include #include @@ -33,8 +35,6 @@ #include #include -#include "example_utils.hpp" - int main() { // Allocate memory buffer to store 10 integers on the device diff --git a/Libraries/rocThrust/norm/main.hip b/Libraries/rocThrust/norm/main.hip index 2b2862740..9566618fa 100644 --- a/Libraries/rocThrust/norm/main.hip +++ b/Libraries/rocThrust/norm/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "example_utils.hpp" + #include #include #include @@ -29,8 +31,6 @@ #include #include -#include "example_utils.hpp" - // An anonymous namespace sets static linkage to its contents. // This means that the contained function definitions will only be visible // in the current compilation unit (i.e. cpp source file). diff --git a/Libraries/rocThrust/reduce_sum/main.hip b/Libraries/rocThrust/reduce_sum/main.hip index 984ff8f0b..38ec13353 100644 --- a/Libraries/rocThrust/reduce_sum/main.hip +++ b/Libraries/rocThrust/reduce_sum/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include @@ -27,8 +29,6 @@ #include #include -#include "example_utils.hpp" - int main() { // create a host vector with 4 elements diff --git a/Libraries/rocThrust/remove_points/main.hip b/Libraries/rocThrust/remove_points/main.hip index 24da8363c..70812169b 100644 --- a/Libraries/rocThrust/remove_points/main.hip +++ b/Libraries/rocThrust/remove_points/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "example_utils.hpp" + #include #include #include #include -#include "example_utils.hpp" - // An anonymous namespace sets static linkage to its contents. // This means that the contained function definitions will only be visible // in the current compilation unit (i.e. cpp source file). diff --git a/Libraries/rocThrust/saxpy/main.hip b/Libraries/rocThrust/saxpy/main.hip index 77478dda3..a82afd161 100644 --- a/Libraries/rocThrust/saxpy/main.hip +++ b/Libraries/rocThrust/saxpy/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include "example_utils.hpp" + #include #include #include #include #include -#include "example_utils.hpp" - // This example illustrates how to implement the SAXPY operation // (Y[i] = a * X[i] + Y[i]) using rocThrust. diff --git a/Libraries/rocThrust/vectors/main.hip b/Libraries/rocThrust/vectors/main.hip index 74a46cc54..42c3c936c 100644 --- a/Libraries/rocThrust/vectors/main.hip +++ b/Libraries/rocThrust/vectors/main.hip @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "example_utils.hpp" + #include #include #include -#include "example_utils.hpp" - int main() { // Allocate a resizable vector in host memory.
diff --git a/Scripts/WindowsRunner.ps1 b/Scripts/WindowsRunner.ps1
new file mode 100644
index 000000000..ed96efa4c
--- /dev/null
+++ b/Scripts/WindowsRunner.ps1
@@ -0,0 +1,128 @@
+# Runs every executable matching -Filter below -Path and prints a result table.
+#
+# Each example runs in a background job bounded by -Timeout; examples whose file
+# name matches one of the -Skip wildcards are reported as skipped. The script
+# throws at the end if any example did not exit with code 0.
+param(
+    # NOTE(review): Mandatory parameters ignore their default, so "Debug" looks unused.
+    [Parameter(Mandatory)]
+    [string]$Path = "Debug",
+    [string]$Filter = "*.exe",
+    # Passed to Wait-Job -Timeout, i.e. seconds per example.
+    [int]$Timeout = 10,
+    # Wildcard patterns compared against the file name with -like.
+    [string[]]$Skip = @()
+)
+$Skip = $Skip | ForEach-Object { $_.Trim() }
+
+Write-Host "Testing all '$Filter' in '$Path' with a timeout of $Timeout"
+Write-Host "Skipping examples that match any of:"
+foreach($item in $Skip) {
+    Write-Host "- $item"
+}
+
+$FailureCount = 0
+$Results = @()
+
+<#
+.SYNOPSIS
+    Runs one example in a background job and classifies the outcome.
+.PARAMETER FileInfo
+    The executable to launch.
+.OUTPUTS
+    PSCustomObject with Name, State, ExitStatus and Time (one summary table row).
+    Any outcome other than exit code 0 increments the script-level $FailureCount.
+#>
+function Run-Example {
+    param(
+        [System.IO.FileInfo]$FileInfo
+    )
+
+    $Job = Start-Job -ScriptBlock {
+        param([string]$FullName)
+        $Time = Measure-Command {
+            try {
+                $Log = & $FullName
+                $JobExitStatus = $LASTEXITCODE
+            } catch {
+                $JobExitStatus = "CRASH!"
+            }
+        }
+        return [PSCustomObject]@{
+            ExitStatus = $JobExitStatus
+            Log = $Log
+            Time = $Time
+        }
+    } -ArgumentList $FileInfo.FullName
+
+    # Execute the job with a timeout
+    $Job | Wait-Job -TimeOut $Timeout | Out-Null
+
+    # Get the results from the job!
+    $Result = Receive-Job $Job
+    Write-Host $Result.Log
+
+    if ($null -ne $Result.ExitStatus) {
+        $TimeSpan = $Result.Time.toString("mm\:ss\.fff")
+        $ExitStatus = $Result.ExitStatus
+    } else {
+        # No exit status was received; this is reported as the job not finishing in time.
+        $ExitStatus = "Timeout!"
+        $TimeSpan = $null
+    }
+
+    if ($Result.ExitStatus -eq 0) {
+        # Exited gracefully!
+        $Status = "`e[32mPass`e[0m"
+        $ExitDisplay = "`e[32m$ExitStatus`e[0m"
+    } else {
+        $ExitDisplay = "`e[31m$ExitStatus`e[0m"
+
+        # Otherwise, fail!
+        $Status = "`e[31m`e[1mFail`e[0m"
+        # A plain "$FailureCount += 1" would only create a function-local copy
+        # and leave the script-level counter checked at the end at 0.
+        $script:FailureCount += 1
+    }
+
+    # Clean up!
+    Remove-Job -force $Job
+
+    [PSCustomObject]@{
+        Name = $FileInfo.Name
+        State = $Status
+        ExitStatus = $ExitDisplay
+        Time = $TimeSpan
+    }
+}
+
+Get-ChildItem -Recurse -File -Path $Path -Filter $Filter | ForEach-Object {
+    Write-Host ("`e[36m-- {0}`e[0m" -f $_.Name)
+
+    $ShouldSkip = $false
+    foreach($F in $Skip) {
+        if ($_.Name -like $F) {
+            Write-Host "`e[33m`e[1mSkipped by wildcard:`e[0m $F"
+            $ShouldSkip = $true
+            break
+        }
+    }
+
+    # Put into a hash table and append to a list for table magic!
+    if (-not $ShouldSkip) {
+        $Results += Run-Example $_
+    } else {
+        $Results += [PSCustomObject]@{
+            Name = $_.Name
+            State = "`e[33m`e[1mSkip`e[0m"
+            ExitStatus = $null
+            Time = $null
+        }
+    }
+}
+
+$Results | Format-Table
+
+if ($FailureCount -gt 0) {
+    throw "$FailureCount failed jobs!"
+}