HIP "Basic" Example Suite (part 4) (#13)

* add cmake build instructions for windows * Add bit extract example * Resolve "Multi GPU data transfer example" * Resolve "Vulkan interop example" * fix bit extract typo * Resolve "OpenGL interop example" * Add module API example * enable -Wall -Wextra -Werror in cmake in ci * Resolve "CMake don't throw error when building examples and certain libraries are missing." * Device Globals example * add moving average example * Resolve "Static library example" * fix missing opengl cmake check * Resolve "Inline assembly / GPU arch example" * revert to old msvc project file structure * add hip basic texture management example * normalize line endings * remove hiprtc from bitextract * Resolve "Cooperative groups example" * Fix GUIDs * Resolve "Floyd-Warshall example" Co-authored-by: Nol Moonen <[email protected]> Co-authored-by: Beatriz Navidad Vilches <[email protected]> Co-authored-by: Robin Voetter <[email protected]> Co-authored-by: Vince van Heertum <[email protected]>
ROCm · Nov 23, 2022 · acdf61b · acdf61b
1 parent 58c3a8e
commit acdf61b
Show file tree

Hide file tree

Showing 242 changed files with 19,021 additions and 1,983 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,4 @@
 *.hip gitlab-language=cuda linguist-language=Cuda
+*.sln text eol=crlf
+*.vcxproj text eol=crlf
+*.vcxproj.filters text eol=crlf
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -29,6 +29,12 @@ include:
       - /gpus-nvcc.yaml
       - /rules.yaml
 
+variables:
+  # suppressing 186 allows us to write `assert(a && "message")`.
+  CUDA_FLAGS: "-Xcompiler -Wall,-Wextra,-Werror --Werror all-warnings --diag-suppress 186"
+  CXX_FLAGS: "-Wall -Wextra -Werror"
+  HIP_FLAGS: "-Wall -Wextra -Werror"
+
 stages:
   - lint
   - build
@@ -98,7 +104,7 @@ build:make-rocm:
     - rocm-build
   needs: []
   script:
-    - cd $CI_PROJECT_DIR && make -j $(nproc)
+    - cd $CI_PROJECT_DIR && make CXXFLAGS="$HIP_FLAGS" -j $(nproc)
 
 build:make-cuda:
   image: $DOCKER_TAG_PREFIX:cuda-ubuntu
@@ -109,7 +115,7 @@ build:make-cuda:
     - nvcc-build
   needs: []
   script:
-    - cd $CI_PROJECT_DIR && make GPU_RUNTIME=CUDA -j $(nproc)
+    - cd $CI_PROJECT_DIR && make CXXFLAGS="$CUDA_FLAGS" GPU_RUNTIME=CUDA -j $(nproc)
 
 .build:cmake:
   stage: build
@@ -132,6 +138,15 @@ build:cmake-rocm:
       -S $CI_PROJECT_DIR
       -B $CI_PROJECT_DIR/build
       -D CMAKE_HIP_ARCHITECTURES="$GPU_TARGETS"
+      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
+      -D CMAKE_HIP_FLAGS="$HIP_FLAGS"
+      | tee cmake_log.txt
+    # check if all dependencies were found
+    - |-
+      if grep -q "Could NOT find" cmake_log.txt; then
+          echo "Some CMake libraries could not be found"
+          exit 1
+      fi
     - cmake --build $CI_PROJECT_DIR/build
 
 build:cmake-cuda:
@@ -145,6 +160,15 @@ build:cmake-cuda:
       -S $CI_PROJECT_DIR
       -B $CI_PROJECT_DIR/build
       -D GPU_RUNTIME=CUDA
+      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
+      -D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
+      | tee cmake_log.txt
+    # check if all dependencies were found
+    - |-
+      if grep -q "Could NOT find" cmake_log.txt; then
+          echo "Some CMake libraries could not be found"
+          exit 1
+      fi
     - cmake --build $CI_PROJECT_DIR/build
 
 .test:
@@ -190,16 +214,19 @@ test:rocm-windows-vs2019:
     - >
       & "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
       /maxCpuCount
+      /warnAsError
       "/p:Configuration=$BUILD_TYPE"
       "$CI_PROJECT_DIR"
     - |-
       Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
       ForEach-Object {
-        echo "--" $_.Name
-        & "$_"
-        if (!$?) {
-          throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
-        }
+          if (("hip_vulkan_interop_vs2019.exe","hip_opengl_interop_vs2019.exe") -NotContains $_.Name) {
+              echo "--" $_.Name
+              & "$_"
+              if (!$?) {
+                throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
+              }
+          }
       }
 
 test:rocm-windows-cmake:
@@ -210,8 +237,16 @@ test:rocm-windows-cmake:
       -S "$CI_PROJECT_DIR"
       -B "$CI_PROJECT_DIR/build"
       -G Ninja
+      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
+      -D CMAKE_HIP_FLAGS="$HIP_FLAGS"
       -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
       -D CMAKE_HIP_ARCHITECTURES=gfx1030
       -D CMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/10/bin/10.0.19041.0/x64/rc.exe"
+      -D CMAKE_TOOLCHAIN_FILE="C:/Tools/Microsoft/vcpkg/scripts/buildsystems/vcpkg.cmake"
+      | Tee-Object -filepath cmake_log.txt
+    - |-
+      if (Select-String -Path cmake_log.txt -Pattern "Could NOT find") {
+          throw "Some cmake libraries are missing"
+      }
     - cmake --build "$CI_PROJECT_DIR/build"
     - cd "$CI_PROJECT_DIR/build" && ctest --output-on-failure
diff --git a/Applications/CMakeLists.txt b/Applications/CMakeLists.txt
@@ -0,0 +1,26 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(Applications LANGUAGES CXX)
+
+# Every application example lives in its own subdirectory with its own CMakeLists.txt.
+add_subdirectory(floyd_warshall)
diff --git a/Applications/Makefile b/Applications/Makefile
@@ -0,0 +1,34 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Sub-directories that each hold one example together with its own Makefile.
+EXAMPLES := floyd_warshall
+
+# None of the names below is a file produced by this Makefile.
+.PHONY: all clean $(EXAMPLES)
+
+# Default goal: build every example.
+all: $(EXAMPLES)
+
+# Hand the work over to the example's own Makefile. TARGET is empty for a
+# regular build, so the sub-make builds its default goal.
+$(EXAMPLES):
+	$(MAKE) -C $@ $(TARGET)
+
+# `clean` walks the same prerequisites as `all`; the target-specific TARGET is
+# inherited by those prerequisites, so every sub-make runs its `clean` goal.
+clean: TARGET := clean
+clean: all
diff --git a/Applications/README.md b/Applications/README.md
@@ -0,0 +1,43 @@
+# Applications Examples
+
+## Summary
+The examples in this subdirectory showcase several GPU-implementations of finance, computer science, physics, etc. models or algorithms that additionally offer a command line application. The examples are built on Linux for the ROCm (AMD GPU) backend. Some examples additionally support the CUDA (NVIDIA GPU) backend.
+
+## Prerequisites
+### Linux
+- [CMake](https://cmake.org/download/) (at least version 3.21)
+- OR GNU Make - available via the distribution's package manager
+- [ROCm](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.1.3/page/Overview_of_ROCm_Installation_Methods.html) (at least version 5.x.x)
+
+### Windows
+- [Visual Studio](https://visualstudio.microsoft.com/) 2019 or 2022 with the "Desktop Development with C++" workload
+- ROCm toolchain for Windows (No public release yet)
+    - The Visual Studio ROCm extension needs to be installed to build with the solution files.
+- [CMake](https://cmake.org/download/) (optional, to build with CMake. Requires at least version 3.21)
+- [Ninja](https://ninja-build.org/) (optional, to build with CMake)
+
+## Building
+### Linux
+Make sure that the dependencies are installed, or use one of the [provided Dockerfiles](../Dockerfiles/) to build and run the examples in a containerized environment.
+
+#### Using CMake
+All examples in the `Applications` subdirectory can either be built by a single CMake project or be built independently.
+
+- `$ cd Applications`
+- `$ cmake -S . -B build` (on ROCm) or `$ cmake -S . -B build -D GPU_RUNTIME=CUDA` (on CUDA, when supported)
+- `$ cmake --build build`
+
+#### Using Make
+All examples can be built by a single invocation to Make or be built independently.
+
+- `$ cd Applications`
+- `$ make` (on ROCm) or `$ make GPU_RUNTIME=CUDA` (on CUDA, when supported)
+
+### Windows
+#### Visual Studio
+Visual Studio solution files are available for the individual examples. To build all supported HIP runtime examples open the top level solution file [ROCm-Examples-VS2019.sln](../ROCm-Examples-VS2019.sln) and filter for Applications.
+
+For more detailed build instructions refer to the top level [README.md](../README.md#visual-studio).
+
+#### CMake
+All examples in the `Applications` subdirectory can either be built by a single CMake project or be built independently. For build instructions refer to the top-level [README.md](../README.md#cmake-2).
diff --git a/Applications/floyd_warshall/.gitignore b/Applications/floyd_warshall/.gitignore
@@ -0,0 +1 @@
+applications_floyd_warshall
diff --git a/Applications/floyd_warshall/CMakeLists.txt b/Applications/floyd_warshall/CMakeLists.txt
@@ -0,0 +1,58 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Name shared by the project, the executable target and the ctest test.
+set(example_name applications_floyd_warshall)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+# Cache option that selects the GPU language; the STRINGS property lists the accepted values.
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+set(GPU_RUNTIMES "HIP" "CUDA")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})
+
+# Abort the configuration when GPU_RUNTIME is not one of the supported values.
+if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+# Enable the selected language and require standard 17 without compiler extensions for it.
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+# Fall back to the ROCm installation as search prefix unless the user already set CMAKE_PREFIX_PATH.
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest
+add_test(${example_name} ${example_name})
+
+# Directory that is added to the include path for both runtimes.
+set(include_dirs "../../Common")
+# For examples targeting NVIDIA, include the HIP header directory.
+if(GPU_RUNTIME STREQUAL "CUDA")
+    list(APPEND include_dirs "${ROCM_ROOT}/include")
+endif()
+
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+# Compile main.hip with the language selected through GPU_RUNTIME.
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/Applications/floyd_warshall/Makefile b/Applications/floyd_warshall/Makefile
@@ -0,0 +1,60 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Name of the produced executable and location of the shared example headers.
+EXAMPLE := applications_floyd_warshall
+COMMON_INCLUDE_DIR := ../../Common
+# Backend selection; override on the command line, e.g. `make GPU_RUNTIME=CUDA`.
+GPU_RUNTIME := HIP
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+# Compiler driver; `?=` keeps a value that was already supplied by the caller.
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags
+# The I-prefixed variables collect the flags this Makefile requires; the
+# user-tunable CXXFLAGS/CPPFLAGS/LDFLAGS/LDLIBS are appended to them below, so
+# overriding those on the command line never drops a required flag.
+CXX_STD   := c++17
+ICXXFLAGS := -std=$(CXX_STD)
+ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+ILDFLAGS  :=
+ILDLIBS   :=
+
+# Backend-specific flags. The bodies are indented with spaces rather than tabs
+# so that make can never interpret them as recipe lines of a preceding rule.
+ifeq ($(GPU_RUNTIME), CUDA)
+    ICXXFLAGS += -x cu
+    ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+else ifeq ($(GPU_RUNTIME), HIP)
+    CXXFLAGS ?= -Wall -Wextra
+else
+    $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+# Append the user-provided flags after the required ones.
+ICXXFLAGS += $(CXXFLAGS)
+ICPPFLAGS += $(CPPFLAGS)
+ILDFLAGS  += $(LDFLAGS)
+ILDLIBS   += $(LDLIBS)
+
+# Default goal: compile and link the example in one step. Only main.hip ($<) is
+# passed to the compiler; the headers are listed to trigger a rebuild on change.
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp
+	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
+
+# Remove the executable built by this Makefile.
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/Applications/floyd_warshall/README.md b/Applications/floyd_warshall/README.md
@@ -0,0 +1,60 @@
+# Applications Floyd-Warshall Example
+
+## Description
+This example showcases a GPU implementation of the [Floyd-Warshall algorithm](https://en.wikipedia.org/wiki/Floyd%E2%80%93Warshall_algorithm), which computes the shortest path between each pair of nodes in a given directed and (in this case) complete graph $G = (V, E, \omega)$. The key point of this implementation is that each kernel launch represents a step $k$ of the traditional CPU-implemented algorithm. Therefore, the kernel is launched as many times as the graph has nodes $\left(n = \vert V \vert \right)$.
+
+In this example, there are `iterations` (consecutive) executions of the algorithm on the same graph. As each execution requires an unmodified graph input, multiple copy operations are required. Hence, the performance of the example can be improved by using _pinned memory_.
+
+Pinned memory is simply a special kind of memory that cannot be paged out of the physical memory of a process, meaning that the virtual addresses associated with it are always mapped to physical memory. When copying data from/to the host to/from the GPU, the host source/destination must be pinned memory and, in case it is not, an extra allocation of pinned memory is first performed (copying the data residing in or being copied to the non-pinned host memory) and then the actual copy of the data takes place.
+
+Therefore, using pinned memory saves around 2x the time needed to copy from/to host memory. In this example, performance is improved by using this type of memory, given that there are `iterations` (consecutive) executions of the algorithm on the same graph.
+
+### Application flow
+1. Default values for the number of nodes of the graph and the number of iterations for the algorithm execution are set.
+2. Command line arguments are parsed (if any) and the previous values are updated.
+3. A number of constants are defined for kernel execution and input/output data size.
+4. Host memory is allocated for the distance matrix and initialized with the increasing sequence $1,2,3,\dots$ . These values represent the weights of the edges of the graph.
+5. Host memory is allocated for the adjacency matrix and initialized such that the initial path between each pair of vertices $x,y \in V$ ($x \neq y$) is the edge $(x,y)$.
+6. Pinned memory is allocated and mapped to device memory. The latter is initialized with the input matrices (distance and adjacency) representing the graph $G$ and the Floyd-Warshall kernel is executed for each node of the graph.
+7. The resulting distance and adjacency matrices are copied to the host and pinned memory is freed.
+8. The mean time in milliseconds needed for each iteration is printed to standard output.
+9. The results obtained are compared with the CPU implementation of the algorithm. The result of the comparison is printed to the standard output.
+
+
+### Command line interface
+There are three parameters available:
+- `-h` displays information about the available parameters and their default values.
+- `-n nodes` sets `nodes` as the number of nodes of the graph to which the Floyd-Warshall algorithm will be applied. It must be a (positive) multiple of `block_size` (= 16). Its default value is 16.
+- `-i iterations` sets `iterations` as the number of times that the algorithm will be applied to the (same) graph. It must be an integer greater than 0. Its default value is 1.
+
+## Key APIs and Concepts
+- For this GPU implementation of the Floyd-Warshall algorithm, the main kernel (`floyd_warshall_kernel`) is launched in a 2-dimensional grid. Each thread in the grid computes the shortest path between two nodes of the graph at a certain step $k$ $\left(0 \leq k < n \right)$. The threads compare the previously computed shortest paths using only the nodes in $V'=\{v_0,v_1,...,v_{k-1}\} \subseteq V$ as intermediate nodes with the paths that include node $v_k$ as an intermediate node, and take the shortest option. Therefore, the kernel is launched $n$ times.
+- For improved performance, pinned memory is used to pass the results obtained in each iteration to the next one. With `hipHostMalloc` pinned host memory (accessible by the device) can be allocated, and `hipHostFree` frees it. In this example, host pinned memory is allocated using the `hipHostMallocMapped` flag, which indicates that `hipHostMalloc` must map the allocation into the address space of the current device. The device pointer to such allocated pinned memory is obtained with `hipHostGetDevicePointer`. Beware that an excessive allocation of pinned memory can slow down the host execution, as the program is left with less physical memory available to map the rest of the virtual addresses used.
+- With `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others.
+- `hipLaunchKernelGGL` queues the kernel execution on the device. All the kernels are launched on the `hipStreamDefault`, meaning that these executions are performed in order. `hipGetLastError` returns the last error produced by any runtime API call, which makes it possible to check whether any kernel launch resulted in an error.
+- `hipEventCreate` creates the events used to measure kernel execution time, `hipEventRecord` starts recording an event and `hipEventSynchronize` waits for all the previous work in the stream when the specified event was recorded. With these three functions the start and stop times of the kernel can be measured, and with `hipEventElapsedTime` the kernel execution time (in milliseconds) can be obtained.
+
+## Demonstrated API Calls
+
+### HIP runtime
+#### Device symbols
+- `blockIdx`
+- `blockDim`
+- `threadIdx`
+
+#### Host symbols
+- `__global__`
+- `hipEventCreate`
+- `hipEventElapsedTime`
+- `hipEventRecord`
+- `hipEventSynchronize`
+- `hipGetLastError`
+- `hipHostFree`
+- `hipHostGetDevicePointer`
+- `hipHostMalloc`
+- `hipHostMallocMapped`
+- `hipLaunchKernelGGL`
+- `hipMemcpy`
+- `hipMemcpyDeviceToHost`
+- `hipMemcpyHostToDevice`
+- `hipStreamDefault`