From 075db8940be956516aa7dc465799514bc2e69c06 Mon Sep 17 00:00:00 2001 From: Beatriz Navidad Vilches <61422851+Beanavil@users.noreply.github.com> Date: Fri, 10 Nov 2023 15:52:25 +0100 Subject: [PATCH] HIP SDK Application Examples (part II) (#50) * Applications example Monte Carlo pi * Added prefix sum application example * Improve Floyd-Warshall example - Use Pinned memory for host-device copy - Use device memory for the computation - Update application README - Remove void** cast from hipMalloc * Added Histogram example * Add applications/prefix_sum to portable examples * Implement the convolution example * Bitonic sort example --------- Co-authored-by: Nol Moonen Co-authored-by: Nara Prasetya Co-authored-by: Balint Soproni Co-authored-by: Robin Voetter --- Applications/CMakeLists.txt | 9 +- Applications/Makefile | 13 +- Applications/bitonic_sort/.gitignore | 1 + Applications/bitonic_sort/CMakeLists.txt | 63 ++++ Applications/bitonic_sort/Makefile | 60 ++++ Applications/bitonic_sort/README.md | 61 ++++ Applications/bitonic_sort/bitonic_sort.svg | 4 + .../bitonic_sort/bitonic_sort_vs2017.sln | 24 ++ .../bitonic_sort/bitonic_sort_vs2017.vcxproj | 134 +++++++ .../bitonic_sort_vs2017.vcxproj.filters | 30 ++ .../bitonic_sort/bitonic_sort_vs2019.sln | 24 ++ .../bitonic_sort/bitonic_sort_vs2019.vcxproj | 134 +++++++ .../bitonic_sort_vs2019.vcxproj.filters | 30 ++ .../bitonic_sort/bitonic_sort_vs2022.sln | 24 ++ .../bitonic_sort/bitonic_sort_vs2022.vcxproj | 134 +++++++ .../bitonic_sort_vs2022.vcxproj.filters | 30 ++ Applications/bitonic_sort/main.hip | 240 +++++++++++++ Applications/convolution/.gitignore | 1 + Applications/convolution/CMakeLists.txt | 63 ++++ Applications/convolution/Makefile | 60 ++++ Applications/convolution/README.md | 58 ++++ .../convolution/convolution_vs2017.sln | 24 ++ .../convolution/convolution_vs2017.vcxproj | 134 +++++++ .../convolution_vs2017.vcxproj.filters | 30 ++ .../convolution/convolution_vs2019.sln | 24 ++ .../convolution/convolution_vs2019.vcxproj | 134 +++++++ .../convolution_vs2019.vcxproj.filters | 30 ++ .../convolution/convolution_vs2022.sln | 24 ++ .../convolution/convolution_vs2022.vcxproj | 134 +++++++ .../convolution_vs2022.vcxproj.filters | 30 ++ Applications/convolution/main.hip | 328 ++++++++++++++++++ Applications/floyd_warshall/README.md | 15 +- Applications/floyd_warshall/main.hip | 49 ++- Applications/histogram/CMakeLists.txt | 63 ++++ Applications/histogram/Makefile | 60 ++++ Applications/histogram/README.md | 58 ++++ .../histogram/bank_conflict_reduction.svg | 4 + Applications/histogram/histogram_example.svg | 4 + Applications/histogram/histogram_vs2017.sln | 24 ++ .../histogram/histogram_vs2017.vcxproj | 134 +++++++ .../histogram_vs2017.vcxproj.filters | 30 ++ Applications/histogram/histogram_vs2019.sln | 24 ++ .../histogram/histogram_vs2019.vcxproj | 134 +++++++ .../histogram_vs2019.vcxproj.filters | 30 ++ Applications/histogram/histogram_vs2022.sln | 24 ++ .../histogram/histogram_vs2022.vcxproj | 134 +++++++ .../histogram_vs2022.vcxproj.filters | 30 ++ Applications/histogram/main.hip | 180 ++++++++++ Applications/monte_carlo_pi/.gitignore | 1 + Applications/monte_carlo_pi/CMakeLists.txt | 75 ++++ Applications/monte_carlo_pi/Makefile | 69 ++++ Applications/monte_carlo_pi/README.md | 81 +++++ Applications/monte_carlo_pi/main.hip | 193 +++++++++++ Applications/prefix_sum/.gitignore | 1 + Applications/prefix_sum/CMakeLists.txt | 63 ++++ Applications/prefix_sum/Makefile | 60 ++++ Applications/prefix_sum/README.md | 63 ++++ Applications/prefix_sum/main.hip | 219 ++++++++++++ .../prefix_sum/prefix_sum_diagram.svg | 4 + Applications/prefix_sum/prefix_sum_vs2017.sln | 24 ++ .../prefix_sum/prefix_sum_vs2017.vcxproj | 134 +++++++ .../prefix_sum_vs2017.vcxproj.filters | 30 ++ Applications/prefix_sum/prefix_sum_vs2019.sln | 24 ++ .../prefix_sum/prefix_sum_vs2019.vcxproj | 134 +++++++ .../prefix_sum_vs2019.vcxproj.filters | 30 ++ Applications/prefix_sum/prefix_sum_vs2022.sln | 24 ++ .../prefix_sum/prefix_sum_vs2022.vcxproj | 134 +++++++ .../prefix_sum_vs2022.vcxproj.filters | 30 ++ Common/hiprand_utils.hpp | 47 +++ .../hip-libraries-cuda-ubuntu.Dockerfile | 10 + README.md | 5 + ROCm-Examples-Portable-VS2017.sln | 28 ++ ROCm-Examples-Portable-VS2019.sln | 28 ++ ROCm-Examples-Portable-VS2022.sln | 28 ++ ROCm-Examples-VS2017.sln | 28 ++ ROCm-Examples-VS2019.sln | 28 ++ ROCm-Examples-VS2022.sln | 29 ++ 77 files changed, 4682 insertions(+), 28 deletions(-) create mode 100644 Applications/bitonic_sort/.gitignore create mode 100644 Applications/bitonic_sort/CMakeLists.txt create mode 100644 Applications/bitonic_sort/Makefile create mode 100644 Applications/bitonic_sort/README.md create mode 100644 Applications/bitonic_sort/bitonic_sort.svg create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2017.sln create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj.filters create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2019.sln create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj.filters create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2022.sln create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj create mode 100644 Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj.filters create mode 100644 Applications/bitonic_sort/main.hip create mode 100644 Applications/convolution/.gitignore create mode 100644 Applications/convolution/CMakeLists.txt create mode 100644 Applications/convolution/Makefile create mode 100644 Applications/convolution/README.md create mode 100644 Applications/convolution/convolution_vs2017.sln create mode 100644 Applications/convolution/convolution_vs2017.vcxproj create mode 100644 Applications/convolution/convolution_vs2017.vcxproj.filters create mode 100644 Applications/convolution/convolution_vs2019.sln create mode 100644 Applications/convolution/convolution_vs2019.vcxproj create mode 100644 Applications/convolution/convolution_vs2019.vcxproj.filters create mode 100644 Applications/convolution/convolution_vs2022.sln create mode 100644 Applications/convolution/convolution_vs2022.vcxproj create mode 100644 Applications/convolution/convolution_vs2022.vcxproj.filters create mode 100644 Applications/convolution/main.hip create mode 100644 Applications/histogram/CMakeLists.txt create mode 100644 Applications/histogram/Makefile create mode 100644 Applications/histogram/README.md create mode 100644 Applications/histogram/bank_conflict_reduction.svg create mode 100644 Applications/histogram/histogram_example.svg create mode 100644 Applications/histogram/histogram_vs2017.sln create mode 100644 Applications/histogram/histogram_vs2017.vcxproj create mode 100644 Applications/histogram/histogram_vs2017.vcxproj.filters create mode 100644 Applications/histogram/histogram_vs2019.sln create mode 100644 Applications/histogram/histogram_vs2019.vcxproj create mode 100644 Applications/histogram/histogram_vs2019.vcxproj.filters create mode 100644 Applications/histogram/histogram_vs2022.sln create mode 100644 Applications/histogram/histogram_vs2022.vcxproj create mode 100644 Applications/histogram/histogram_vs2022.vcxproj.filters create mode 100644 Applications/histogram/main.hip create mode 100644 Applications/monte_carlo_pi/.gitignore create mode 100644 Applications/monte_carlo_pi/CMakeLists.txt create mode 100644 Applications/monte_carlo_pi/Makefile create mode 100644 Applications/monte_carlo_pi/README.md create mode 100644 Applications/monte_carlo_pi/main.hip create mode 100644 Applications/prefix_sum/.gitignore create mode 100644 Applications/prefix_sum/CMakeLists.txt create mode 100644 Applications/prefix_sum/Makefile create mode 100644 Applications/prefix_sum/README.md create mode 100644 Applications/prefix_sum/main.hip create mode 100644 Applications/prefix_sum/prefix_sum_diagram.svg create mode 100644 Applications/prefix_sum/prefix_sum_vs2017.sln create mode 100644 Applications/prefix_sum/prefix_sum_vs2017.vcxproj create mode 100644 Applications/prefix_sum/prefix_sum_vs2017.vcxproj.filters create mode 100644 Applications/prefix_sum/prefix_sum_vs2019.sln create mode 100644 Applications/prefix_sum/prefix_sum_vs2019.vcxproj create mode 100644 Applications/prefix_sum/prefix_sum_vs2019.vcxproj.filters create mode 100644 Applications/prefix_sum/prefix_sum_vs2022.sln create mode 100644 Applications/prefix_sum/prefix_sum_vs2022.vcxproj create mode 100644 Applications/prefix_sum/prefix_sum_vs2022.vcxproj.filters create mode 100644 Common/hiprand_utils.hpp diff --git a/Applications/CMakeLists.txt b/Applications/CMakeLists.txt index 98a45b32b..c0c9c0cff 100644 --- a/Applications/CMakeLists.txt +++ b/Applications/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -23,4 +23,11 @@ cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(Applications LANGUAGES CXX) +add_subdirectory(bitonic_sort) +add_subdirectory(convolution) add_subdirectory(floyd_warshall) +add_subdirectory(histogram) +if(NOT WIN32 AND GPU_RUNTIME STREQUAL "HIP") + add_subdirectory(monte_carlo_pi) +endif() +add_subdirectory(prefix_sum) diff --git a/Applications/Makefile b/Applications/Makefile index d1a397102..bdfede296 100644 --- a/Applications/Makefile +++ b/Applications/Makefile @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -21,7 +21,16 @@ # SOFTWARE. EXAMPLES := \ - floyd_warshall + bitonic_sort \ + convolution \ + floyd_warshall \ + histogram \ + prefix_sum + +ifneq ($(GPU_RUNTIME), CUDA) +EXAMPLES += \ + monte_carlo_pi +endif all: $(EXAMPLES) diff --git a/Applications/bitonic_sort/.gitignore b/Applications/bitonic_sort/.gitignore new file mode 100644 index 000000000..5485cb76d --- /dev/null +++ b/Applications/bitonic_sort/.gitignore @@ -0,0 +1 @@ +applications_bitonic_sort diff --git a/Applications/bitonic_sort/CMakeLists.txt b/Applications/bitonic_sort/CMakeLists.txt new file mode 100644 index 000000000..9a3cf182e --- /dev/null +++ b/Applications/bitonic_sort/CMakeLists.txt @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name applications_bitonic_sort) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) + +install(TARGETS ${example_name}) diff --git a/Applications/bitonic_sort/Makefile b/Applications/bitonic_sort/Makefile new file mode 100644 index 000000000..c4eced363 --- /dev/null +++ b/Applications/bitonic_sort/Makefile @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := applications_bitonic_sort +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) +ILDFLAGS := +ILDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp + $(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/Applications/bitonic_sort/README.md b/Applications/bitonic_sort/README.md new file mode 100644 index 000000000..fa4ec1b8c --- /dev/null +++ b/Applications/bitonic_sort/README.md @@ -0,0 +1,61 @@ +# Applications Bitonic Sort Example + +## Description +This example showcases a GPU implementation of the [bitonic sort](https://en.wikipedia.org/wiki/Bitonic_sorter) and uses it to order increasingly (or decreasingly) an array of $n$ elements. Another implementation of the said algorithm exists in rocPRIM and could be used instead. Also, rocPRIM's algorithm would likely offer an improved performance. + +A sequence $\{x_n\}_{n=1}^m$ is called bitonic if it possesses one of the following two properties: + +1. There exists an index $k$ such that $x_0 \leq x_1 \leq \cdots \leq x_k$ and $x_k \geq x_{k+1} \geq \cdots x_{m-1}$ i.e. $\{x_n\}$ is monotonically increasing before $x_k$ and monotonically decreasing after. +2. There exists a permutation $\sigma \in S_m$ of the indices such that $\{x_{\sigma(n)}\}_{n=1}^m$ satisfies the above property. + +Each step $i$ of this bitonic sort implementation yields bitonic subsequences of length $2^{i+2}$, each of them having two monotonically ordered subsequences of length $2^{i+1}$. The idea is to use this bitonic sort for as many steps as necessary to obtain a bitonic sequence of length $2n$, because then our $n$-length array will be monotonically (increasingly or decreasingly) sorted. That is, we need to iterate for a total of $\log_2(n) - 1$ steps. Notice that this also implies that the array to be sorted must have a length equal to a power of two. + +Below is presented an example of how an array of length 8 would be ordered increasingly. An arrow from one element to other means that those two elements are compared in the stage and step indicated in the left columns. The resulting order will be such that the lesser element will be placed at the position from which the arrow starts and the greater element will be placed at the position pointed by the end of the arrow. For an easier understanding, black arrows correspond to an increasing order and grey arrows to a decreasing order of the elements. + +![bitonic_sort.svg](bitonic_sort.svg) + +### Application flow +1. Parse user input. +2. Allocate and initialize host input array and make a copy for the CPU comparison. +3. Define a number of constants for kernel execution. +4. Declare device array and copy input data from host to device. +5. Enqueue calls to the bitonic sort kernel for each step and stage. +6. Copy back to the host the resulting ordered array and free events variables and device memory. +7. Report execution time of the kernels. +8. Compare the array obtained with the CPU implementation of the bitonic sort and print to standard output the result. + +### Command line interface +There are three options available: +- `-h` displays information about the available parameters and their default values. +- `-l ` sets `length` as the number of elements of the array that will be sorted. It must be a power of $2$. Its default value is $2^{15}$. +- `-s ` sets `sort` as the type or sorting that we want our array to have: decreasing ("dec") or increasing ("inc"). The default value is "inc". + +## Key APIs and Concepts +- Device memory is allocated with `hipMalloc` and deallocated with `hipFree`. +- With `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`). +- `hipEventCreate` creates events, which are used in this example to measure the kernels execution time. `hipEventRecord` starts recording an event, `hipEventSynchronize` waits for all the previous work in the stream when the specified event was recorded. With these three functions it can be measured the start and stop times of the kernel and with `hipEventElapsedTime` it can be obtained the kernel execution time in milliseconds. Lastly, `hipEventDestroy` destroys an event. +- `myKernelName<<<...>>>` queues kernel execution on the device. All the kernels are launched on the `hipStreamDefault`, meaning that these executions are performed in order. `hipGetLastError` returns the last error produced by any runtime API call, allowing to check if any kernel launch resulted in error. + +## Demonstrated API Calls + +### HIP runtime +#### Device symbols +- `blockDim` +- `blockIdx` +- `threadIdx` + +#### Host symbols +- `__global__` +- `hipEvent_t` +- `hipEventCreate` +- `hipEventDestroy` +- `hipEventElapsedTime` +- `hipEventRecord` +- `hipEventSynchronize` +- `hipFree` +- `hipGetLastError` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` +- `hipStreamDefault` diff --git a/Applications/bitonic_sort/bitonic_sort.svg b/Applications/bitonic_sort/bitonic_sort.svg new file mode 100644 index 000000000..1f8d6aa41 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort.svg @@ -0,0 +1,4 @@ + + + +
1
1
3
3
1
1
5
5
7
7
4
4
0
0
4
4
Stage
Stage
Step
Step
0
0
1
1
2
2
0
0
0
0
1
1
0
0
1
1
2
2
Result
Result
1
1
3
3
1
1
5
5
4
4
7
7
4
4
0
0
1
1
1
1
3
3
5
5
4
4
7
7
4
4
0
0
1
1
1
1
5
5
3
3
7
7
4
4
4
4
0
0
1
1
1
1
0
0
3
3
7
7
4
4
4
4
5
5
1
1
0
0
1
1
3
3
4
4
4
4
7
7
5
5
0
0
1
1
3
3
1
1
4
4
4
4
5
5
7
7
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Applications/bitonic_sort/bitonic_sort_vs2017.sln b/Applications/bitonic_sort/bitonic_sort_vs2017.sln new file mode 100644 index 000000000..bf9b47cd4 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2017.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.33026.149 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2017", "bitonic_sort_vs2017.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {6C3AE930-86C0-4E39-A249-BBFA047DBE8C} + EndGlobalSection +EndGlobal diff --git a/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj b/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj new file mode 100644 index 000000000..0858bbec0 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {265F7154-A362-45FA-B300-DB74E14BA010} + Win32Proj + bitonic_sort_vs2017 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj.filters b/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj.filters new file mode 100644 index 000000000..529e694ad --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2017.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4e54cfdc-c09f-4186-a227-28788996b4ab} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {72617ae4-f7c5-4de7-a6a6-c0d1b99ff59c} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {425864ec-094b-4e39-b6c2-1bb0cf8ff388} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/bitonic_sort/bitonic_sort_vs2019.sln b/Applications/bitonic_sort/bitonic_sort_vs2019.sln new file mode 100644 index 000000000..26d3582c7 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2019.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2019", "bitonic_sort_vs2019.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0} + EndGlobalSection +EndGlobal diff --git a/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj b/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj new file mode 100644 index 000000000..298326059 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {265F7154-A362-45FA-B300-DB74E14BA010} + Win32Proj + bitonic_sort_vs2019 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + diff --git a/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj.filters b/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj.filters new file mode 100644 index 000000000..529e694ad --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2019.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4e54cfdc-c09f-4186-a227-28788996b4ab} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {72617ae4-f7c5-4de7-a6a6-c0d1b99ff59c} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {425864ec-094b-4e39-b6c2-1bb0cf8ff388} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/bitonic_sort/bitonic_sort_vs2022.sln b/Applications/bitonic_sort/bitonic_sort_vs2022.sln new file mode 100644 index 000000000..b3cbe9d00 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2022.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33213.308 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2022", "bitonic_sort_vs2022.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {538AA3F0-203D-4A51-8C00-F65282A55F3B} + EndGlobalSection +EndGlobal diff --git a/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj b/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj new file mode 100644 index 000000000..1c7870d25 --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {265F7154-A362-45FA-B300-DB74E14BA010} + Win32Proj + bitonic_sort_vs2022 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj.filters b/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj.filters new file mode 100644 index 000000000..529e694ad --- /dev/null +++ b/Applications/bitonic_sort/bitonic_sort_vs2022.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4e54cfdc-c09f-4186-a227-28788996b4ab} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {72617ae4-f7c5-4de7-a6a6-c0d1b99ff59c} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {425864ec-094b-4e39-b6c2-1bb0cf8ff388} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/bitonic_sort/main.hip b/Applications/bitonic_sort/main.hip new file mode 100644 index 000000000..31cd3e4c7 --- /dev/null +++ b/Applications/bitonic_sort/main.hip @@ -0,0 +1,240 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "cmdparser.hpp" +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include +#include + +/// \brief Given an array of n elements, this kernel implements the j-th stage within the i-th +/// step of the bitonic sort, being 0 <= i < log_2(n) and 0 <= j <= i. +__global__ void bitonic_sort_kernel(unsigned int* array, + const unsigned int step, + const unsigned int stage, + bool sort_increasing) +{ + // Current thread id. + unsigned int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // How many pairs of elements are ordered with the same criteria (increasingly or decreasingly) + // within each of the bitonic subsequences computed in each step. E.g. in the step 0 we have + // 1 pair of elements in each monotonic component of the bitonic subsequences, that is, we + // obtain bitonic sequences of length 4. + const unsigned int same_order_block_width = 1 << step; + + // Distance between the two elements that each thread sorts. + const unsigned int pair_distance = 1 << (step - stage); + + // Total number of elements of each subsequence processed. + const unsigned int sorted_block_width = 2 * pair_distance; + + // Compute indexes of the elements of the array that the thread will sort. + const unsigned int left_id + = (thread_id % pair_distance) + (thread_id / pair_distance) * sorted_block_width; + const unsigned int right_id = left_id + pair_distance; + + // Get the elements of the array that the thread will sort. + const unsigned int left_element = array[left_id]; + const unsigned int right_element = array[right_id]; + + // If the current thread is the first one ordering an element from the right component of the + // bitonic sequence that it's computing, then the ordering criteria changes. + if((thread_id / same_order_block_width) % 2 == 1) + sort_increasing = !sort_increasing; + + // Compare elements and switch them if necessary. + const unsigned int greater = (left_element > right_element) ? left_element : right_element; + const unsigned int lesser = (left_element > right_element) ? right_element : left_element; + array[left_id] = (sort_increasing) ? lesser : greater; + array[right_id] = (sort_increasing) ? greater : lesser; +} + +/// \brief Swaps two elements if the first is greater than the second. +void swap_if_first_greater(unsigned int* a, unsigned int* b) +{ + if(*a > *b) + { + std::swap(*a, *b); + } +} + +/// \brief Reference CPU implementation of the bitonic sort for results verification. +void bitonic_sort_reference(unsigned int* array, + const unsigned int length, + const bool sort_increasing) +{ + const unsigned int half_length = length / 2; + + // For each step i' = log_2(i) - 1, 0 <= i' < log_2(length). + for(unsigned int i = 2; i <= length; i *= 2) + { + // For each stage j' = log_2(i / j), 0 <= j' <= i'. + for(unsigned int j = i; j > 1; j /= 2) + { + bool increasing = sort_increasing; + const unsigned int half_j = j / 2; + + // Sort elements separated by distance j / 2. + for(unsigned int k = 0; k < length; k += j) + { + const unsigned int k_plus_half_j = k + half_j; + + // Each time we sort i elements we must change the ordering direction. + if((k == i) || ((i < length) && (k % i) == 0 && (k != half_length))) + { + increasing = !increasing; + } + + // Compare and sort elements. + for(unsigned int l = k; l < k_plus_half_j; ++l) + { + if(increasing) + { + swap_if_first_greater(&array[l], &array[l + half_j]); + } + else + { + swap_if_first_greater(&array[l + half_j], &array[l]); + } + } + } + } + } +} + +int main(int argc, char* argv[]) +{ + // Parse user input. + cli::Parser parser(argc, argv); + parser.set_optional("l", + "log2length", + 15, + "2**l will be the length of the array to be sorted."); + parser.set_optional("s", + "sort", + "inc", + "Sort in decreasing (dec) or increasing (inc) order."); + parser.run_and_exit_if_error(); + + const unsigned int steps = parser.get("l"); + + const std::string sort = parser.get("s"); + if(sort.compare("dec") && sort.compare("inc")) + { + std::cout << "The ordering must be 'dec' or 'inc', the default ordering is 'inc'." + << std::endl; + return 0; + } + const bool sort_increasing = (sort.compare("inc") == 0); + + // Compute length of the array to be sorted. + const unsigned int length = 1u << steps; + + // Allocate and init random host input array. Copy input array for CPU execution. + std::vector array(length); + std::for_each(array.begin(), array.end(), [](unsigned int& e) { e = rand() % 10; }); + + std::vector expected_array(array); + + std::cout << "Sorting an array of " << length << " elements using the bitonic sort." + << std::endl; + + // Declare and allocate device memory and copy input data. + unsigned int* d_array{}; + HIP_CHECK(hipMalloc(&d_array, length * sizeof(unsigned int))); + HIP_CHECK( + hipMemcpy(d_array, array.data(), length * sizeof(unsigned int), hipMemcpyHostToDevice)); + + // Number of threads in each kernel block and number of blocks in the grid. Each thread is in + // charge of 2 elements, so we need enough threads to cover half the length of the array. + const unsigned int local_threads = (length > 256) ? 256 : length / 2; + const unsigned int global_threads = length / 2; + const dim3 block_dim(local_threads); + const dim3 grid_dim(global_threads / local_threads); + + // Create events to measure the execution time of the kernels. + float total_kernels{}; + float kernel_ms{}; + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + // Bitonic sort GPU algorithm: launch bitonic sort kernel for each stage of each step. + for(unsigned int i = 0; i < steps; ++i) + { + // For each step i we need i + 1 stages. + for(unsigned int j = 0; j <= i; ++j) + { + // Record the start event. + HIP_CHECK(hipEventRecord(start, hipStreamDefault)); + + // Launch the bitonic sort kernel on the default stream. + bitonic_sort_kernel<<>>( + d_array, + i, + j, + sort_increasing); + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()); + + // Record the stop event and wait until the kernel execution finishes. + HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); + HIP_CHECK(hipEventSynchronize(stop)); + + // Get the execution time of the kernel and add it to the total count. + HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop)); + total_kernels += kernel_ms; + } + } + + // Copy results back to host. + HIP_CHECK( + hipMemcpy(array.data(), d_array, length * sizeof(unsigned int), hipMemcpyDeviceToHost)); + + // Free events variables and device memory. + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipFree(d_array)); + + // Report execution time. + std::cout << "GPU bitonic sort took " << total_kernels << " milliseconds to complete." + << std::endl; + + // Execute CPU algorithm. + bitonic_sort_reference(expected_array.data(), length, sort_increasing); + + // Verify results and report to user. + unsigned int errors{}; + std::cout << "Validating results with CPU implementation." << std::endl; + for(unsigned int i = 0; i < length; ++i) + { + errors += (array[i] - expected_array[i] != 0); + } + report_validation_result(errors); +} diff --git a/Applications/convolution/.gitignore b/Applications/convolution/.gitignore new file mode 100644 index 000000000..fa270e392 --- /dev/null +++ b/Applications/convolution/.gitignore @@ -0,0 +1 @@ +applications_convolution diff --git a/Applications/convolution/CMakeLists.txt b/Applications/convolution/CMakeLists.txt new file mode 100644 index 000000000..d60ca6778 --- /dev/null +++ b/Applications/convolution/CMakeLists.txt @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name applications_convolution) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) + +install(TARGETS ${example_name}) diff --git a/Applications/convolution/Makefile b/Applications/convolution/Makefile new file mode 100644 index 000000000..6a5ab1e58 --- /dev/null +++ b/Applications/convolution/Makefile @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := applications_convolution +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) +ILDFLAGS := +ILDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp + $(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/Applications/convolution/README.md b/Applications/convolution/README.md new file mode 100644 index 000000000..f0786ce6b --- /dev/null +++ b/Applications/convolution/README.md @@ -0,0 +1,58 @@ +# Applications Convolution Example + +## Description +This example showcases a simple GPU implementation for calculating the [discrete convolution](https://en.wikipedia.org/wiki/Convolution#Discrete_convolution). The key point of this implementation is that in the GPU kernel each thread calculates the value for a convolution for a given element in the resulting grid. + +For storing the mask constant memory is used. Constant memory is a read-only memory that is limited in size, but offers faster access times than regular memory. Furthermore on some architectures it has a separate cache. Therefore accessing constant memory can reduce the pressure on the memory system. + +### Application flow +1. Default values for the size of the grid, mask and the number of iterations for the algorithm execution are set. +2. Command line arguments are parsed. +3. Host memory is allocated for the input, output and the mask. Input data is initialized with random numbers between 0-256. +4. Input data is copied to the device. +5. The simple convolution kernel is executed multiple times. Number of iterations is specified by the `-i` flag. +6. The resulting convoluted grid is copied to the host and device memory is freed. +7. The mean time in milliseconds needed for each iteration is printed to standard output as well as the mean estimated bandwidth. +8. The results obtained are compared with the CPU implementation of the algorithm. The result of the comparison is printed to the standard output. +9. In case requested the convoluted grid, the input grid, and the reference results are printed to standard output. + +### Command line interface +There are three parameters available: +- `-h` displays information about the available parameters and their default values. +- `-x width` sets the grid size in the x direction. Default value is 4096. +- `-y height` sets the grid size in the y direction. Default value is 4096. +- `-p` Toggles the printing of the input, reference and output grids. +- `-i iterations` sets the number of times that the algorithm will be applied to the (same) grid. It must be an integer greater than 0. Its default value is 10. + +## Key APIs and Concepts +- For this GPU implementation of the simple convolution calculation, the main kernel (`convolution`) is launched in a 2-dimensional grid. Each thread computes the convolution for one element of the resulting grid. +- Device memory is allocated with `hipMalloc` which is later freed by `hipFree`. +- Constant memory is declared in global scope for the mask, using the `__constant__` qualifier. The size of the object stored in constant memory must be available at compile time. Later the memory is initialized with `hipMemcpyToSymbol`. +- With `hipMemcpy` data can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`). +- `myKernelName<<<...>>>` queues the kernel execution on the device. All the kernels are launched on the default stream `hipStreamDefault`, meaning that these executions are performed in order. `hipGetLastError` returns the last error produced by any runtime API call, allowing to check if any kernel launch resulted in an error. +- `hipEventCreate` creates the events used to measure kernel execution time, `hipEventRecord` starts recording an event and `hipEventSynchronize` waits for all the previous work in the stream when the specified event was recorded. These three functions can be used to measure the start and stop times of the kernel, and with `hipEventElapsedTime` the kernel execution time (in milliseconds) can be obtained. With `hipEventDestroy` the created events are freed. + +## Demonstrated API Calls + +### HIP runtime +#### Device symbols +- `blockIdx` +- `blockDim` +- `threadIdx` + +#### Host symbols +- `__global__` +- `__constant__` +- `hipEventCreate` +- `hipEventDestroy` +- `hipEventElapsedTime` +- `hipEventRecord` +- `hipEventSynchronize` +- `hipFree` +- `hipGetLastError` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` +- `hipMemcpyToSymbol` +- `hipStreamDefault` diff --git a/Applications/convolution/convolution_vs2017.sln b/Applications/convolution/convolution_vs2017.sln new file mode 100644 index 000000000..bc9ab60b3 --- /dev/null +++ b/Applications/convolution/convolution_vs2017.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.33026.149 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2017", "convolution_vs2017.vcxproj", "{4232B140-4C47-4961-8A8A-F67D14DC9349}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.ActiveCfg = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.Build.0 = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.ActiveCfg = Release|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {6C3AE930-86C0-4E39-A249-BBFA047DBE8C} + EndGlobalSection +EndGlobal diff --git a/Applications/convolution/convolution_vs2017.vcxproj b/Applications/convolution/convolution_vs2017.vcxproj new file mode 100644 index 000000000..e32226d3d --- /dev/null +++ b/Applications/convolution/convolution_vs2017.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {4232B140-4C47-4961-8A8A-F67D14DC9349} + Win32Proj + convolution_vs2017 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/convolution/convolution_vs2017.vcxproj.filters b/Applications/convolution/convolution_vs2017.vcxproj.filters new file mode 100644 index 000000000..529e694ad --- /dev/null +++ b/Applications/convolution/convolution_vs2017.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4e54cfdc-c09f-4186-a227-28788996b4ab} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {72617ae4-f7c5-4de7-a6a6-c0d1b99ff59c} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {425864ec-094b-4e39-b6c2-1bb0cf8ff388} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/convolution/convolution_vs2019.sln b/Applications/convolution/convolution_vs2019.sln new file mode 100644 index 000000000..1c2df7c3c --- /dev/null +++ b/Applications/convolution/convolution_vs2019.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2019", "convolution_vs2019.vcxproj", "{A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.ActiveCfg = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.Build.0 = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.ActiveCfg = Release|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0} + EndGlobalSection +EndGlobal diff --git a/Applications/convolution/convolution_vs2019.vcxproj b/Applications/convolution/convolution_vs2019.vcxproj new file mode 100644 index 000000000..fa6042eda --- /dev/null +++ b/Applications/convolution/convolution_vs2019.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F} + Win32Proj + convolution_vs2019 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/convolution/convolution_vs2019.vcxproj.filters b/Applications/convolution/convolution_vs2019.vcxproj.filters new file mode 100644 index 000000000..a36242334 --- /dev/null +++ b/Applications/convolution/convolution_vs2019.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {2932a426-602b-4926-887e-27c50ba7eab7} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {ed043ec4-e8ac-4831-93f5-a58546ec7bea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {0da954bd-e555-4454-b082-b68d10c753b9} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/convolution/convolution_vs2022.sln b/Applications/convolution/convolution_vs2022.sln new file mode 100644 index 000000000..c5667ff7c --- /dev/null +++ b/Applications/convolution/convolution_vs2022.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33213.308 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2022", "convolution_vs2022.vcxproj", "{E98F33FC-C29B-4229-A853-51C490D74E3E}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.ActiveCfg = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.Build.0 = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.ActiveCfg = Release|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {538AA3F0-203D-4A51-8C00-F65282A55F3B} + EndGlobalSection +EndGlobal diff --git a/Applications/convolution/convolution_vs2022.vcxproj b/Applications/convolution/convolution_vs2022.vcxproj new file mode 100644 index 000000000..ffc1448e2 --- /dev/null +++ b/Applications/convolution/convolution_vs2022.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {E98F33FC-C29B-4229-A853-51C490D74E3E} + Win32Proj + convolution_vs2022 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + diff --git a/Applications/convolution/convolution_vs2022.vcxproj.filters b/Applications/convolution/convolution_vs2022.vcxproj.filters new file mode 100644 index 000000000..0aea6e69b --- /dev/null +++ b/Applications/convolution/convolution_vs2022.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + + + Source Files + + + \ No newline at end of file diff --git a/Applications/convolution/main.hip b/Applications/convolution/main.hip new file mode 100644 index 000000000..561b44adc --- /dev/null +++ b/Applications/convolution/main.hip @@ -0,0 +1,328 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "cmdparser.hpp" +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// clang-format off +/// \brief Convolution filter using arbitrary values +const constexpr std::array convolution_filter_5x5 = {1.0f, 3.0f, 0.0f, -2.0f, -0.0f, + 1.0f, 4.0f, 0.0f, -8.0f, -4.0f, + 2.0f, 7.0f, 0.0f, -12.0f, -0.0f, + 2.0f, 3.0f, 1.5f, -8.0f, -4.0f, + 0.0f, 1.0f, 0.0f, -2.0f, -0.0f}; +// clang-format on + +/// \brief allocate memory in constant address space for the mask on the device +__constant__ float d_mask[5 * 5]; + +/// \brief Implements a convolution for an input grid \p input and a \p d_mask that is defined in constant memory. The \p input needs +/// to be padded such that \p mask_size is taken into account, i.e. padded_width = floor(mask_width/2) * 2 + width +/// and padded_height = floor(mask_height/2) * 2 + height +template +__global__ void convolution(const float* input, float* output, const uint2 input_dimensions) +{ + const size_t x = blockDim.x * blockIdx.x + threadIdx.x; + const size_t y = blockDim.y * blockIdx.y + threadIdx.y; + const size_t width = input_dimensions.x; + const size_t height = input_dimensions.y; + const size_t padded_width = width + (MaskWidth / 2) * 2; + + // Check if the currently computed element is inside the grid domain. + if(x >= width || y >= height) + return; + + // Temporary storage variables. + float sum = 0.0f; + const size_t convolution_base = y * padded_width + x; + + // Iterate over the mask in both x and y direction. + for(size_t mask_index_y = 0; mask_index_y < MaskWidth; ++mask_index_y) + { + for(size_t mask_index_x = 0; mask_index_x < MaskWidth; ++mask_index_x) + { + const size_t mask_index = mask_index_y * MaskWidth + mask_index_x; + const size_t convolution_offset = mask_index_y * padded_width + mask_index_x; + sum += input[convolution_base + convolution_offset] * d_mask[mask_index]; + } + } + + output[y * width + x] = sum; +} + +template +void print_grid(std::vector vec, int width) +{ + size_t num_rows = vec.size() / width; + auto it = vec.begin(); + for(size_t i = 0; i < num_rows; i++) + { + std::copy(it, it + width, std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + it += width; + } +} + +/// \brief Reference CPU implementation of convolution for results verification. +template +void convolution_reference(std::vector& verificationOutput, + const std::vector& paddedInput, + const mask_type& mask, + const unsigned int height, + const unsigned int width, + const unsigned int mask_width) +{ + // padded_width = width + floor(mask_width / 2) * 2 + const unsigned int padded_width = width + (mask_width / 2) * 2; + // Iterate over the provided grid. + for(unsigned int y = 0; y < height; y++) + { + + for(unsigned int x = 0; x < width; x++) + { + // temporary for summation. + float sum = 0.0f; + // Iterate over the mask for the given element. + for(unsigned int mask_index_y = 0; mask_index_y < mask_width; ++mask_index_y) + { + for(unsigned int mask_index_x = 0; mask_index_x < mask_width; ++mask_index_x) + { + unsigned int mask_index = mask_index_y * mask_width + mask_index_x; + unsigned int input_index + = (y + mask_index_y) * padded_width + (x + mask_index_x); + sum += paddedInput[input_index] * mask[mask_index]; + } + } + verificationOutput[(y * width + x)] = sum; + } + } +} + +/// \brief Adds to a command line parser the necessary options for this example. +template +void configure_parser(cli::Parser& parser) +{ + // Default parameters. + const constexpr unsigned int width = 4096; + const constexpr unsigned int height = 4096; + const constexpr unsigned int iterations = 10; + const constexpr bool print = false; + + parser.set_optional("x", "width", width, "Width of the input grid"); + parser.set_optional("y", "height", height, "Height of the input grid"); + parser.set_optional("i", + "iterations", + iterations, + "Number of times the algorithm is executed."); + parser.set_optional("p", "print", print, "Enables printing the convoluted grid"); +} + +int main(int argc, char* argv[]) +{ + // Number of threads in each kernel block dimension. + const constexpr unsigned int block_size = 32; + const constexpr unsigned int mask_width = 5; + + // Parse user input. + cli::Parser parser(argc, argv); + configure_parser(parser); + parser.run_and_exit_if_error(); + + // Get number of nodes and iterations from the command line, if provided. + const unsigned int width = parser.get("x"); + const unsigned int height = parser.get("y"); + const unsigned int iterations = parser.get("i"); + const bool print = parser.get("p"); + + // Check values provided. + if(width < 1) + { + std::cout << "Width must be at least 1. (provided " << width << " )" << std::endl; + return error_exit_code; + } + if(height < 1) + { + std::cout << "Height must be at least 1. (provided " << height << " )" << std::endl; + return error_exit_code; + } + if(iterations < 1) + { + std::cout << "Iterations must be at least 1. (provided " << iterations << " )" + << std::endl; + return error_exit_code; + } + + // Total number of elements and bytes of the input grid. + const unsigned int size = width * height; + const unsigned int size_bytes = size * sizeof(float); + + const constexpr unsigned int mask_element_num = mask_width * mask_width; + const constexpr unsigned int mask_size_bytes = mask_element_num * sizeof(float); + const constexpr unsigned int filter_radius = mask_width / 2; + + const unsigned int padded_width = width + filter_radius * 2; + const unsigned int padded_height = height + filter_radius * 2; + const unsigned int input_size_padded = padded_width * padded_height; + const unsigned int input_size_padded_bytes = input_size_padded * sizeof(float); + + auto mask = convolution_filter_5x5; + + // Allocate host input grid initialized with random floats between 0-256. + std::vector input_grid(size); + std::mt19937 mersenne_engine{0}; + std::uniform_real_distribution distribution{0, 256}; + auto rnd = std::bind(distribution, mersenne_engine); + std::generate(input_grid.begin(), input_grid.end(), rnd); + + // Allocate output grid. + std::vector output_grid(size); + + // Allocate padded input with zero boundary condition. + std::vector input_grid_padded(input_size_padded, 0); + + auto input_grid_row_begin = input_grid.begin(); + auto padded_input_grid_row_begin + = input_grid_padded.begin() + filter_radius * padded_width + filter_radius; + for(unsigned int i = 0; i < height; i++) + { + std::copy(input_grid_row_begin, input_grid_row_begin + width, padded_input_grid_row_begin); + padded_input_grid_row_begin += padded_width; + input_grid_row_begin += width; + } + + // Allocate host memory for the CPU implementation and copy input data. + std::vector expected_output_grid(output_grid); + + std::cout << "Executing a simple convolution for " << iterations << " iterations with a " + << width << " x " << height << " sized grid." << std::endl; + + // Allocate device memory. + float* d_input_grid_padded; + float* d_output_grid; + + HIP_CHECK(hipMalloc(&d_input_grid_padded, input_size_padded_bytes)); + HIP_CHECK(hipMalloc(&d_output_grid, size_bytes)); + + // Copy input data from host to device memory. + HIP_CHECK(hipMemcpy(d_input_grid_padded, + input_grid_padded.data(), + input_size_padded_bytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpyToSymbol(d_mask, mask.data(), mask_size_bytes)); + + // Cumulative variable to compute the mean bandwidth per iteration of the algorithm. + double kernel_bandwidths = 0; + + // Cumulative variable to compute the mean time per iteration of the algorithm. + double kernel_time = 0; + + // Create events to measure the execution time of the kernels. + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + // Number of threads in each kernel block and number of blocks in the grid. + const dim3 block_dim(block_size, block_size); + const dim3 grid_dim((width + block_size) / block_size, (height + block_size) / block_size); + + // Run iterations times the convolution GPU algorithm. + for(unsigned int i = 0; i < iterations; ++i) + { + float kernel_ms{}; + + // Record the start event. + HIP_CHECK(hipEventRecord(start, hipStreamDefault)); + + // Launch Convolution kernel on the default stream. + convolution<<>>(d_input_grid_padded, + d_output_grid, + {width, height}); + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()); + + // Record the stop event and wait until the kernel execution finishes. + HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); + HIP_CHECK(hipEventSynchronize(stop)); + + // Get the execution time of the kernel and add it to the total count. + HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop)); + kernel_time += kernel_ms; + kernel_bandwidths += (size_bytes + input_size_padded_bytes) / kernel_ms; + } + + // Destroy hipEvents. + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + + // Copy results back to host. + HIP_CHECK(hipMemcpy(output_grid.data(), d_output_grid, size_bytes, hipMemcpyDeviceToHost)); + + // Free device memory. + HIP_CHECK(hipFree(d_input_grid_padded)); + HIP_CHECK(hipFree(d_output_grid)); + + // Print the mean time per iteration (in miliseconds) of the algorithm, and the estimated mean bandwidth in (GB/s). + double average_bandwidth = kernel_bandwidths / iterations; + kernel_time /= iterations; + std::cout << "The mean time needed for each iteration has been " << kernel_time + << "ms and mean bandwidth was " << average_bandwidth / 1e6 << " GB/s" << std::endl; + + // Execute CPU algorithm. + convolution_reference(expected_output_grid, input_grid_padded, mask, height, width, mask_width); + + // Print the calculated grids. + if(print) + { + std::cout << "Input grid:" << std::endl; + print_grid(input_grid, width); + std::cout << "Result grid:" << std::endl; + print_grid(output_grid, width); + std::cout << "CPU reference grid:" << std::endl; + print_grid(expected_output_grid, width); + } + + // Verify results. + double error = 0; + std::cout << "Validating results with CPU implementation." << std::endl; + for(unsigned int i = 0; i < size; ++i) + { + double diff = (output_grid[i] - expected_output_grid[i]); + error += diff * diff; + } + error = std::sqrt(error / size); + std::cout << "The root-mean-square error of the difference between the reference and the gpu " + "result is " + << error << std::endl; +} diff --git a/Applications/floyd_warshall/README.md b/Applications/floyd_warshall/README.md index bd1f345cc..60e595ae5 100644 --- a/Applications/floyd_warshall/README.md +++ b/Applications/floyd_warshall/README.md @@ -5,9 +5,9 @@ This example showcases a GPU implementation of the [Floyd-Warshall algorithm](ht In this example, there are `iterations` (consecutive) executions of the algorithm on the same graph. As each execution requires an unmodified graph input, multiple copy operations are required. Hence, the performance of the example can be improved by using _pinned memory_. -Pinned memory is simply a special kind of memory that cannot be paged out the physical memory of a process, meaning that the virtual addresses associated with it are always mapped to physical memory. When copying data from/to the host to/from the GPU, the host source/destination must be pinned memory and, in case it is not, an extra allocation of pinned memory is first performed (copying the data residing in or being copied to the non-pinned host memory) and then the actual copy of the data takes place. +Pinned memory is simply a special kind of memory that cannot be paged out the physical memory of a process, meaning that the virtual addresses associated with it are always mapped to physical memory. When copying data from/to the host to/from the GPU, if host source/destination is not pinned memory the runtime and the operating system has to do ensure that the memory is not swapped out. This usually significantly impact the performance of memory movements. -Therefore, using pinned memory saves around 2x the time needed to copy from/to host memory. In this example, performances is improved by using this type of memory, given that there are `iterations` (consecutive) executions of the algorithm on the same graph. +Therefore, using pinned memory saves significant time needed to copy from/to host memory. In this example, performances is improved by using this type of memory, given that there are `iterations` (consecutive) executions of the algorithm on the same graph. ### Application flow 1. Default values for the number of nodes of the graph and the number of iterations for the algorithm execution are set. @@ -15,8 +15,8 @@ Therefore, using pinned memory saves around 2x the time needed to copy from/to h 3. A number of constants are defined for kernel execution and input/output data size. 4. Host memory is allocated for the distance matrix and initialized with the increasing sequence $1,2,3,\dots$ . These values represent the weights of the edges of the graph. 5. Host memory is allocated for the adjacency matrix and initialized such that the initial path between each pair of vertices $x,y \in V$ ($x \neq y$) is the edge $(x,y)$. -6. Pinned memory is allocated and mapped to device memory. The latter is initialized with the input matrices (distance and adjacency) representing the graph $G$ and the Floyd-Warshall kernel is executed for each node of the graph. -7. The resulting distance and adjacency matrices are copied to the host and pinned memory is freed. +6. Pinned host memory and device memory are allocated. Data is first copied to the pinned host memory and then to the device. Memory is initialized with the input matrices (distance and adjacency) representing the graph $G$ and the Floyd-Warshall kernel is executed for each node of the graph. +7. The resulting distance and adjacency matrices are copied to the host and pinned memory and device memory are freed. 8. The mean time in milliseconds needed for each iteration is printed to standard output. 9. The results obtained are compared with the CPU implementation of the algorithm. The result of the comparison is printed to the standard output. @@ -29,7 +29,8 @@ There are three parameters available: ## Key APIs and Concepts - For this GPU implementation of the Floyd-Warshall algorithm, the main kernel (`floyd_warshall_kernel`) that is launched in a 2-dimensional grid. Each thread in the grid computes the shortest path between two nodes of the graph at a certain step $k$ $\left(0 \leq k < n \right)$. The threads compare the previously computed shortest paths using only the nodes in $V'=\{v_0,v_1,...,v_{k-1}\} \subseteq V$ as intermediate nodes with the paths that include node $v_k$ as an intermediate node, and take the shortest option. Therefore, the kernel is launched $n$ times. -- For improved performance, pinned memory is used to pass the results obtained in each iteration to the next one. With `hipHostMalloc` pinned host memory (accessible by the device) can be allocated, and `hipHostFree` frees it. In this example, host pinned memory is allocated using the `hipHostMallocMapped` flag, which indicates that `hipHostMalloc` must map the allocation into the address space of the current device. The device pointer to such allocated pinned memory is obtained with `hipHostGetDevicePointer`. Beware that an excessive allocation of pinned memory can slow down the host execution, as the program is left with less physical memory available to map the rest of the virtual addresses used. +- For improved performance, pinned memory is used to pass the results obtained in each iteration to the next one. With `hipHostMalloc` pinned host memory (accessible by the device) can be allocated, and `hipHostFree` frees it. In this example, host pinned memory is allocated using the `hipHostMallocMapped` flag, which indicates that `hipHostMalloc` must map the allocation into the address space of the current device. Beware that an excessive allocation of pinned memory can slow down the host execution, as the program is left with less physical memory available to map the rest of the virtual addresses used. +- Device memory is allocated using `hipMalloc` which is later freed using `hipFree` - With `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. - `myKernelName<<<...>>>` queues the kernel execution on the device. All the kernels are launched on the `hipStreamDefault`, meaning that these executions are performed in order. `hipGetLastError` returns the last error produced by any runtime API call, allowing to check if any kernel launch resulted in error. - `hipEventCreate` creates the events used to measure kernel execution time, `hipEventRecord` starts recording an event and `hipEventSynchronize` waits for all the previous work in the stream when the specified event was recorded. With these three functions it can be measured the start and stop times of the kernel, and with `hipEventElapsedTime` the kernel execution time (in milliseconds) can be obtained. @@ -45,14 +46,16 @@ There are three parameters available: #### Host symbols - `__global__` - `hipEventCreate` +- `hipEventDestroy` - `hipEventElapsedTime` - `hipEventRecord` - `hipEventSynchronize` +- `hipFree` - `hipGetLastError` - `hipHostFree` -- `hipHostGetDevicePointer` - `hipHostMalloc` - `hipHostMallocMapped` +- `hipMalloc` - `hipMemcpy` - `hipMemcpyDeviceToHost` - `hipMemcpyHostToDevice` diff --git a/Applications/floyd_warshall/main.hip b/Applications/floyd_warshall/main.hip index 18545e854..34f938db9 100644 --- a/Applications/floyd_warshall/main.hip +++ b/Applications/floyd_warshall/main.hip @@ -135,12 +135,12 @@ int main(int argc, char* argv[]) { std::cout << "Number of nodes must be a positive multiple of block_size (" << std::to_string(block_size) << ")." << std::endl; - exit(0); + return error_exit_code; } if(iterations == 0) { std::cout << "Number of iterations must be at least 1." << std::endl; - exit(0); + return error_exit_code; } // Total number of elements and bytes of the input matrices. @@ -191,26 +191,31 @@ int main(int argc, char* argv[]) HIP_CHECK(hipHostMalloc(&part_adjacency_matrix, size_bytes, hipHostMallocMapped)); HIP_CHECK(hipHostMalloc(&part_next_matrix, size_bytes, hipHostMallocMapped)); - // Get device pointer to pinned host memory allocations for the input matrices. - float *d_adjacency_matrix, *d_next_matrix; - HIP_CHECK( - hipHostGetDevicePointer((void**)&d_adjacency_matrix, part_adjacency_matrix, 0 /*flags*/)); - HIP_CHECK(hipHostGetDevicePointer((void**)&d_next_matrix, part_next_matrix, 0 /*flags*/)); + // Copy memory to pinned memory region + std::copy(adjacency_matrix.begin(), adjacency_matrix.end(), part_adjacency_matrix); + std::copy(next_matrix.begin(), next_matrix.end(), part_next_matrix); + + // Allocate device memory + unsigned int* d_adjacency_matrix; + unsigned int* d_next_matrix; + HIP_CHECK(hipMalloc((void**)&d_adjacency_matrix, size_bytes)); + HIP_CHECK(hipMalloc((void**)&d_next_matrix, size_bytes)); + + // Create events to measure the execution time of the kernels. + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); // Run iterations times the Floyd-Warshall GPU algorithm. for(unsigned int i = 0; i < iterations; ++i) { // Copy input data from host to device memory. HIP_CHECK(hipMemcpy(d_adjacency_matrix, - adjacency_matrix.data(), + part_adjacency_matrix, size_bytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_next_matrix, next_matrix.data(), size_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_next_matrix, part_next_matrix, size_bytes, hipMemcpyHostToDevice)); - // Create events to measure the execution time of the kernels. - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); float kernel_ms{}; // Floyd-Warshall GPU algorithm: launch Floyd-Warshall kernel for each node of the graph. @@ -220,11 +225,10 @@ int main(int argc, char* argv[]) HIP_CHECK(hipEventRecord(start, hipStreamDefault)); // Launch Floyd-Warshall kernel on the default stream. - floyd_warshall_kernel<<>>( - part_adjacency_matrix, - part_next_matrix, - nodes, - k); + floyd_warshall_kernel<<>>(d_adjacency_matrix, + d_next_matrix, + nodes, + k); // Check if the kernel launch was successful. HIP_CHECK(hipGetLastError()); @@ -238,16 +242,23 @@ int main(int argc, char* argv[]) kernel_time += kernel_ms; } } + // Free events used for time measurement + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); // Copy results back to host. HIP_CHECK( hipMemcpy(adjacency_matrix.data(), d_adjacency_matrix, size_bytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipMemcpy(next_matrix.data(), d_next_matrix, size_bytes, hipMemcpyDeviceToHost)); - // Free device memory. + // Free host memory. HIP_CHECK(hipHostFree(part_adjacency_matrix)); HIP_CHECK(hipHostFree(part_next_matrix)); + // Free device memory + HIP_CHECK(hipFree(d_adjacency_matrix)); + HIP_CHECK(hipFree(d_next_matrix)); + // Print the mean time per iteration (in miliseconds) of the algorithm. kernel_time /= iterations; std::cout << "The mean time needed for each iteration has been " << kernel_time << "ms." diff --git a/Applications/histogram/CMakeLists.txt b/Applications/histogram/CMakeLists.txt new file mode 100644 index 000000000..fe0f43eb0 --- /dev/null +++ b/Applications/histogram/CMakeLists.txt @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name applications_histogram) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) + +install(TARGETS ${example_name}) diff --git a/Applications/histogram/Makefile b/Applications/histogram/Makefile new file mode 100644 index 000000000..213f341c5 --- /dev/null +++ b/Applications/histogram/Makefile @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := applications_histogram +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) +ILDFLAGS := +ILDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp + $(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/Applications/histogram/README.md b/Applications/histogram/README.md new file mode 100644 index 000000000..edfb59b5e --- /dev/null +++ b/Applications/histogram/README.md @@ -0,0 +1,58 @@ +# Applications: Histogram Example + +## Description +This program showcases a GPU kernel and its invocation of a histogram computation over a byte (`unsigned char`) array. A histogram constructs a table with the counts of each discrete value. +The diagram below showcases a 4 bin histogram over an 8-element long array: + +![A diagram illustrating the access and write pattern of a histogram operation.](histogram_example.svg) + +The kernel is optimized to reduce bank conflicts. +On GPUs memory is divided into banks and each bank may be accessed in parallel. +When the same bank is accessed twice concurrently, the memory accesses will be executed serially which lowers data throughput. +Since this kernel uses a shared memory with less than 4-byte long elements (`unsigned char`, 1-byte long) bank conflicts can occur. +This is solved by striding over the input such a way that each thread accesses a different memory bank. See the diagram below: + +![A diagram illustrating bank conflicts and solution using striding.](bank_conflict_reduction.svg) + + +### Application flow +1. Define and allocate inputs and outputs on host. +2. Allocate the memory on device and copy the input. +3. Launch the histogram kernel. +4. Copy the results back to host and calculate the final histogram. +5. Free the allocated memory on device. +6. Verify the results on host. + +### Key APIs and concepts +- _Bank conflicts._ Memory is stored across multiple banks. Elements in banks are stored in 4-byte words. Each thread within a wavefront should access different banks to ensure high throughput. +- `__ffs(int input)` finds the 1-index of the first set least significant bit of the input. +- `__syncthreads()` halts this thread until all threads within the same block have reached this point. +- `__shared__` marks memory as shared. All threads within the same block can access this. + +## Demonstrated API calls + +### HIP runtime + +#### Device symbols +- `blockDim` +- `blockIdx` +- `threadIdx` +- `__ffs()` +- `__syncthreads()` +- `__shared__` + +#### Host symbols +- `__global__` +- `hipEvent_t` +- `hipEventCreate` +- `hipEventDestroy` +- `hipEventElapsedTime` +- `hipEventRecord` +- `hipEventSynchronize` +- `hipFree()` +- `hipGetLastError` +- `hipMalloc()` +- `hipMemcpy()` +- `hipMemcpyHostToDevice` +- `hipMemcpyDeviceToHost` +- `myKernel<<<...>>>()` diff --git a/Applications/histogram/bank_conflict_reduction.svg b/Applications/histogram/bank_conflict_reduction.svg new file mode 100644 index 000000000..68786b79e --- /dev/null +++ b/Applications/histogram/bank_conflict_reduction.svg @@ -0,0 +1,4 @@ + + + +
Memory
Memory
Bank
Bank
Wave Front
Wave Front
Threads
Threads
Memory
Memory
Bank
Bank
Wave Front
Wave Front
Threads
Threads
Threads in the same wave front access the same bank multiple times: conflicts.
Threads in the same wave f...
Memory access is strided: wave fronts can access banks in parallel.
Memory access is strided:...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Applications/histogram/histogram_example.svg b/Applications/histogram/histogram_example.svg new file mode 100644 index 000000000..64d795f45 --- /dev/null +++ b/Applications/histogram/histogram_example.svg @@ -0,0 +1,4 @@ + + + +
0
0
3
3
2
2
3
3
0
0
1
1
3
3
1
1
0: 2
0: 2
1: 2
1: 2
2: 1
2: 1
3: 3
3: 3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Applications/histogram/histogram_vs2017.sln b/Applications/histogram/histogram_vs2017.sln new file mode 100644 index 000000000..8311f8ccb --- /dev/null +++ b/Applications/histogram/histogram_vs2017.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 15 +VisualStudioVersion = 15.0.33026.149 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2017", "histogram_vs2017.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {7748E875-C82A-4406-936C-562C9DFFF9C6} + EndGlobalSection +EndGlobal diff --git a/Applications/histogram/histogram_vs2017.vcxproj b/Applications/histogram/histogram_vs2017.vcxproj new file mode 100644 index 000000000..24e3a3748 --- /dev/null +++ b/Applications/histogram/histogram_vs2017.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {D3531843-4D0D-445D-BD8D-2352038D8221} + Win32Proj + histogram_vs2017 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/histogram/histogram_vs2017.vcxproj.filters b/Applications/histogram/histogram_vs2017.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/histogram/histogram_vs2017.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/histogram/histogram_vs2019.sln b/Applications/histogram/histogram_vs2019.sln new file mode 100644 index 000000000..ec0a73321 --- /dev/null +++ b/Applications/histogram/histogram_vs2019.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.33214.272 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2019", "histogram_vs2019.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {7748E875-C82A-4406-936C-562C9DFFF9C6} + EndGlobalSection +EndGlobal diff --git a/Applications/histogram/histogram_vs2019.vcxproj b/Applications/histogram/histogram_vs2019.vcxproj new file mode 100644 index 000000000..f10996129 --- /dev/null +++ b/Applications/histogram/histogram_vs2019.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {D3531843-4D0D-445D-BD8D-2352038D8221} + Win32Proj + histogram_vs2019 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/histogram/histogram_vs2019.vcxproj.filters b/Applications/histogram/histogram_vs2019.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/histogram/histogram_vs2019.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/histogram/histogram_vs2022.sln b/Applications/histogram/histogram_vs2022.sln new file mode 100644 index 000000000..cfd0d618c --- /dev/null +++ b/Applications/histogram/histogram_vs2022.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33213.308 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2022", "histogram_vs2022.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {7748E875-C82A-4406-936C-562C9DFFF9C6} + EndGlobalSection +EndGlobal diff --git a/Applications/histogram/histogram_vs2022.vcxproj b/Applications/histogram/histogram_vs2022.vcxproj new file mode 100644 index 000000000..489b54f66 --- /dev/null +++ b/Applications/histogram/histogram_vs2022.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {D3531843-4D0D-445D-BD8D-2352038D8221} + Win32Proj + histogram_vs2022 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/histogram/histogram_vs2022.vcxproj.filters b/Applications/histogram/histogram_vs2022.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/histogram/histogram_vs2022.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/histogram/main.hip b/Applications/histogram/main.hip new file mode 100644 index 000000000..9fa23a11a --- /dev/null +++ b/Applications/histogram/main.hip @@ -0,0 +1,180 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "example_utils.hpp" +#include + +#include +#include +#include +#include + +/// \brief Calculates the 256-sized bin histogram for a block. +__global__ void + histogram256_block(unsigned char* data, unsigned int* block_bins, const int items_per_thread) +{ + const int thread_id = threadIdx.x; + const int block_id = blockIdx.x; + const int block_size = blockDim.x; + const int bin_size = 256; + + // If thread_bins was an array of unsigned int, thread_bins could be + // clustered by thread to reduce banking conflicts: + // | t0 ... t128 | t0 ... t128 | ... | t0 ... t128 | + // | bin0 | bin1 | ... | bin255 | + // Thread bins is of size: bin_size * block_size. + extern __shared__ unsigned char thread_bins[]; + + // However, we need to use unsigned char to save space, which is smaller + // than 32-bit word unit stored per bank. We can shuffle thread_id such + // that a wave front iterates through thread_bins with a stride of + // 4 elements (32-bits total). Example with 128 threads per block: + // 0b0000_0000_0AAB_BBBBB into ( thread_id) + // 0b0000_0000_0BBB_BBBAA (sh_thread_id) + // sh_thread_id is in the range [0; block_size) + + // If we assume that block_size is a power of two, then we can get the + // length of B by finding the first '1' bit with '__ffs'. + const int b_bits_length = __ffs(block_size) - 3; + const int sh_thread_id + = (thread_id & (1 << b_bits_length) - 1) << 2 | (thread_id >> b_bits_length); + + // Initialize 'thread_bins' to 0 + for(int i = 0; i < bin_size; ++i) + { + thread_bins[i + bin_size * sh_thread_id] = 0; + } + __syncthreads(); + + for(int i = 0; i < items_per_thread; i++) + { + const unsigned int value = data[(block_id * block_size + thread_id) * items_per_thread + i]; + thread_bins[value * block_size + sh_thread_id]++; + } + __syncthreads(); + + // Join the generated 256 bins from 128 threads by letting each thread sum 256 elements from 2 bins. + const int bins_per_thread = bin_size / block_size; + for(int i = 0; i < bins_per_thread; ++i) + { + // bin_sh_id is in the range [0; bin_size) + const int bin_sh_id = i * block_size + sh_thread_id; + + // Accumulate bins. + unsigned int bin_acc = 0; + for(int j = 0; j < block_size; ++j) + { + // Sum the result from the j-th thread from the 'block_size'-sized 'bin_id'th bin. + bin_acc += thread_bins[bin_sh_id * block_size + j]; + } + + block_bins[block_id * bin_size + bin_sh_id] = bin_acc; + } +} + +int main() +{ + // 1. Define inputs + const int size = 1024 * 1024; + const int items_per_thread = 1024; + const int threads_per_block = 128; + + const int bin_size = 256; + const int total_blocks = (size) / (items_per_thread * threads_per_block); + + std::vector h_data(size); + + std::default_random_engine generator; + std::uniform_int_distribution distribution; + + std::generate(h_data.begin(), h_data.end(), [&]() { return distribution(generator); }); + + std::vector h_bins(bin_size); + std::vector h_blockBins(sizeof(unsigned int) * bin_size * total_blocks); + + // 2. Allocate memory on device. + unsigned char* d_data; + unsigned int* d_blockBins; + + // Setup kernel execution time tracking. + float kernel_ms = 0; + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + HIP_CHECK(hipMalloc(&d_blockBins, sizeof(unsigned int) * bin_size * total_blocks)); + HIP_CHECK(hipMalloc(&d_data, sizeof(unsigned char) * size)); + HIP_CHECK( + hipMemcpy(d_data, h_data.data(), sizeof(unsigned char) * size, hipMemcpyHostToDevice)); + + // 3. Launch the histogram kernel + std::cout << "Launching 'histogram256_block' with " << total_blocks << " blocks of size " + << threads_per_block << std::endl; + + HIP_CHECK(hipEventRecord(start)); + + histogram256_block<<>>(d_data, d_blockBins, items_per_thread); + // Check for errors. + HIP_CHECK(hipGetLastError()); + + // Get kernel execution time. + HIP_CHECK(hipEventRecord(stop)); + HIP_CHECK(hipEventSynchronize(stop)); + HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop)); + std::cout << "Kernel took " << kernel_ms << " milliseconds." << std::endl; + + // 4. Copy back to host and calculate final histogram bin. + HIP_CHECK(hipMemcpy(h_blockBins.data(), + d_blockBins, + sizeof(unsigned int) * bin_size * total_blocks, + hipMemcpyDeviceToHost)); + + for(int i = 0; i < total_blocks; ++i) + { + for(int j = 0; j < bin_size; ++j) + { + int count = h_blockBins[i * bin_size + j]; + h_bins[j] += count; + } + } + + // 5. Free device memory. + HIP_CHECK(hipFree(d_blockBins)); + HIP_CHECK(hipFree(d_data)); + HIP_CHECK(hipEventDestroy(start)) + HIP_CHECK(hipEventDestroy(stop)) + + // 6. Verify by calculating on host. + int errors = 0; + std::vector h_verify_bins(bin_size); + for(int i = 0; i < size; ++i) + { + ++h_verify_bins[h_data[i]]; + } + for(int i = 0; i < bin_size; ++i) + { + errors += h_bins[i] != h_verify_bins[i]; + } + return report_validation_result(errors); +} diff --git a/Applications/monte_carlo_pi/.gitignore b/Applications/monte_carlo_pi/.gitignore new file mode 100644 index 000000000..8fcf11976 --- /dev/null +++ b/Applications/monte_carlo_pi/.gitignore @@ -0,0 +1 @@ +applications_monte_carlo_pi diff --git a/Applications/monte_carlo_pi/CMakeLists.txt b/Applications/monte_carlo_pi/CMakeLists.txt new file mode 100644 index 000000000..8ee0fd9a3 --- /dev/null +++ b/Applications/monte_carlo_pi/CMakeLists.txt @@ -0,0 +1,75 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name applications_monte_carlo_pi) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + message(FATAL_ERROR "Only the following values are accepted for GPU_RUNTIME: ${GPU_RUNTIMES}") +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +find_package(hipcub REQUIRED) +find_package(hiprand REQUIRED) +# Workaround for hipRAND, requires manual linking with backend. +if(GPU_RUNTIME STREQUAL "CUDA") + find_package(CUDAToolkit REQUIRED) +else() + find_package(rocrand REQUIRED) +endif() + +add_executable(${example_name} main.hip) +add_test(${example_name} ${example_name}) + +target_link_libraries(${example_name} PRIVATE hip::hipcub hip::hiprand) +# Workaround for hipRAND, requires manual linking with backend. +if(GPU_RUNTIME STREQUAL "CUDA") + target_link_libraries(${example_name} PRIVATE CUDA::curand) +else() + target_link_libraries(${example_name} PRIVATE roc::rocrand) +endif() +target_include_directories(${example_name} PRIVATE "../../Common") +# Workaround for rocRAND since it relies on definition "WIN32". +if(WIN32) + target_compile_definitions(${example_name} PRIVATE WIN32) +endif() +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) + +install(TARGETS ${example_name}) diff --git a/Applications/monte_carlo_pi/Makefile b/Applications/monte_carlo_pi/Makefile new file mode 100644 index 000000000..b1ad303a8 --- /dev/null +++ b/Applications/monte_carlo_pi/Makefile @@ -0,0 +1,69 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := applications_monte_carlo_pi +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +CUDA_INSTALL_DIR := /usr/local/cuda + +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include +HIPCUB_INCLUDE_DIR := $(HIP_INCLUDE_DIR) +HIPRAND_INCLUDE_DIR := $(HIP_INCLUDE_DIR) + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc +CUDACXX ?= $(CUDA_INSTALL_DIR)/bin/nvcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) -isystem $(HIPCUB_INCLUDE_DIR) -isystem $(HIPRAND_INCLUDE_DIR) \ + -I $(COMMON_INCLUDE_DIR) +ILDFLAGS := -L $(ROCM_INSTALL_DIR)/lib +ILDLIBS := -lhiprand + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) -D__HIP_PLATFORM_NVIDIA__ + COMPILER := $(CUDACXX) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra + ICPPFLAGS += -D__HIP_PLATFORM_AMD__ + COMPILER := $(HIPCXX) +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/hiprand_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp + $(COMPILER) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/Applications/monte_carlo_pi/README.md b/Applications/monte_carlo_pi/README.md new file mode 100644 index 000000000..77ade8249 --- /dev/null +++ b/Applications/monte_carlo_pi/README.md @@ -0,0 +1,81 @@ +# Applications Monte Carlo Pi Example + +## Description +This example demonstrates how the mathematical constant pi ($\pi$) can be approximated using Monte Carlo integration. Monte Carlo integration approximates integration of a function by generating random values over a domain that is the superset of the function's domain. Using the ratio between the number of samples in both domains and the range of the random values, the integral is approximated. + +The area of a disk is given by $r^2\pi$, where $r$ is the radius of a disk. Uniform random values are typically generated in the range $(0,1]$. Using a disk of radius $1$ centered on the origin, a sample point is in the disk if it's distance to the origin is less than $1$. The ratio between the number of sample points within the disk and the total sample points is an approximation of the ratio between the area of the disk and the quadrant $(0,1]\times(0,1]$, which is $\frac{\pi}{4}$. Multiplying the sample point ratio by $4$ approximates the value of pi. + +To generate a large number of random samples we use hipRAND, a platform-independent library for GPU-based random number generation. hipRAND offers a choice of different generators, belonging to one of two categories: pseudorandom and quasirandom. Pseudorandom-number generators output a stream of numbers that appears to be statistically random, but is deterministic based on a seed. Quasirandom-number generators output a stream of values that cover the output domain evenly, for each given domain. For Monte Carlo integration, is it assumed that a quasirandom-number generator will provide a better approximation with a low number of points, because the sample points are of a guaranteed statistical distribution. + +To compute the number of sample points that lie within the disk, we use hipCUB, which is a platform-independent library providing GPU primitives. For each sample, we are looking to compute whether it lies in the disk, and to count the number of samples for which this is the case. Using and indicator function and `TransformInputIterator`, an iterator is created which outputs a zero or one for each sample. Using `DeviceReduce::Sum`, the sum over the iterator's values is computed. + +### Application flow +1. Parse and validate user input. +2. Allocate device memory to store the random values. Since the samples are two-dimensional, two random values are + required per sample. +3. Initialize hipRAND's default pseudorandom-number generator and generate the required number of values. +4. Allocate and initialize the input and output for hipCUB's `DeviceReduce::Sum`: + 1. Create a `hipcub::CountingInputIterator` that starts from `0`, which will represent the sample index. + 2. Create a `hipcub::TransformInputIterator` that uses the sample index to obtain the sample's coordinates from the + array of random numbers, and computes whether it lies within the disk. This iterator will be the input for the + device function. + 3. Allocate device memory for the variable that stores the output of the function. +5. Calculate the required amount of temporary storage, and allocate it. +6. Calculate the number of samples within the disk with `hipcub::DeviceReduce::Sum`. +7. Copy the result back to the host and calculate pi. +8. Clean up the generator and print the result. +9. Initialize hipRAND's default quasirandom-number generator, set the dimensions to two, and generate the required + number of values. Note that the first half of the array will be the first dimension, the second half will be the + second dimension. +10. Repeat steps 4. - 8. for the quasirandom values. + +### Command line interface +- `-s ` or `-sample_count ` sets the number of samples used, the default is $2^{20}$. + +## Key APIs and Concepts +- To start using hipRAND, a call to `hiprandCreateGenerator` with a generator type is made. + - To pick any of hipRAND's pseudorandom-number generators, we use type `HIPRAND_RNG_PSEUDO_DEFAULT`. For pseudorandom-number generators, the seed can be set with `hiprandSetPseudoRandomGeneratorSeed`. + - We use type `HIPRAND_RNG_QUASI_DEFAULT` to create a quasirandom-number generator. For quasirandom-number generators, the number of dimensions can be set with `hiprandSetQuasiRandomGeneratorDimensions`. For this example, we calculate an area, so our domain consists of two dimensions. + + Destroying the hipRAND generator is done with `hiprandDestroyGenerator`. +- hipCUB itself requires no initialization, but each of its functions must be called twice. The first call must have a null-valued temporary storage argument, the call sets the required storage size. The second call performs the actual operation with the user-allocated memory. +- hipCUB offers a number of iterators for convenience: + - `hipcub::CountingInputIterator` will act as an incrementing sequence starting from a specified index. + - `hipcub::TransformInputIterator` takes an iterator and applies a user-defined function on it. +- hipCUB's `DeviceReduce::Sum` computes the sum over the input iterator and outputs a single value to the output iterator. + +## Demonstrated API Calls + +### HIP runtime +- `__device__` +- `__forceinline__` +- `__host__` +- `hipError_t` +- `hipEventCreate` +- `hipEventDestroy` +- `hipEventElapsedTime` +- `hipEventRecord` +- `hipEventSynchronize` +- `hipGetErrorString` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` +- `hipStreamDefault` + +### hipRAND +- `HIPRAND_RNG_PSEUDO_DEFAULT` +- `HIPRAND_RNG_QUASI_DEFAULT` +- `HIPRAND_STATUS_SUCCESS` +- `hiprandCreateGenerator` +- `hiprandDestroyGenerator` +- `hiprandGenerateUniform` +- `hiprandGenerator_t` +- `hiprandSetPseudoRandomGeneratorSeed` +- `hiprandSetQuasiRandomGeneratorDimensions` +- `hiprandStatus_t` + +### hipCUB +- `hipcub::CountingInputIterator` +- `hipcub::DeviceReduce::Sum` +- `hipcub::TransformInputIterator` diff --git a/Applications/monte_carlo_pi/main.hip b/Applications/monte_carlo_pi/main.hip new file mode 100644 index 000000000..29004f8bf --- /dev/null +++ b/Applications/monte_carlo_pi/main.hip @@ -0,0 +1,193 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "cmdparser.hpp" +#include "example_utils.hpp" +#include "hiprand_utils.hpp" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/// \brief Given a sample's index, return 1 if the sample, for which both dimensions lie in +/// (0, 1], is contained within the disk centered at the origin with radius 1. Else return 0. +struct conversion_op +{ + /// \brief The number of samples is given by \p s, 2 * s random numbers are given in + /// \p d_d, which first stores \p s x-values followed by \p s y-values. + conversion_op(int s, float* d_d) : sample_count(s), d_data(d_d) {} + + __device__ __host__ __forceinline__ int operator()(const int& a) const + { + float x = d_data[a]; + float y = d_data[sample_count + a]; + // sample points are in (0, 1] + float distance = x * x + y * y; + return static_cast(distance <= 1.f); + } + + int sample_count; + float* d_data; +}; + +/// \brief Given 2 * sample_count random numbers in \p d_data that are in (0, 1], +/// approximate pi with the assumption that the random numbers are uniformly distributed. +/// \p d_data first stores \p sample_count x-values followed by \p sample_count y-values. +float calculate_pi(int sample_count, float* d_data) +{ + // 4. Set up the input and output iterator for hipCUB's Sum. + + // Represents the samples' index. + auto input_counting = hipcub::CountingInputIterator(0); + + // Converts the sample's index to a 0 or 1, indicating whether the sample lies within the disk. + conversion_op convert_op(sample_count, d_data); + auto input = hipcub::TransformInputIterator( + input_counting, + convert_op); + + int* d_output{}; + HIP_CHECK(hipMalloc(&d_output, sizeof(int))); + + // 5. Call hipCUB's Sum to calculate the required memory size, allocate that amount. + void* tmp_storage{}; + std::size_t tmp_storage_size{}; + + HIP_CHECK( + hipcub::DeviceReduce::Sum(tmp_storage, tmp_storage_size, input, d_output, sample_count)); + + HIP_CHECK(hipMalloc(&tmp_storage, tmp_storage_size)); + + // 6. Call hipCUB's Sum to calculate the number of samples within the circle. + HIP_CHECK( + hipcub::DeviceReduce::Sum(tmp_storage, tmp_storage_size, input, d_output, sample_count)); + + // 7. Copy back the result and approximate pi. + int num_items{}; + HIP_CHECK(hipMemcpy(&num_items, d_output, sizeof(int), hipMemcpyDeviceToHost)); + + float pi = 4.f * num_items / sample_count; + + HIP_CHECK(hipFree(tmp_storage)); + + return pi; +} + +/// \brief Prints the time elapsed and the calculated value of pi with an error value. +void print_results(int sample_count, + float pi_calc, + float elapsed_ms, + const std::string& random_kind) +{ + constexpr float pi = 3.14159265358979323846f; // ground truth + + float err = std::abs((pi_calc - pi) / pi * 100.f); + std::cout << "Calculating pi using " << sample_count << " " << std::setw(6) << random_kind + << "random samples: " << std::fixed << pi_calc << " (error: " << err + << "%), which took " << elapsed_ms << " ms." << std::defaultfloat << std::endl; +} + +int main(int argc, char* argv[]) +{ + // 1. Parse user inputs. + cli::Parser parser(argc, argv); + parser.set_optional("s", "sample_count", 1u << 20, "Number of samples."); + parser.run_and_exit_if_error(); + + const int sample_count = parser.get("s"); + if(sample_count <= 0) + { + std::cerr << "Sample count should be greater than 0." << std::endl; + return 0; + } + + // The samples have two dimensions, so two random numbers are required per sample. + const int rng_count = 2 * sample_count; + + // 2. Allocate data, initialize variables. + float* d_data{}; + HIP_CHECK(hipMalloc(&d_data, rng_count * sizeof(float))); + + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + float pi_calc{}; + float elapsed_ms{}; + + HIP_CHECK(hipEventRecord(start, hipStreamDefault)); + { + // 3. Initialize hipRAND's default pseudorandom generator and generate 2 * n samples. + hiprandGenerator_t gen; + HIPRAND_CHECK(hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT)); + HIPRAND_CHECK(hiprandSetPseudoRandomGeneratorSeed(gen, 42)); + + HIPRAND_CHECK(hiprandGenerateUniform(gen, d_data, rng_count)); + + // 4. - 7. + pi_calc = calculate_pi(sample_count, d_data); + + // 8. Clean up the generator and print the result. + HIPRAND_CHECK(hiprandDestroyGenerator(gen)); + } + HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); + HIP_CHECK(hipEventSynchronize(stop)); + HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop)); + print_results(sample_count, pi_calc, elapsed_ms, "pseudo"); + + HIP_CHECK(hipEventRecord(start, hipStreamDefault)); + { + // 9. Initialize hipRAND's default quasirandom generator, set the dimensions to two, + // and generate 2 * n samples. + hiprandGenerator_t gen; + HIPRAND_CHECK(hiprandCreateGenerator(&gen, HIPRAND_RNG_QUASI_DEFAULT)); + HIPRAND_CHECK(hiprandSetQuasiRandomGeneratorDimensions(gen, 2)); + + // The first dimension will be in the first half of the array, the second dimension in the + // second half. + HIPRAND_CHECK(hiprandGenerateUniform(gen, d_data, rng_count)); + + // 4. - 7. + pi_calc = calculate_pi(sample_count, d_data); + + // 8. Clean up the generator and print the result. + HIPRAND_CHECK(hiprandDestroyGenerator(gen)); + } + HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); + HIP_CHECK(hipEventSynchronize(stop)); + HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop)); + print_results(sample_count, pi_calc, elapsed_ms, "quasi"); + + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipEventDestroy(start)); + + HIP_CHECK(hipFree(d_data)); +} diff --git a/Applications/prefix_sum/.gitignore b/Applications/prefix_sum/.gitignore new file mode 100644 index 000000000..0d845478b --- /dev/null +++ b/Applications/prefix_sum/.gitignore @@ -0,0 +1 @@ +applications_prefix_sum diff --git a/Applications/prefix_sum/CMakeLists.txt b/Applications/prefix_sum/CMakeLists.txt new file mode 100644 index 000000000..f75db53d5 --- /dev/null +++ b/Applications/prefix_sum/CMakeLists.txt @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name applications_prefix_sum) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) + +install(TARGETS ${example_name}) diff --git a/Applications/prefix_sum/Makefile b/Applications/prefix_sum/Makefile new file mode 100644 index 000000000..70509e425 --- /dev/null +++ b/Applications/prefix_sum/Makefile @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := applications_prefix_sum +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) +ILDFLAGS := +ILDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp + $(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/Applications/prefix_sum/README.md b/Applications/prefix_sum/README.md new file mode 100644 index 000000000..5ee106d67 --- /dev/null +++ b/Applications/prefix_sum/README.md @@ -0,0 +1,63 @@ +# Applications: Prefix Sum Example + +## Description +This example showcases a GPU implementation of a prefix sum via a scan algorithm. +This example does not use the scan or reduce methods from rocPRIM or hipCUB (`hipcub::DeviceScan::ExclusiveScan`) which could provide improved performance. + +For each element in the input, prefix sum calculates the sum from the beginning up until the item: + +$a_n = \sum^{n}_{m=0} A[m]$ + +The algorithm used has two phases which are repeated: + A) the block wide prefix sum which uses a two pass prefix sum algorithm as described in _Prefix Sums and Their Applications_ (Blelloch, 1988). + B) the device wide prefix sum which propagates values from one block to others. + +Below is an example where the threads per block is 2. +In the first iteration ($\text{offset}=1$) we have 4 threads combining 8 items. + +![](prefix_sum_diagram.svg) + +### Application flow +1. Parse user input. +2. Generate input vector. +3. Calculate the prefix sum. + 1. Define the kernel constants. + 2. Declare and allocate device memory. + 3. Copy the input from host to device + 4. Sweep over the input, multiple times if needed. + 5. Copy the results from device to hsot. + 6. Clean up device memory allocations. +4. Verify the output. + +### Command line interface +The application has an optional argument: +- `-n ` with size of the array to run the prefix sum over. The default value is `256`. + +### Key APIs and concepts +- Device memory is managed with `hipMalloc` and `hipFree`. The former sets the pointer to the allocated space and the latter frees this space. +- `myKernel<<<...>>>()` launches the kernel named `myKernel`. + In this example the kernels `block_prefix_sum` and `device_prefix_sum` are launched. + `block_prefix_sum` requires shared memory which is passed along in the kernel launch. +- `extern __shared__ float[]` in the kernel code denotes an array in shared memory which can be accessed by all threads in the same block. +- `__syncthreads()` blocks this thread until all threads within the current block have reached this point. + This is to ensure no unwanted read-after-write, write-after-write, or write-after-read situations occur. + +## Demonstrated API calls + +### HIP runtime + +#### Device symbols +- `blockDim` +- `blockIdx` +- `threadIdx` +- `__syncthreads()` +- `__shared__` + +#### Host symbols +- `__global__` +- `hipFree()` +- `hipMalloc()` +- `hipMemcpy()` +- `hipMemcpyHostToDevice` +- `hipMemcpyDeviceToHost` +- `myKernel<<<...>>>()` diff --git a/Applications/prefix_sum/main.hip b/Applications/prefix_sum/main.hip new file mode 100644 index 000000000..a60afbe5b --- /dev/null +++ b/Applications/prefix_sum/main.hip @@ -0,0 +1,219 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "cmdparser.hpp" +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +/// \brief Calculates the prefix sum within a block, in place. +__global__ void block_prefix_sum(float* d_data, int size, int offset) +{ + const int thread_id = threadIdx.x; + const int block_id = blockIdx.x; + const int block_size = blockDim.x; + + const int x = (offset * (2 * (block_id * block_size + thread_id) + 1)) - 1; + + // Cache the computational window in shared memory + extern __shared__ float block[]; + if(x < size) + { + block[2 * thread_id] = d_data[x]; + } + if(x + offset < size) + { + block[2 * thread_id + 1] = d_data[x + offset]; + } + + // Build up tree + int tree_offset = 1; + for(int tree_size = size >> 1; tree_size > 0; tree_size >>= 1) + { + __syncthreads(); + if(thread_id < tree_size) + { + int from = tree_offset * (2 * thread_id + 1) - 1; + int to = tree_offset * (2 * thread_id + 2) - 1; + block[to] += block[from]; + } + tree_offset <<= 1; + } + + if(size > 2) + { + if(tree_offset < size) + { + tree_offset <<= 1; + } + + // Build down tree + int max_thread = tree_offset >> 1; + for(int tree_size = 0; tree_size < max_thread; tree_size <<= 1) + { + tree_size += 1; + tree_offset >>= 1; + __syncthreads(); + + if(thread_id < tree_size) + { + int from = tree_offset * (thread_id + 1) - 1; + int to = from + (tree_offset >> 1); + block[to] += block[from]; + } + } + } + __syncthreads(); + + // write the results back to global memory + if(x < size) + { + d_data[x] = block[2 * thread_id]; + } + if(x + offset < size) + { + d_data[x + offset] = block[2 * thread_id + 1]; + } +} + +/// \brief Propogates values of the prefix sum between blocks on a device. +__global__ void device_prefix_sum(float* buffer, int size, int offset) +{ + const int thread_id = threadIdx.x; + const int block_size = blockDim.x; + const int block_id = blockIdx.x; + + const int sorted_blocks = offset / block_size; + const int unsorted_block_id + = block_id + (block_id / ((offset << 1) - sorted_blocks) + 1) * sorted_blocks; + int x = (unsorted_block_id * block_size + thread_id); + if(((x + 1) % offset != 0) && (x < size)) + { + buffer[x] += buffer[x - (x % offset + 1)]; + } +} + +void run_prefix_sum_kernels(float* input, float* output, const int size) +{ + // 4.1 Define kernel constants + constexpr int threads_per_block = 128; + dim3 block_dim(threads_per_block); + + // Each thread works on 2 elements. + constexpr int items_per_block = threads_per_block * 2; + // block_prefix_sum uses shared memory dependent on the amount of threads per block. + constexpr size_t shared_size = sizeof(float) * 2 * threads_per_block; + + // 4.2 Declare and allocate device memory. + float* d_data; + HIP_CHECK(hipMalloc(&d_data, sizeof(float) * size)); + + // 4.3 Copy the inputs from host to device + HIP_CHECK(hipMemcpy(d_data, input, sizeof(float) * size, hipMemcpyHostToDevice)); + + // 4.4 Sweep over the input, multiple times if needed + // Alternatively, use hipcub::DeviceScan::ExclusiveScan + for(int offset = 1; offset < size; offset *= items_per_block) + { + const int data_size = size / offset; + + if(size / offset > 1) + { + unsigned int total_threads = (data_size + 1) / 2; + total_threads + = ((total_threads + threads_per_block - 1) / threads_per_block) * threads_per_block; + dim3 grid_dim(total_threads / threads_per_block); + + block_prefix_sum<<>>(d_data, size, offset); + } + + if(offset > 1) + { + int total_threads = size - offset; + total_threads -= (total_threads / (offset * items_per_block)) * offset; + total_threads + = ((total_threads + threads_per_block - 1) / threads_per_block) * threads_per_block; + dim3 grid_dim(total_threads / threads_per_block); + + device_prefix_sum<<>>(d_data, size, offset); + } + } + + // 4.5 Copy the results from device to host. + HIP_CHECK(hipMemcpy(output, d_data, sizeof(float) * size, hipMemcpyDeviceToHost)); + + // 4.6 Clean up device memory allocations. + HIP_CHECK(hipFree(d_data)); +} + +int main(int argc, char* argv[]) +{ + // 1. Parse user input. + cli::Parser parser(argc, argv); + parser.set_optional("n", "size", 256); + parser.run_and_exit_if_error(); + + const int size = parser.get("n"); + if(size <= 0) + { + std::cout << "Size must be at least 1." << std::endl; + exit(0); + } + + // 2. Generate input vector. + std::cout << "Prefix sum over " << size << " items.\n" << std::endl; + + std::vector input(size); + std::vector output(size); + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1, 1); + + std::generate(input.begin(), input.end(), [&]() { return distribution(generator); }); + + // 3. Run the prefix sum. + run_prefix_sum_kernels(input.data(), output.data(), size); + + // 4. Verify the output. + float verify = 0; + int errors = 0; + for(int i = 0; i < size; i++) + { + verify += input[i]; + errors += std::pow(output[i] - verify, 2) > 1e-8; + } + + std::cout << "Final sum on \n" + << " device: " << output.back() << "\n" + << " host : " << verify << "\n" + << std::endl; + + return report_validation_result(errors); +} diff --git a/Applications/prefix_sum/prefix_sum_diagram.svg b/Applications/prefix_sum/prefix_sum_diagram.svg new file mode 100644 index 000000000..4c55617da --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_diagram.svg @@ -0,0 +1,4 @@ + + + +
1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
3
3
7
7
11
11
15
15
10
10
26
26
3
3
6
6
3
3
11
11
7
7
18
18
10
10
26
26
36
36
10
10
15
15
21
21
28
28
5
5
11
11
18
18
block_prefix_sum
offset 1
block_prefix_sum...
block_prefix_sum
offset 2
block_prefix_sum...
device_prefix_sum
offset 2
device_prefix_sum...
block_prefix_sum
offset 4
block_prefix_sum...
device_prefix_sum
offset 4
device_prefix_sum...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Applications/prefix_sum/prefix_sum_vs2017.sln b/Applications/prefix_sum/prefix_sum_vs2017.sln new file mode 100644 index 000000000..def6981f7 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2017.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 15 +VisualStudioVersion = 15.0.33026.149 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2017", "prefix_sum_vs2017.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {2D7976DA-0CA5-4D6A-ADFE-D99F79CC68C6} + EndGlobalSection +EndGlobal diff --git a/Applications/prefix_sum/prefix_sum_vs2017.vcxproj b/Applications/prefix_sum/prefix_sum_vs2017.vcxproj new file mode 100644 index 000000000..cf4329201 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2017.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {015df085-feb3-4c7a-acee-7cffb3c9aff0} + Win32Proj + prefix_sum_vs2017 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/prefix_sum/prefix_sum_vs2017.vcxproj.filters b/Applications/prefix_sum/prefix_sum_vs2017.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2017.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/prefix_sum/prefix_sum_vs2019.sln b/Applications/prefix_sum/prefix_sum_vs2019.sln new file mode 100644 index 000000000..46ae96070 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2019.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.33214.272 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2019", "prefix_sum_vs2019.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {2D7976DA-0CA5-4D6A-ADFE-D99F79CC68C6} + EndGlobalSection +EndGlobal diff --git a/Applications/prefix_sum/prefix_sum_vs2019.vcxproj b/Applications/prefix_sum/prefix_sum_vs2019.vcxproj new file mode 100644 index 000000000..4c5f17c84 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2019.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {015df085-feb3-4c7a-acee-7cffb3c9aff0} + Win32Proj + prefix_sum_vs2019 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + \ No newline at end of file diff --git a/Applications/prefix_sum/prefix_sum_vs2019.vcxproj.filters b/Applications/prefix_sum/prefix_sum_vs2019.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2019.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Applications/prefix_sum/prefix_sum_vs2022.sln b/Applications/prefix_sum/prefix_sum_vs2022.sln new file mode 100644 index 000000000..c792acc5b --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2022.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33213.308 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2022", "prefix_sum_vs2022.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {2D7976DA-0CA5-4D6A-ADFE-D99F79CC68C6} + EndGlobalSection +EndGlobal diff --git a/Applications/prefix_sum/prefix_sum_vs2022.vcxproj b/Applications/prefix_sum/prefix_sum_vs2022.vcxproj new file mode 100644 index 000000000..2fde7b8aa --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2022.vcxproj @@ -0,0 +1,134 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {BE12D9BE-704A-4697-9D3D-5351A6E30189} + Win32Proj + prefix_sum_vs2022 + $(LatestTargetPlatformVersion) + + + + + + + + + + + Application + true + HIP_clang + Unicode + + + Application + false + HIP_clang + true + Unicode + + + + + + + + + + + + + + + + + true + applications_$(ProjectName) + + + false + applications_$(ProjectName) + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + gfx1030;gfx1100;gfx1101;gfx1102 + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + Level2 + true + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + + diff --git a/Applications/prefix_sum/prefix_sum_vs2022.vcxproj.filters b/Applications/prefix_sum/prefix_sum_vs2022.vcxproj.filters new file mode 100644 index 000000000..441a84073 --- /dev/null +++ b/Applications/prefix_sum/prefix_sum_vs2022.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {3a8ca8cf-77de-46b6-9d10-e874f6ee218e} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {f9b34cbe-ea5e-408a-844b-f390896f7360} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {44ac4fd5-a539-49a5-aa2f-85efa3d2f5fc} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/Common/hiprand_utils.hpp b/Common/hiprand_utils.hpp new file mode 100644 index 000000000..18292d7a8 --- /dev/null +++ b/Common/hiprand_utils.hpp @@ -0,0 +1,47 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef COMMON_HIPRAND_UTILS_HPP +#define COMMON_HIPRAND_UTILS_HPP + +#include "example_utils.hpp" + +#include + +#include + +/// \brief Checks if the provided hipRAND status is \p HIPRAND_STATUS_SUCCESS and if not, prints an +/// error message to the standard error output and terminates the program with an error code. +#define HIPRAND_CHECK(condition) \ + do \ + { \ + const hiprandStatus_t status = condition; \ + if(status != HIPRAND_STATUS_SUCCESS) \ + { \ + std::cerr << "A hipRAND error occurred at " << __FILE__ << ':' << __LINE__ \ + << std::endl; \ + std::exit(error_exit_code); \ + } \ + } \ + while(0) + +#endif // COMMON_HIPRAND_UTILS_HPP diff --git a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile index 8652b64a4..d529c14f4 100644 --- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile +++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile @@ -87,6 +87,16 @@ RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/roc && cmake --build ./hipSOLVER-rocm-5.3.0/build --target install \ && rm -rf ./hipSOLVER-rocm-5.3.0 +# Install hipRAND +RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.3.0.tar.gz \ + && tar -xf ./rocm-5.3.0.tar.gz \ + && rm ./rocm-5.3.0.tar.gz \ + && cmake -S ./hipRAND-rocm-5.3.0 -B ./hipRAND-rocm-5.3.0/build \ + -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ + -D CMAKE_INSTALL_PREFIX=/opt/rocm \ + && cmake --build ./hipRAND-rocm-5.3.0/build --target install \ + && rm -rf ./hipRAND-rocm-5.3.0 + # Use render group as an argument from user ARG GID=109 diff --git a/README.md b/README.md index 02f61f1c8..58f770281 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,12 @@ A collection of examples to enable new users to start using ROCm. Advanced users ## Repository Contents - [Applications](https://github.com/amd/rocm-examples/tree/develop/Applications/) groups a number of examples ... . + - [bitonic_sort](https://github.com/amd/rocm-examples/tree/develop/Applications/bitonic_sort/): Showcases how to order an array of $n$ elements using a GPU implementation of the bitonic sort. + - [convolution](https://github.com/amd/rocm-examples/tree/develop/Applications/convolution/): A simple GPU implementation for the calculation of discrete convolutions. - [floyd_warshall](https://github.com/amd/rocm-examples/tree/develop/Applications/floyd_warshall/): Showcases a GPU implementation of the Floyd-Warshall algorithm for finding shortest paths in certain types of graphs. + - [histogram](https://github.com/amd/rocm-examples/tree/develop/Applications/histogram/): Histogram over a byte array with memory bank optimization. + - [monte_carlo_pi](https://github.com/amd/rocm-examples/tree/develop/Applications/monte_carlo_pi/): Monte Carlo estimation of $\pi$ using hipRAND for random number generation and hipCUB for evaluation. + - [prefix_sum](https://github.com/amd/rocm-examples/tree/develop/Applications/prefix_sum/): Showcases a GPU implementation of a prefix sum with a 2-kernel scan algorithm. - [Common](https://github.com/amd/rocm-examples/tree/develop/Common/) contains common utility functionality shared between the examples. - [HIP-Basic](https://github.com/amd/rocm-examples/tree/develop/HIP-Basic/) hosts self-contained recipes showcasing HIP runtime functionality. - [assembly_to_executable](https://github.com/amd/rocm-examples/tree/develop/HIP-Basic/assembly_to_executable): Program and accompanying build systems that show how to manually compile and link a HIP application from host and device code. diff --git a/ROCm-Examples-Portable-VS2017.sln b/ROCm-Examples-Portable-VS2017.sln index cddfabcc5..4256097cb 100644 --- a/ROCm-Examples-Portable-VS2017.sln +++ b/ROCm-Examples-Portable-VS2017.sln @@ -56,6 +56,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "events_vs2017", "HIP-Basic\ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello_world_vs2017", "HIP-Basic\hello_world\hello_world_vs2017.vcxproj", "{BD725C86-E381-4BDD-B3FC-06A42221BEBB}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2017", "Applications\histogram\histogram_vs2017.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2017", "Applications\prefix_sum\prefix_sum_vs2017.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2017", "Applications\convolution\convolution_vs2017.vcxproj", "{4232B140-4C47-4961-8A8A-F67D14DC9349}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2017", "Applications\bitonic_sort\bitonic_sort_vs2017.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -154,6 +162,22 @@ Global {BD725C86-E381-4BDD-B3FC-06A42221BEBB}.Debug|x64.Build.0 = Debug|x64 {BD725C86-E381-4BDD-B3FC-06A42221BEBB}.Release|x64.ActiveCfg = Release|x64 {BD725C86-E381-4BDD-B3FC-06A42221BEBB}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.ActiveCfg = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.Build.0 = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.ActiveCfg = Release|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -183,6 +207,10 @@ Global {8C3BBAAF-04F6-46F5-96A2-81C047167343} = {8DF2222B-5CDB-44DE-AC5D-D24C6C0B0B49} {AA9E497A-FEDE-45A4-A6CE-A8E3DDDDEB82} = {8DF2222B-5CDB-44DE-AC5D-D24C6C0B0B49} {BD725C86-E381-4BDD-B3FC-06A42221BEBB} = {8DF2222B-5CDB-44DE-AC5D-D24C6C0B0B49} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} + {4232B140-4C47-4961-8A8A-F67D14DC9349} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} + {265F7154-A362-45FA-B300-DB74E14BA010} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {5C96FD63-6F26-4E6F-B6D0-7FB9E1833081} diff --git a/ROCm-Examples-Portable-VS2019.sln b/ROCm-Examples-Portable-VS2019.sln index 16f10d611..fc4965604 100644 --- a/ROCm-Examples-Portable-VS2019.sln +++ b/ROCm-Examples-Portable-VS2019.sln @@ -56,6 +56,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello_world_vs2019", "HIP-B EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "texture_management_vs2019", "HIP-Basic\texture_management\texture_management_vs2019.vcxproj", "{40E56BFB-1A0C-4618-BB49-A9AA635127C1}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2019", "Applications\histogram\histogram_vs2019.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2019", "Applications\prefix_sum\prefix_sum_vs2019.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2019", "Applications\convolution\convolution_vs2019.vcxproj", "{A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2019", "Applications\bitonic_sort\bitonic_sort_vs2019.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -154,6 +162,22 @@ Global {40E56BFB-1A0C-4618-BB49-A9AA635127C1}.Debug|x64.Build.0 = Debug|x64 {40E56BFB-1A0C-4618-BB49-A9AA635127C1}.Release|x64.ActiveCfg = Release|x64 {40E56BFB-1A0C-4618-BB49-A9AA635127C1}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.ActiveCfg = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.Build.0 = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.ActiveCfg = Release|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -183,6 +207,10 @@ Global {547B99C2-CBE3-4E1F-A1D6-26E261D67A3E} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {5E0E9AB0-B708-481F-9226-DD92C3798341} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {40E56BFB-1A0C-4618-BB49-A9AA635127C1} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} + {265F7154-A362-45FA-B300-DB74E14BA010} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {90580497-38BF-428E-A951-6EC6CFC68193} diff --git a/ROCm-Examples-Portable-VS2022.sln b/ROCm-Examples-Portable-VS2022.sln index 51840507d..3d9a7eabe 100644 --- a/ROCm-Examples-Portable-VS2022.sln +++ b/ROCm-Examples-Portable-VS2022.sln @@ -56,6 +56,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libhip_static_host_vs2022", EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello_world_vs2022", "HIP-Basic\hello_world\hello_world_vs2022.vcxproj", "{AA92EF7E-2323-4497-ACCD-B76FB196C545}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2022", "Applications\histogram\histogram_vs2022.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2022", "Applications\prefix_sum\prefix_sum_vs2022.vcxproj", "{BE12D9BE-704A-4697-9D3D-5351A6E30189}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2022", "Applications\convolution\convolution_vs2022.vcxproj", "{E98F33FC-C29B-4229-A853-51C490D74E3E}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2022", "Applications\bitonic_sort\bitonic_sort_vs2022.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -154,6 +162,22 @@ Global {AA92EF7E-2323-4497-ACCD-B76FB196C545}.Debug|x64.Build.0 = Debug|x64 {AA92EF7E-2323-4497-ACCD-B76FB196C545}.Release|x64.ActiveCfg = Release|x64 {AA92EF7E-2323-4497-ACCD-B76FB196C545}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Debug|x64.ActiveCfg = Debug|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Debug|x64.Build.0 = Debug|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Release|x64.ActiveCfg = Release|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Release|x64.Build.0 = Release|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.ActiveCfg = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.Build.0 = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.ActiveCfg = Release|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -183,6 +207,10 @@ Global {DF601563-B579-4571-88C9-3EC7312D567F} = {94393B51-B70E-4111-A22C-6A752D41E454} {5FC8C701-B961-4719-A465-FC0FC84D2EEE} = {94393B51-B70E-4111-A22C-6A752D41E454} {AA92EF7E-2323-4497-ACCD-B76FB196C545} = {94393B51-B70E-4111-A22C-6A752D41E454} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} + {BE12D9BE-704A-4697-9D3D-5351A6E30189} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} + {E98F33FC-C29B-4229-A853-51C490D74E3E} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} + {265F7154-A362-45FA-B300-DB74E14BA010} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {D648FD37-D8CB-4EA5-8445-38BEF36F6736} diff --git a/ROCm-Examples-VS2017.sln b/ROCm-Examples-VS2017.sln index c5ed06f9e..4e540da88 100644 --- a/ROCm-Examples-VS2017.sln +++ b/ROCm-Examples-VS2017.sln @@ -148,10 +148,18 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevj_vs2017", "Libraries\h EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sygvj_vs2017", "Libraries\hipSOLVER\sygvj\sygvj_vs2017.vcxproj", "{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2017", "Applications\prefix_sum\prefix_sum_vs2017.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getf2_vs2017", "Libraries\rocSOLVER\getf2\getf2_vs2017.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169333}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getrf_vs2017", "Libraries\hipSOLVER\getrf\getrf_vs2017.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169323}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2017", "Applications\histogram\histogram_vs2017.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2017", "Applications\convolution\convolution_vs2017.vcxproj", "{4232B140-4C47-4961-8A8A-F67D14DC9349}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2017", "Applications\bitonic_sort\bitonic_sort_vs2017.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevdx_vs2017", "Libraries\hipSOLVER\syevdx\syevdx_vs2017.vcxproj", "{90387A34-8095-4343-8AA5-F0F350EAC462}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syev_vs2017", "Libraries\rocSOLVER\syev\syev_vs2017.vcxproj", "{8F15AAA6-12F8-44A9-AFA1-752F263B094F}" @@ -440,6 +448,10 @@ Global {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Debug|x64.Build.0 = Debug|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.ActiveCfg = Release|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.Build.0 = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169333}.Debug|x64.ActiveCfg = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169333}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169333}.Release|x64.ActiveCfg = Release|x64 @@ -448,6 +460,18 @@ Global {D1C40C14-2881-43C7-8DAD-72BAAB169323}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169323}.Release|x64.ActiveCfg = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169323}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.ActiveCfg = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Debug|x64.Build.0 = Debug|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.ActiveCfg = Release|x64 + {4232B140-4C47-4961-8A8A-F67D14DC9349}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 {90387A34-8095-4343-8AA5-F0F350EAC462}.Debug|x64.ActiveCfg = Debug|x64 {90387A34-8095-4343-8AA5-F0F350EAC462}.Debug|x64.Build.0 = Debug|x64 {90387A34-8095-4343-8AA5-F0F350EAC462}.Release|x64.ActiveCfg = Release|x64 @@ -599,8 +623,12 @@ Global {88F5329E-8AEB-4CA0-BA95-59BA4DE42477} = {2700C908-113C-4429-A889-DF34D44AB29B} {7139C801-489D-464A-82DC-3ABDB706C7A2} = {2700C908-113C-4429-A889-DF34D44AB29B} {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943} = {2700C908-113C-4429-A889-DF34D44AB29B} + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} {D1C40C14-2881-43C7-8DAD-72BAAB169333} = {2CD1AF85-3AEE-4002-AF14-69D50BA39DA7} {D1C40C14-2881-43C7-8DAD-72BAAB169323} = {2700C908-113C-4429-A889-DF34D44AB29B} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} + {4232B140-4C47-4961-8A8A-F67D14DC9349} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} + {265F7154-A362-45FA-B300-DB74E14BA010} = {0328C27A-BB25-46F6-89F7-4EEF7AC225D8} {90387A34-8095-4343-8AA5-F0F350EAC462} = {2700C908-113C-4429-A889-DF34D44AB29B} {8F15AAA6-12F8-44A9-AFA1-752F263B094F} = {2CD1AF85-3AEE-4002-AF14-69D50BA39DA7} {0C4830AF-B13C-4880-B556-C5AAC1A5897F} = {2CD1AF85-3AEE-4002-AF14-69D50BA39DA7} diff --git a/ROCm-Examples-VS2019.sln b/ROCm-Examples-VS2019.sln index a78b1ac71..9cf540ad8 100644 --- a/ROCm-Examples-VS2019.sln +++ b/ROCm-Examples-VS2019.sln @@ -148,10 +148,18 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevj_vs2019", "Libraries\h EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sygvj_vs2019", "Libraries\hipSOLVER\sygvj\sygvj_vs2019.vcxproj", "{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2019", "Applications\prefix_sum\prefix_sum_vs2019.vcxproj", "{015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getf2_vs2019", "Libraries\rocSOLVER\getf2\getf2_vs2019.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169334}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getrf_vs2019", "Libraries\hipSOLVER\getrf\getrf_vs2019.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169324}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2019", "Applications\histogram\histogram_vs2019.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2019", "Applications\convolution\convolution_vs2019.vcxproj", "{A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2019", "Applications\bitonic_sort\bitonic_sort_vs2019.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevdx_vs2019", "Libraries\hipSOLVER\syevdx\syevdx_vs2019.vcxproj", "{729914AA-2062-4B79-930E-630C6C3A60C7}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syev_vs2019", "Libraries\rocSOLVER\syev\syev_vs2019.vcxproj", "{EA84A9DF-D7EE-4E10-8DE5-0E411C2AC0A3}" @@ -440,6 +448,10 @@ Global {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Debug|x64.Build.0 = Debug|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.ActiveCfg = Release|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.Build.0 = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.ActiveCfg = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Debug|x64.Build.0 = Debug|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.ActiveCfg = Release|x64 + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0}.Release|x64.Build.0 = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169334}.Debug|x64.ActiveCfg = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169334}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169334}.Release|x64.ActiveCfg = Release|x64 @@ -448,6 +460,18 @@ Global {D1C40C14-2881-43C7-8DAD-72BAAB169324}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169324}.Release|x64.ActiveCfg = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169324}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.ActiveCfg = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Debug|x64.Build.0 = Debug|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.ActiveCfg = Release|x64 + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 {729914AA-2062-4B79-930E-630C6C3A60C7}.Debug|x64.ActiveCfg = Debug|x64 {729914AA-2062-4B79-930E-630C6C3A60C7}.Debug|x64.Build.0 = Debug|x64 {729914AA-2062-4B79-930E-630C6C3A60C7}.Release|x64.ActiveCfg = Release|x64 @@ -598,8 +622,12 @@ Global {1209C293-D1F0-4BFC-B6D0-65A96B801135} = {2700C908-113C-4429-A889-DF34D44AB29B} {7139C801-489D-464A-82DC-3ABDB706C7A2} = {2700C908-113C-4429-A889-DF34D44AB29B} {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943} = {2700C908-113C-4429-A889-DF34D44AB29B} + {015DF085-FEB3-4C7A-ACEE-7CFFB3C9AFF0} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} {D1C40C14-2881-43C7-8DAD-72BAAB169334} = {B03B9E85-3FED-4902-9B24-433CF352AB6C} {D1C40C14-2881-43C7-8DAD-72BAAB169324} = {2700C908-113C-4429-A889-DF34D44AB29B} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} + {A557F6A4-C0FD-4D65-B1BD-A201FABAEF7F} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} + {265F7154-A362-45FA-B300-DB74E14BA010} = {9254BAD9-FDFC-4645-B2C8-EEB42F1F069D} {729914AA-2062-4B79-930E-630C6C3A60C7} = {2700C908-113C-4429-A889-DF34D44AB29B} {EA84A9DF-D7EE-4E10-8DE5-0E411C2AC0A3} = {B03B9E85-3FED-4902-9B24-433CF352AB6C} {C11381F8-089B-462C-8544-932666818546} = {B03B9E85-3FED-4902-9B24-433CF352AB6C} diff --git a/ROCm-Examples-VS2022.sln b/ROCm-Examples-VS2022.sln index fc7ce0808..bfe9dfb3d 100644 --- a/ROCm-Examples-VS2022.sln +++ b/ROCm-Examples-VS2022.sln @@ -148,10 +148,18 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevj_vs2022", "Libraries\h EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sygvj_vs2022", "Libraries\hipSOLVER\sygvj\sygvj_vs2022.vcxproj", "{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "prefix_sum_vs2022", "Applications\prefix_sum\prefix_sum_vs2022.vcxproj", "{BE12D9BE-704A-4697-9D3D-5351A6E30189}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getf2_vs2022", "Libraries\rocSOLVER\getf2\getf2_vs2022.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169335}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "getrf_vs2022", "Libraries\hipSOLVER\getrf\getrf_vs2022.vcxproj", "{D1C40C14-2881-43C7-8DAD-72BAAB169325}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "histogram_vs2022", "Applications\histogram\histogram_vs2022.vcxproj", "{D3531843-4D0D-445D-BD8D-2352038D8221}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "convolution_vs2022", "Applications\convolution\convolution_vs2022.vcxproj", "{E98F33FC-C29B-4229-A853-51C490D74E3E}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic_sort_vs2022", "Applications\bitonic_sort\bitonic_sort_vs2022.vcxproj", "{265F7154-A362-45FA-B300-DB74E14BA010}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syevdx_vs2022", "Libraries\hipSOLVER\syevdx\syevdx_vs2022.vcxproj", "{DB1441F5-295D-4A82-BB70-122212A26A09}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "syev_vs2022", "Libraries\rocSOLVER\syev\syev_vs2022.vcxproj", "{DCA81AEF-6607-48B5-90E7-8699A5ACAF74}" @@ -440,6 +448,10 @@ Global {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Debug|x64.Build.0 = Debug|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.ActiveCfg = Release|x64 {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943}.Release|x64.Build.0 = Release|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Debug|x64.ActiveCfg = Debug|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Debug|x64.Build.0 = Debug|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Release|x64.ActiveCfg = Release|x64 + {BE12D9BE-704A-4697-9D3D-5351A6E30189}.Release|x64.Build.0 = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169335}.Debug|x64.ActiveCfg = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169335}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169335}.Release|x64.ActiveCfg = Release|x64 @@ -448,6 +460,18 @@ Global {D1C40C14-2881-43C7-8DAD-72BAAB169325}.Debug|x64.Build.0 = Debug|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169325}.Release|x64.ActiveCfg = Release|x64 {D1C40C14-2881-43C7-8DAD-72BAAB169325}.Release|x64.Build.0 = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.ActiveCfg = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Debug|x64.Build.0 = Debug|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.ActiveCfg = Release|x64 + {D3531843-4D0D-445D-BD8D-2352038D8221}.Release|x64.Build.0 = Release|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.ActiveCfg = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Debug|x64.Build.0 = Debug|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.ActiveCfg = Release|x64 + {E98F33FC-C29B-4229-A853-51C490D74E3E}.Release|x64.Build.0 = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.ActiveCfg = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Debug|x64.Build.0 = Debug|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.ActiveCfg = Release|x64 + {265F7154-A362-45FA-B300-DB74E14BA010}.Release|x64.Build.0 = Release|x64 {DB1441F5-295D-4A82-BB70-122212A26A09}.Debug|x64.ActiveCfg = Debug|x64 {DB1441F5-295D-4A82-BB70-122212A26A09}.Debug|x64.Build.0 = Debug|x64 {DB1441F5-295D-4A82-BB70-122212A26A09}.Release|x64.ActiveCfg = Release|x64 @@ -598,7 +622,12 @@ Global {C2804CFB-1D49-4E39-9757-3901F50CF3AA} = {2700C908-113C-4429-A889-DF34D44AB29B} {7139C801-489D-464A-82DC-3ABDB706C7A2} = {2700C908-113C-4429-A889-DF34D44AB29B} {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC943} = {2700C908-113C-4429-A889-DF34D44AB29B} + {BE12D9BE-704A-4697-9D3D-5351A6E30189} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} {D1C40C14-2881-43C7-8DAD-72BAAB169335} = {594C0813-02D5-4F93-A4D6-E10100A0539F} + {D1C40C14-2881-43C7-8DAD-72BAAB169325} = {2700C908-113C-4429-A889-DF34D44AB29B} + {D3531843-4D0D-445D-BD8D-2352038D8221} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} + {E98F33FC-C29B-4229-A853-51C490D74E3E} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} + {265F7154-A362-45FA-B300-DB74E14BA010} = {C735FFA9-12E1-4BEF-87B2-8891A3006505} {DB1441F5-295D-4A82-BB70-122212A26A09} = {2700C908-113C-4429-A889-DF34D44AB29B} {DCA81AEF-6607-48B5-90E7-8699A5ACAF74} = {594C0813-02D5-4F93-A4D6-E10100A0539F} {A08FB6DB-31F7-48B7-8561-59B16E311F60} = {594C0813-02D5-4F93-A4D6-E10100A0539F}