From 95687ef285ba62efa911ce628110103d5416a832 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches <61422851+Beanavil@users.noreply.github.com>
Date: Mon, 29 Apr 2024 16:40:21 +0200
Subject: [PATCH] Develop Stream 2024-03-21 general fixes (part I) (#97)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* bump the required cmake version to 3.21.3

* Fix device_globals example name

* Fix hip_streams timeout on AMD windows debug build type

* Update templates

* Update cuda container to ROCm 5.4

* Change std::bind into lambda

* HIP 5.5 fixes

* fix tests not being executed

* Make the reference to the identity and transpose op uniform

* Fix NVCC CI

* Resolve "Increase timeout for CI"

* Update fixed size arrays to C++ standards

* Add missing include in hip_texture_management

* Remove void** cast from hipMalloc

* Fix hip-libraries-cuda-ubuntu Dockerfile

* Make the windows builds less verbose

* Rework Windows CI

* Skip failing rocsparse tests

* Fix cooperative groups example

* ci: Make skipped examples more prominent in windows VS test runner

* Enable rocsparse examples in CI

* Update .gitlab/issue_templates/example.md

Fix small typo

---------

Co-authored-by: Balint Soproni <balint@streamhpc.com>
Co-authored-by: Robin Voetter <robin@streamhpc.com>
Co-authored-by: Nara Prasetya <nara@streamhpc.com>
Co-authored-by: Nol Moonen <nol@streamhpc.com>
Co-authored-by: Mátyás Aradi <matyas@streamhpc.com>
Co-authored-by: Gergely Mészáros <gergely@streamhpc.com>
Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
---
 .gitlab-ci.yml                                | 343 +++++++++---------
 .gitlab/issue_templates/example.md            |  20 +-
 .gitlab/merge_request_templates/example.md    |  16 +
 Applications/floyd_warshall/main.hip          |   4 +-
 CMakeLists.txt                                |   4 +-
 Common/example_utils.hpp                      |  14 +-
 .../hip-libraries-cuda-ubuntu.Dockerfile      |  75 ++--
 HIP-Basic/cooperative_groups/CMakeLists.txt   |   5 +
 HIP-Basic/device_globals/CMakeLists.txt       |   2 +-
 HIP-Basic/device_query/main.cpp               |   6 +-
 HIP-Basic/occupancy/main.hip                  |   4 +-
 HIP-Basic/texture_management/main.hip         |   5 +-
 .../hipBLAS/gemm_strided_batched/README.md    |  35 +-
 .../hipBLAS/gemm_strided_batched/main.hip     |  10 +-
 Libraries/hipCUB/device_radix_sort/main.hip   |   8 +-
 Libraries/hipCUB/device_sum/main.hip          |   6 +-
 Libraries/hipSOLVER/syevj/main.cpp            |   2 +-
 Libraries/rocBLAS/level_3/gemm/README.md      |  28 +-
 Libraries/rocBLAS/level_3/gemm/main.cpp       |  10 +-
 .../level_3/gemm_strided_batched/README.md    |  27 +-
 .../level_3/gemm_strided_batched/main.cpp     |  10 +-
 Libraries/rocPRIM/block_sum/main.hip          |   4 +-
 Libraries/rocPRIM/device_sum/main.hip         |   6 +-
 Libraries/rocThrust/device_ptr/main.hip       |   6 +-
 Libraries/rocThrust/norm/main.hip             |   6 +-
 Libraries/rocThrust/reduce_sum/main.hip       |   6 +-
 Libraries/rocThrust/remove_points/main.hip    |   6 +-
 Libraries/rocThrust/saxpy/main.hip            |   6 +-
 Libraries/rocThrust/vectors/main.hip          |   6 +-
 Scripts/WindowsRunner.ps1                     | 108 ++++++
 30 files changed, 463 insertions(+), 325 deletions(-)
 create mode 100644 .gitlab/merge_request_templates/example.md
 create mode 100644 Scripts/WindowsRunner.ps1

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 475c1f685..80d038fee 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,8 +31,10 @@ include:
 
 variables:
   CUDA_FLAGS: "-Xcompiler -Wall,-Wextra,-Werror --Werror all-warnings"
-  CXX_FLAGS: "-Wall -Wextra -Werror"
-  HIP_FLAGS: "-Wall -Wextra -Werror"
+  # We require '-Wno-unused-command-line-argument' due to the following warning:
+  #   argument unused during compilation: '--rtlib=compiler-rt'
+  CXX_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror"
+  HIP_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror"
 
 stages:
   - lint
@@ -56,7 +58,7 @@ clang-format:
     - Scripts/CodeFormat/check_format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT"
 
 .build:dockerfiles:
-  timeout: 20m
+  timeout: 60m
   image:
     name: gcr.io/kaniko-project/executor:debug
     entrypoint: [""]
@@ -94,6 +96,10 @@ build:cuda-ubuntu-dockerfile:
   variables:
     TAG: cuda-ubuntu
 
+########################
+# Ubuntu make          #
+########################
+
 build:make-rocm:
   image: $DOCKER_TAG_PREFIX:rocm-ubuntu
   stage: build
@@ -116,6 +122,10 @@ build:make-cuda:
   script:
     - cd $CI_PROJECT_DIR && make CXXFLAGS="$CUDA_FLAGS" GPU_RUNTIME=CUDA -j $(nproc)
 
+########################
+# Ubuntu cmake         #
+########################
+
 .build:cmake:
   stage: build
   extends:
@@ -150,28 +160,32 @@ build:cmake-rocm:
     - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
 
 build:cmake-cuda:
-  image: $DOCKER_TAG_PREFIX:cuda-ubuntu
-  extends:
-    - .build:cmake
-  tags:
-    - nvcc-build
-  script:
-    - cmake
-      -S $CI_PROJECT_DIR
-      -B $CI_PROJECT_DIR/build
-      -D GPU_RUNTIME=CUDA
-      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-      -D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
-      -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-      2>&1 | tee cmake_log.txt
-    # check if all dependencies were found
-    - |-
-      if grep -qi "could not find" cmake_log.txt; then
-          echo "Some CMake libraries could not be found"
-          exit 1
-      fi
-    - cmake --build $CI_PROJECT_DIR/build
-    - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
+ image: $DOCKER_TAG_PREFIX:cuda-ubuntu
+ extends:
+   - .build:cmake
+ tags:
+   - nvcc-build
+ script:
+   - cmake
+     -S $CI_PROJECT_DIR
+     -B $CI_PROJECT_DIR/build
+     -D GPU_RUNTIME=CUDA
+     -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
+     -D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
+     -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake
+     2>&1 | tee cmake_log.txt
+   # check if all dependencies were found
+   - |-
+     if grep -qi "could not find" cmake_log.txt; then
+         echo "Some CMake libraries could not be found"
+         exit 1
+     fi
+   - cmake --build $CI_PROJECT_DIR/build
+   - cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
+
+########################
+# Ubuntu Tests         #
+########################
 
 .test:
   stage: test
@@ -196,122 +210,104 @@ test:cuda:
   needs:
     - build:cmake-cuda
 
-.test:windows:
-  extends:
-    - .rules:test
-  stage: test
-  needs: []
-  parallel:
-    matrix:
-      - BUILD_TYPE: [Debug, Release]
+########################
+# Windows VisualStudio #
+########################
 
-.test:rocm-windows:
-  extends:
-    - .test:windows
+.test:windows-rocm:
   tags:
     - windows
     - shell
     - rx6900
 
-.test:windows-vs:
-  script:
-   # MSBuild cannot properly resolve the `<Content Include=` dependencies, and will sometimes try to copy
-   # two or more files at once. This results in a warning before it retries, which is counted towards
-   # /warnAsError by default. For this reason, we disable the relevant warning (MSB3026).
-    - >
-      & $MSBUILD
-      /maxCpuCount
-      "/p:Configuration=$BUILD_TYPE"
-      /warnAsError
-      /warnAsMessage:MSB3026
-      $MSBUILD_EXTRA_OPTIONS
-      "$CI_PROJECT_DIR/$SOLUTION"
+.test:windows-nvcc:
+  tags:
+    - nvcc-windows
 
-test:rocm-windows-vs2019:
+.test:windows-vs:
+  stage: test
+  timeout: 30m
   extends:
-    - .test:rocm-windows
-    - .test:windows-vs
+    - .rules:test
+  parallel:
+    matrix:
+      - VS_VERSION:
+          - 2017
+          - 2019
+          - 2022
+        BUILD_TYPE:
+          - Debug
+          - Release
   variables:
-    MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-VS2019.sln"
+    Timeout: 30
+    Filter: "*_vs$VS_VERSION.exe"
   script:
-    - !reference [".test:windows-vs", script]
-    - |-
-      $SkippedExamples = @(
-          "hip_vulkan_interop_vs2019.exe"     # Graphical
-          "hip_texture_management_vs2019.exe" # Hangs sometimes
-          "hip_hello_world_vs2019.exe"        # Crashes (known driver issue)
+    - | # Find MSBuild.exe of the associated version.
+      $MSBUILD = (
+        & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -find MSBuild\**\Bin\MSBuild.exe
+        | Select-String -Pattern $VS_VERSION
+      )[0]
+      Write-Output ("MSBuild: $MSBUILD" -f $MSBUILD)
+    - | # Fixes error MSB8036: The Windows SDK version 8.1 was not found
+      if ($VS_VERSION -eq 2017) {
+        $MSBUILD_EXTRA_OPTIONS = "/p:WindowsTargetPlatformVersion=10.0.20348.0"
+      }
+    - | # Build!
+      & $MSBUILD @(
+        "/clp:Summary;ShowEventId;ShowTimestamp"
+        "/p:Configuration=$BUILD_TYPE"
+        "/p:Verbose=false"
+        "/maxCpuCount:8"
+        "/p:CL_MPCount=8"
+        "/verbosity:minimal"
+        "/validate"
+        "/warnAsError"
+        # MSBuild cannot properly resolve the `<Content Include=` dependencies, and will sometimes try to copy
+        # two or more files at once. This results in a warning before it retries, which is counted towards
+        # /warnAsError by default. For this reason, we disable the relevant warning (MSB3026).
+        "/warnAsMessage:MSB3026"
+        "/t:build"
+        $MSBUILD_EXTRA_OPTIONS
+        "$CI_PROJECT_DIR\$SOLUTION_PREFIX$VS_VERSION.sln"
       )
-      Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
-      ForEach-Object {
-          if ($SkippedExamples -NotContains $_.Name) {
-              echo "--" $_.Name
-              & "$_"
-              if (!$?) {
-                throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
-              }
-          } else {
-              echo "-- SKIPPING " $_.Name
-          }
+    - | # Use external script to test examples
+      if (!$SKIP_TESTS) {
+        & $CI_PROJECT_DIR\Scripts\WindowsRunner.ps1 $CI_PROJECT_DIR\$BUILD_TYPE $Filter $Timeout $("$SkippedExamples".split(','))
+      } else {
+        Write-Output "Tests skipped!"
       }
 
-test:rocm-windows-vs2017:
+test:windows-rocm-vs:
   extends:
-    - .test:rocm-windows
+    - .test:windows-rocm
     - .test:windows-vs
+  tags:
+    - windows
+    - shell
+    - rx6900
   variables:
-    MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/MSBuild/15.0/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-VS2017.sln"
-    # See https://developercommunity.visualstudio.com/t/windowstargetplatformversion-makes-it-impossible-t/140294
-    MSBUILD_EXTRA_OPTIONS: "/p:WindowsTargetPlatformVersion=10.0.20348.0"
+    SOLUTION_PREFIX: ROCm-Examples-VS
+    # hip_vulkan_interop: graphical
+    # hip_texture_management: does not work
+    # rocsparse_*: broken with new SDK
+    SkippedExamples: >
+      hip_vulkan_interop_*.exe,
+      hip_texture_management_*.exe,
 
-test:rocm-windows-vs2022:
+test:windows-nvcc-vs:
   extends:
-    - .test:rocm-windows
+    - .test:windows-nvcc
     - .test:windows-vs
-  variables:
-    MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-VS2022.sln"
-
-test:rocm-windows-cmake:
-  extends:
-    - .test:rocm-windows
-  script:
-    - Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-    - Enter-VsDevShell -InstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo'
-    - cmake
-      -S "$CI_PROJECT_DIR"
-      -B "$CI_PROJECT_DIR/build"
-      -G Ninja
-      -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-      -D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-      -D CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe"
-      -D CMAKE_HIP_FLAGS="-fuse-ld=lld"
-      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-      -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-      -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
-      -D CMAKE_HIP_FLAGS="$HIP_FLAGS"
-      -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-      -D CMAKE_HIP_ARCHITECTURES=gfx1030
-      -D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake"
-      2>&1 | Tee-Object -filepath cmake_log.txt
-    - |-
-      if (Select-String -Path cmake_log.txt -Pattern "could not find") {
-          throw "Some cmake libraries are missing"
-      }
-    - cmake --build "$CI_PROJECT_DIR/build"
-    # CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it.
-    # So for now, just add the library path here.
-    - $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH
-    - cd "$CI_PROJECT_DIR/build" && ctest --output-on-failure --timeout 10
-    - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
-
-.test:nvcc-windows:
-  extends:
-    - .test:windows
   tags:
     - nvcc-windows
+  variables:
+    SOLUTION_PREFIX: ROCm-Examples-Portable-VS
+    # hip_runtime_compilation: fails on VS2017
+    SkippedExamples: >
+      hip_runtime_compilation_vs2017.exe
   before_script:
+    - | # Release builds are currently broken!
+      $SKIP_TESTS = ($BUILD_TYPE -eq "Release")
     # To test for NVIDIA, we need to set the platform toolset to HIP_nvcc. This cannot be done with /p:PlatformToolset
     # though, as some examples use the regular msvc toolchain.
     - |
@@ -321,76 +317,75 @@ test:rocm-windows-cmake:
           Set-Content $f
       }
 
-test:nvcc-windows-vs2019:
+########################
+# Windows cmake        #
+########################
+
+.test:windows-cmake:
   extends:
-    - .test:nvcc-windows
-    - .test:windows-vs
+    - .rules:test
   variables:
-    MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-Portable-VS2019.sln"
+    VS_VERSION: 2022
+    BUILD_TYPE: Release
+  before_script:
+    - | # Find VS installation
+      $VS_PATH = (
+        & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -property InstallationPath
+        | Select-String -Pattern $VS_VERSION
+      )[0]
+    - | # Find DevShell.dll
+      $VS_DEV_SHELL = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -path "$VS_PATH" -find "**\Tools\Microsoft.VisualStudio.DevShell.dll"
+    - Import-Module "$VS_DEV_SHELL"
+    - Enter-VsDevShell -InstallPath "$VS_PATH" -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo'
   script:
-    - !reference [".test:windows-vs", script]
-    - |-
-      $SkippedExamples = @(
-          "hip_vulkan_interop_vs2019.exe" # Graphical
-          "hip_opengl_interop_vs2019.exe" # Graphical
-      )
-      Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
-      ForEach-Object {
-          if ($SkippedExamples -NotContains $_.Name) {
-              echo "--" $_.Name
-              & "$CI_PROJECT_DIR/$BUILD_TYPE/$_"
-              if (!$?) {
-                throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
-              }
-          } else {
-              echo "-- SKIPPING " $_.Name
-          }
+    - | # Ensure no libraries are missing during compilation!
+      if (Select-String -Path cmake_log.txt -Pattern "could not find") {
+          throw "Some cmake libraries are missing"
       }
+    - cmake --build "$CI_PROJECT_DIR/build"
+    # CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it.
+    # So for now, just add the library path here.
+    - $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH
+    - cd "$CI_PROJECT_DIR/build"
+    - ctest --output-on-failure --timeout 15 -E "rocsparse_bsrsv|rocsparse_csrsv|rocsparse_spsv|rocsparse_bsrsm|rocsparse_csrsm|rocsparse_bsric0|rocsparse_bsrilu0|rocsparse_csric0|rocsparse_csrilu0"
+    - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
+  needs: []
 
-test:nvcc-windows-vs2017:
-  extends:
-    - .test:nvcc-windows
-    - .test:windows-vs
-  variables:
-    MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/MSBuild/15.0/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-Portable-VS2017.sln"
-    # See https://developercommunity.visualstudio.com/t/windowstargetplatformversion-makes-it-impossible-t/140294
-    MSBUILD_EXTRA_OPTIONS: "/p:WindowsTargetPlatformVersion=10.0.20348.0"
-
-test:nvcc-windows-vs2022:
+test:windows-rocm-cmake:
   extends:
-    - .test:nvcc-windows
-    - .test:windows-vs
-  variables:
-    MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe"
-    SOLUTION: "ROCm-Examples-Portable-VS2022.sln"
+    - .test:windows-rocm
+    - .test:windows-cmake
+  script:
+    - cmake
+      -S "$CI_PROJECT_DIR"
+      -B "$CI_PROJECT_DIR/build"
+      -G Ninja
+      -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
+      -D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
+      -D CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe"
+      -D CMAKE_CXX_FLAGS="$CXX_FLAGS"
+      -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
+      -D CMAKE_HIP_FLAGS="$HIP_FLAGS"
+      -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
+      -D CMAKE_HIP_ARCHITECTURES=gfx1030
+      -D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake"
+      2>&1 | Tee-Object -filepath cmake_log.txt
+    - !reference [.test:windows-cmake, script]
 
-test:nvcc-windows-cmake:
+test:windows-nvcc-cmake:
   extends:
-    - .test:nvcc-windows
+    - .test:windows-nvcc
+    - .test:windows-cmake
   script:
-    # Import the VisualStudio 2022 development environment
-    - |-
-      $vs = &"C:/Program Files (x86)/Microsoft Visual Studio/Installer/vswhere.exe" -version 17.0 -property InstallationPath
-      Import-Module (Join-Path $vs "Common7/Tools/Microsoft.VisualStudio.DevShell.dll")
-      Enter-VsDevShell -VsInstallPath $vs -SkipAutomaticLocation -DevCmdArguments "/arch=x64 /host_arch=x64 /no_logo"
     # Note: The current version of the HIP SDK does not ship with CMake config files for Nvidia, so we can only test
     # the HIP-Basic and Applications examples. It is expected that some dependencies will not be found for this.
     - cmake
       -S "$CI_PROJECT_DIR"
       -B "$CI_PROJECT_DIR/build"
       -G Ninja
-      -D CMAKE_CXX_COMPILER="cl.exe"
-      -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
       -D CMAKE_TOOLCHAIN_FILE="C:/Tools/Microsoft/vcpkg/scripts/buildsystems/vcpkg.cmake"
+      -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
+      -D CMAKE_CXX_COMPILER="cl.exe"
       -D GPU_RUNTIME=CUDA
       2>&1 | Tee-Object -filepath cmake_log.txt
-    - |-
-      if (Select-String -Path cmake_log.txt -Pattern "could not find") {
-          throw "Some cmake libraries are missing"
-      }
-    - cmake --build "$CI_PROJECT_DIR/build"
-    - cd "$CI_PROJECT_DIR/build"
-    - ctest --output-on-failure --timeout 10
-    - cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
+    - !reference [.test:windows-cmake, script]
diff --git a/.gitlab/issue_templates/example.md b/.gitlab/issue_templates/example.md
index 2aea591ed..8d32b8fab 100644
--- a/.gitlab/issue_templates/example.md
+++ b/.gitlab/issue_templates/example.md
@@ -1,22 +1,12 @@
 # Example checklist
 
 - Elaboration
-	- [ ] Example concept is described and agreed on
+    - [ ] Example concept is described and agreed upon
 - Implementation
-	- [ ] Example is implemented
-	- CMake support is added
-		- [ ] Linux
-		- [ ] Windows
-	- [ ] GNU Make support is added (Linux)
-	- [ ] Visual Studio project is added (Windows)
-		- [ ] Project is added to the root solution
-	- [ ] Inline code documentation is added
-	- [ ] README is added according to template
-		- [ ] Related READMEs, ToC are updated
-	- [ ] Internal CI passes
+    - [ ] Example is implemented
 - Internal review
-	- [ ] Internal code review is done
+    - [ ] Internal code review is done
 - External review
-	- [ ] Upstreaming PR is opened, external code review is done
+    - [ ] Upstreaming PR is opened, external review is done
 - Done
-	- [ ] Example merged to upstream
+    - [ ] Example merged to upstream
diff --git a/.gitlab/merge_request_templates/example.md b/.gitlab/merge_request_templates/example.md
new file mode 100644
index 000000000..1221d3020
--- /dev/null
+++ b/.gitlab/merge_request_templates/example.md
@@ -0,0 +1,16 @@
+## Notes for the reviewer
+_The reviewer should acknowledge all these topics._
+<insert notes>
+
+## Checklist before merge
+- [ ] CMake support is added
+    - [ ] Dependencies are copied via `IMPORTED_RUNTIME_ARTIFACTS` if applicable
+- [ ] GNU Make support is added (Linux)
+- [ ] Visual Studio project is added for VS2017, 2019, 2022 (Windows) (use [the script](https://projects.streamhpc.com/departments/knowledge/employee-handbook/-/wikis/Projects/AMD/Libraries/examples/Adding-Visual-Studio-Projects-to-new-examples#scripts))
+    - [ ] DLL dependencies are copied via `<Content Include`
+    - [ ] Visual Studio project is added to `ROCm-Examples-VS*.sln` (ROCm)
+    - [ ] Visual Studio project is added to `ROCm-Examples-Portable-VS*.sln` (ROCm/CUDA) if applicable
+- [ ] Inline code documentation is added
+- [ ] README is added according to template
+    - [ ] Related READMEs, ToC are updated
+- [ ] The CI passes for Linux/ROCm, Linux/CUDA, Windows/ROCm, Windows/CUDA.
diff --git a/Applications/floyd_warshall/main.hip b/Applications/floyd_warshall/main.hip
index 34f938db9..1a23ed3e1 100644
--- a/Applications/floyd_warshall/main.hip
+++ b/Applications/floyd_warshall/main.hip
@@ -198,8 +198,8 @@ int main(int argc, char* argv[])
     // Allocate device memory
     unsigned int* d_adjacency_matrix;
     unsigned int* d_next_matrix;
-    HIP_CHECK(hipMalloc((void**)&d_adjacency_matrix, size_bytes));
-    HIP_CHECK(hipMalloc((void**)&d_next_matrix, size_bytes));
+    HIP_CHECK(hipMalloc(&d_adjacency_matrix, size_bytes));
+    HIP_CHECK(hipMalloc(&d_next_matrix, size_bytes));
 
     // Create events to measure the execution time of the kernels.
     hipEvent_t start, stop;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56b6a04f4..918e2de8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.21.3 FATAL_ERROR)
 project(ROCMm-SDK-Examples LANGUAGES CXX)
 enable_testing()
 
diff --git a/Common/example_utils.hpp b/Common/example_utils.hpp
index 173a2124b..e8dc51b70 100644
--- a/Common/example_utils.hpp
+++ b/Common/example_utils.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,18 @@
 #ifndef COMMON_EXAMPLE_UTILS_HPP
 #define COMMON_EXAMPLE_UTILS_HPP
 
+// Compiling HIP on Windows includes windows.h, and this triggers many silly warnings.
+#if defined(_WIN32) && defined(__NVCC__)
+    #pragma nv_diag_suppress 108 // signed bit field of length 1
+    #pragma nv_diag_suppress 174 // expression has no effect
+    #pragma nv_diag_suppress 1835 // attribute "dllimport" does not apply here
+#endif
+
+// rocPRIM adds a #warning about printf on NAVI.
+#ifdef __clang__
+    #pragma clang diagnostic ignored "-W#warnings"
+#endif
+
 #include <cassert>
 #include <chrono>
 #include <iostream>
diff --git a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
index d529c14f4..8a27fc502 100644
--- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
+++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
@@ -27,7 +27,7 @@ RUN export DEBIAN_FRONTEND=noninteractive; \
 # Install HIP using the installer script
 RUN export DEBIAN_FRONTEND=noninteractive; \
     wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
+    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.4/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
     && apt-get update -qq \
     && apt-get install -y hip-base hipify-clang \
     && apt-get download hip-runtime-nvidia hip-dev \
@@ -45,64 +45,71 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \
     && ldconfig
 
 # Install rocRAND
-RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.3.0.tar.gz \
-    && tar -xf ./rocm-5.3.0.tar.gz \
-    && rm ./rocm-5.3.0.tar.gz \
-    && cmake -S ./rocRAND-rocm-5.3.0 -B ./rocRAND-rocm-5.3.0/build \
+RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
+    && tar -xf ./rocm-5.4.0.tar.gz \
+    && rm ./rocm-5.4.0.tar.gz \
+    && cmake -S ./rocRAND-rocm-5.4.0 -B ./rocRAND-rocm-5.4.0/build \
         -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D BUILD_HIPRAND=OFF \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
-    && cmake --build ./rocRAND-rocm-5.3.0/build --target install \
-    && rm -rf ./rocRAND-rocm-5.3.0
+    && cmake --build ./rocRAND-rocm-5.4.0/build --target install \
+    && rm -rf ./rocRAND-rocm-5.4.0
 
 # Install hipCUB
-RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.3.0.tar.gz \
-    && tar -xf ./rocm-5.3.0.tar.gz \
-    && rm ./rocm-5.3.0.tar.gz \
-    && cmake -S ./hipCUB-rocm-5.3.0 -B ./hipCUB-rocm-5.3.0/build \
+RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.4.0.tar.gz \
+    && tar -xf ./rocm-5.4.0.tar.gz \
+    && rm ./rocm-5.4.0.tar.gz \
+    && cmake -S ./hipCUB-rocm-5.4.0 -B ./hipCUB-rocm-5.4.0/build \
         -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
-    && cmake --build ./hipCUB-rocm-5.3.0/build --target install \
-    && rm -rf ./hipCUB-rocm-5.3.0
+    && cmake --build ./hipCUB-rocm-5.4.0/build --target install \
+    && rm -rf ./hipCUB-rocm-5.4.0
 
 # Install hipBLAS
-RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.3.0.tar.gz \
-    && tar -xf ./rocm-5.3.0.tar.gz \
-    && rm ./rocm-5.3.0.tar.gz \
-    && cmake -S ./hipBLAS-rocm-5.3.0 -B ./hipBLAS-rocm-5.3.0/build \
+RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.4.0.tar.gz \
+    && tar -xf ./rocm-5.4.0.tar.gz \
+    && rm ./rocm-5.4.0.tar.gz \
+    && cmake -S ./hipBLAS-rocm-5.4.0 -B ./hipBLAS-rocm-5.4.0/build \
         -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
         -D USE_CUDA=ON \
-    && cmake --build ./hipBLAS-rocm-5.3.0/build --target install \
-    && rm -rf ./hipBLAS-rocm-5.3.0
+    && cmake --build ./hipBLAS-rocm-5.4.0/build --target install \
+    && rm -rf ./hipBLAS-rocm-5.4.0
 
 # Install hipSOLVER
-RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.3.0.tar.gz \
-    && tar -xf ./rocm-5.3.0.tar.gz \
-    && rm ./rocm-5.3.0.tar.gz \
-    && cmake -S ./hipSOLVER-rocm-5.3.0 -B ./hipSOLVER-rocm-5.3.0/build \
+RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.4.0.tar.gz \
+    && tar -xf ./rocm-5.4.0.tar.gz \
+    && rm ./rocm-5.4.0.tar.gz \
+    && cmake -S ./hipSOLVER-rocm-5.4.0 -B ./hipSOLVER-rocm-5.4.0/build \
         -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
         -D USE_CUDA=ON \
-    && cmake --build ./hipSOLVER-rocm-5.3.0/build --target install \
-    && rm -rf ./hipSOLVER-rocm-5.3.0
+    && cmake --build ./hipSOLVER-rocm-5.4.0/build --target install \
+    && rm -rf ./hipSOLVER-rocm-5.4.0
 
 # Install hipRAND
-RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.3.0.tar.gz \
-    && tar -xf ./rocm-5.3.0.tar.gz \
-    && rm ./rocm-5.3.0.tar.gz \
-    && cmake -S ./hipRAND-rocm-5.3.0 -B ./hipRAND-rocm-5.3.0/build \
+RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
+    && tar -xf ./rocm-5.4.0.tar.gz \
+    && rm ./rocm-5.4.0.tar.gz \
+    && cmake -S ./hipRAND-rocm-5.4.0 -B ./hipRAND-rocm-5.4.0/build \
         -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
-    && cmake --build ./hipRAND-rocm-5.3.0/build --target install \
-    && rm -rf ./hipRAND-rocm-5.3.0
+        -D BUILD_WITH_LIB=CUDA \
+    && cmake --build ./hipRAND-rocm-5.4.0/build --target install \
+    && rm -rf ./hipRAND-rocm-5.4.0
 
 # Use render group as an argument from user
 ARG GID=109
 
-# Add the render group and a user with sudo permissions for the container
-RUN groupadd --system --gid ${GID} render \
-    && useradd -Um -G sudo,video,render developer \
+# Add the render group, or change its GID if it already exists
+RUN if [ $(getent group render) ]; then \
+        groupmod --gid ${GID} render; \
+    else \
+        groupadd --system --gid ${GID} render; \
+    fi
+
+# Add a user with sudo permissions for the container
+RUN useradd -Um -G sudo,video,render developer \
     && echo developer ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/developer \
     && chmod 0440 /etc/sudoers.d/developer
 
diff --git a/HIP-Basic/cooperative_groups/CMakeLists.txt b/HIP-Basic/cooperative_groups/CMakeLists.txt
index e6dac1e51..fd7bc4d62 100644
--- a/HIP-Basic/cooperative_groups/CMakeLists.txt
+++ b/HIP-Basic/cooperative_groups/CMakeLists.txt
@@ -54,6 +54,11 @@ add_test(${example_name} ${example_name})
 set(include_dirs "../../Common")
 if(GPU_RUNTIME STREQUAL "CUDA")
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+else()
+    # Add NDEBUG for HIP version >= 5.5 and < 6.0 due to a known bug in the cooperative groups header
+    if( ${hip-lang_VERSION} VERSION_GREATER_EQUAL 5.5 AND ${hip-lang_VERSION} VERSION_LESS 6 )
+        add_compile_definitions(NDEBUG)
+    endif()
 endif()
 
 target_include_directories(${example_name} PRIVATE ${include_dirs})
diff --git a/HIP-Basic/device_globals/CMakeLists.txt b/HIP-Basic/device_globals/CMakeLists.txt
index eceb522de..c4031a9e5 100644
--- a/HIP-Basic/device_globals/CMakeLists.txt
+++ b/HIP-Basic/device_globals/CMakeLists.txt
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-set(example_name device_globals)
+set(example_name hip_device_globals)
 
 cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
 project(${example_name} LANGUAGES CXX)
diff --git a/HIP-Basic/device_query/main.cpp b/HIP-Basic/device_query/main.cpp
index 0e5f449ea..1c9247886 100644
--- a/HIP-Basic/device_query/main.cpp
+++ b/HIP-Basic/device_query/main.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,13 +20,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iomanip>
 #include <iostream>
 
 #include <hip/hip_runtime.h>
 
-#include "example_utils.hpp"
-
 namespace
 {
 /// Number of characters in the first column.
diff --git a/HIP-Basic/occupancy/main.hip b/HIP-Basic/occupancy/main.hip
index 528c7ee64..eb6afefa4 100644
--- a/HIP-Basic/occupancy/main.hip
+++ b/HIP-Basic/occupancy/main.hip
@@ -165,8 +165,8 @@ int main()
     // Initialize the input data
     for(int i = 0; i < size; i++)
     {
-        h_A[i] = (float)i;
-        h_B[i] = (float)i;
+        h_A[i] = static_cast<float>(i);
+        h_B[i] = static_cast<float>(i);
     }
 
     float* d_A = nullptr;
diff --git a/HIP-Basic/texture_management/main.hip b/HIP-Basic/texture_management/main.hip
index e82b980c7..a0dc53181 100644
--- a/HIP-Basic/texture_management/main.hip
+++ b/HIP-Basic/texture_management/main.hip
@@ -24,6 +24,7 @@
 
 #include <hip/hip_runtime.h>
 
+#include <array>
 #include <iostream>
 #include <vector>
 
@@ -147,8 +148,8 @@ int main()
     HIP_CHECK(hipGetLastError());
 
     // Copy data from device back to host.
-    unsigned int h_histogram[hist_bin_count];
-    HIP_CHECK(hipMemcpy(h_histogram, d_histogram, hist_bytes, hipMemcpyDeviceToHost));
+    std::array<unsigned int, hist_bin_count> h_histogram;
+    HIP_CHECK(hipMemcpy(h_histogram.data(), d_histogram, hist_bytes, hipMemcpyDeviceToHost));
 
     // Print out results.
     std::cout << "Equal-width histogram with " << hist_bin_count << " bins of values [0, " << size
diff --git a/Libraries/hipBLAS/gemm_strided_batched/README.md b/Libraries/hipBLAS/gemm_strided_batched/README.md
index 33b0ba81c..b7bdc8eef 100644
--- a/Libraries/hipBLAS/gemm_strided_batched/README.md
+++ b/Libraries/hipBLAS/gemm_strided_batched/README.md
@@ -3,20 +3,21 @@
 ## Description
 This example illustrates the use of the hipBLAS Level 3 Strided Batched General Matrix Multiplication. The hipBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as:
 
-$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$
+$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$
 
-for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following:
-- $f(X) = X$ or
-- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
-- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
+for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $X'$ is one of the following:
+- $X' = X$ or
+- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
+- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
+In this example the identity is used.
 
 $\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that
-$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
+$A[i]'$ is an $m \times k$ matrix, $B[i]'$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
 
 
 ### Application flow
 1. Read in command-line parameters.
-2. Set $f$ operation, set sizes of matrices and get batch count.
+2. Set dimension variables of the matrices and get the batch count.
 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
 4. Initialize gold standard matrix.
 5. Compute CPU reference result with strided batched subvectors.
@@ -33,19 +34,19 @@ The application provides the following optional command line arguments:
 - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
 - `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
 - `-c` or `--count`. Batch count. Its default value is 3.
-- `-m` or `--m`. The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5.
-- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5.
-- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5.
+- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5.
+- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5.
+- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5.
 
 ## Key APIs and Concepts
 - The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], thereby most of the hipBLAS functions have a`_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDP Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extensions.<br/>
 We can apply the same multiplication operator for several matrices if we combine them into batched matrices. Batched matrix multiplication has a performance improvement for a large number of small matrices. For a constant stride between matrices, further acceleration is available by strided batched GEMM.
 - hipBLAS is initialized by calling `hipblasCreate(hipblasHandle*)` and it is terminated by calling `hipblasDestroy(hipblasHandle)`.
 - The _pointer mode_ controls whether scalar parameters must be allocated on the host (`HIPBLAS_POINTER_MODE_HOST`) or on the device (`HIPBLAS_POINTER_MODE_DEVICE`). It is controlled by `hipblasSetPointerMode`.
-- The $f$ operator -- defined in Description section -- can be
-    - `HIPBLAS_OP_N`: identity operator ($f(X) = X$),
-    - `HIPBLAS_OP_T`: transpose operator ($f(X) = X^T$) or
-    - `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($f(X) = X^H$).
+- The symbol $X'$ denotes the following operations, as defined in the Description section:
+    - `HIPBLAS_OP_N`: identity operator ($X' = X$),
+    - `HIPBLAS_OP_T`: transpose operator ($X' = X^T$) or
+    - `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($X' = X^H$).
 - `hipblasStride` strides between matrices or vectors in strided_batched functions.
 - `hipblas[HSDCZ]gemmStridedBatched`
 
@@ -60,9 +61,9 @@ We can apply the same multiplication operator for several matrices if we combine
     - `hipblasHandle_t handle`
     - `hipblasOperation_t trans_a`: transformation operator on each $A_i$ matrix
     - `hipblasOperation_t trans_b`: transformation operator on each $B_i$ matrix
-    - `int m`: number of rows in each $f(A_i)$ and $C$ matrices
-    - `int n`: number of columns in each $f(B_i)$ and $C$ matrices
-    - `int k`: number of columns in each $f(A_i)$ matrix and number of rows in each $f(B_i)$ matrix
+    - `int m`: number of rows in each $A_i'$ and $C$ matrices
+    - `int n`: number of columns in each $B_i'$ and $C$ matrices
+    - `int k`: number of columns in each $A_i'$ matrix and number of rows in each $B_i'$ matrix
     - `const float *alpha`: scalar multiplier of each $C_i$ matrix addition
     - `const float  *A`: pointer to the each $A_i$ matrix
     - `int lda`: leading dimension of each $A_i$ matrix
diff --git a/Libraries/hipBLAS/gemm_strided_batched/main.hip b/Libraries/hipBLAS/gemm_strided_batched/main.hip
index 94d0a718b..95b2d0d53 100644
--- a/Libraries/hipBLAS/gemm_strided_batched/main.hip
+++ b/Libraries/hipBLAS/gemm_strided_batched/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -42,9 +42,9 @@ int main(const int argc, const char** argv)
     parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
     parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
     parser.set_optional<int>("c", "count", 3, "Batch count");
-    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A_i) and C_i");
-    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B_i) and C_i");
-    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)");
+    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A_i and C_i");
+    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B_i and C_i");
+    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A_i and rows of B_i");
     parser.run_and_exit_if_error();
 
     // Set sizes of matrices.
@@ -84,7 +84,7 @@ int main(const int argc, const char** argv)
     const float h_alpha = parser.get<float>("a");
     const float h_beta  = parser.get<float>("b");
 
-    // Set GEMM operation as identity operation: $f(X) = X$
+    // Set GEMM operation as identity operation: $X' = X$
     const hipblasOperation_t trans_a = HIPBLAS_OP_N;
     const hipblasOperation_t trans_b = HIPBLAS_OP_N;
 
diff --git a/Libraries/hipCUB/device_radix_sort/main.hip b/Libraries/hipCUB/device_radix_sort/main.hip
index a67609041..b11867d46 100644
--- a/Libraries/hipCUB/device_radix_sort/main.hip
+++ b/Libraries/hipCUB/device_radix_sort/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <cassert>
 #include <iostream>
 #include <vector>
@@ -27,12 +29,10 @@
 #include <hip/hip_runtime.h>
 #include <hipcub/device/device_radix_sort.hpp>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // Allocate and initialize data on the host
-    const std::vector<float> h_keys{9.3, 2.1, 7.3, 4, 2.2, 5, 3.6, 2.7, 1.1, 0};
+    const std::vector<float> h_keys{9.3f, 2.1f, 7.3f, 4.0f, 2.2f, 5.0f, 3.6f, 2.7f, 1.1f, 0.0f};
     const std::vector<int>   h_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
     assert(h_keys.size() == h_values.size());
     const int num_elements = h_keys.size();
diff --git a/Libraries/hipCUB/device_sum/main.hip b/Libraries/hipCUB/device_sum/main.hip
index d400c55b1..b1450ad0e 100644
--- a/Libraries/hipCUB/device_sum/main.hip
+++ b/Libraries/hipCUB/device_sum/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,14 +20,14 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iostream>
 #include <vector>
 
 #include <hip/hip_runtime.h>
 #include <hipcub/device/device_reduce.hpp>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // Allocate and initialize data on the host
diff --git a/Libraries/hipSOLVER/syevj/main.cpp b/Libraries/hipSOLVER/syevj/main.cpp
index ccf03accb..e1afd94aa 100644
--- a/Libraries/hipSOLVER/syevj/main.cpp
+++ b/Libraries/hipSOLVER/syevj/main.cpp
@@ -55,7 +55,7 @@ int main(const int argc, char* argv[])
     // 3. Generate a random symmetric matrix
     std::default_random_engine             generator;
     std::uniform_real_distribution<double> distribution(0., 2.);
-    auto                                   random_number = std::bind(distribution, generator);
+    auto random_number = [&]() { return distribution(generator); };
 
     for(int i = 0; i < n; i++)
     {
diff --git a/Libraries/rocBLAS/level_3/gemm/README.md b/Libraries/rocBLAS/level_3/gemm/README.md
index eeb5c2741..99e2bce0e 100644
--- a/Libraries/rocBLAS/level_3/gemm/README.md
+++ b/Libraries/rocBLAS/level_3/gemm/README.md
@@ -2,17 +2,19 @@
 
 ## Description
 This example illustrates the use of the rocBLAS Level 3 General Matrix Multiplication. The rocBLAS GEMM performs a matrix--matrix operation as:
-$C = \alpha \cdot f(A) \cdot f(B) + \beta \cdot C$,
-where $f(X)$ is one of the following:
-- $f(X) = X$ or
-- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
-- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $),
+$C = \alpha \cdot A' \cdot B' + \beta \cdot C$,
+where $X'$ is one of the following:
+- $X' = X$ or
+- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
+- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $).
+In this example the identity is used.
+
 $\alpha and $\beta$ are scalars, and $A$, $B$ and $C$ are matrices, with
-$f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times n$ matrix.
+$A'$ an $m \times k$ matrix, $B'$ a $k \times n$ matrix and $C$ an $m \times n$ matrix.
 
 ### Application flow
 1. Read in command-line parameters.
-2. Set $f$ operation and set sizes of matrices.
+2. Set dimension variables of the matrices.
 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
 4. Initialize gold standard matrix.
 5. Compute CPU reference result.
@@ -28,9 +30,9 @@ $f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times
 The application provides the following optional command line arguments:
 - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
 - `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
-- `-m` or `--m`. The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5.
-- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5.
-- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5.
+- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5.
+- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5.
+- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5.
 
 ## Key APIs and Concepts
 - rocBLAS is initialized by calling `rocblas_create_handle(rocblas_handle*)` and it is terminated by calling `rocblas_destroy_handle(rocblas_handle)`.
@@ -47,9 +49,9 @@ The application provides the following optional command line arguments:
     - `rocblas_handle handle`
     - `rocblas_operation transA`: transformation operator on $A$ matrix
     - `rocblas_operation transB`: transformation operator on $B$ matrix
-    - `rocblas_int m`: number of rows in $f(A)$ and $C$ matrices
-    - `rocblas_int n`: number of columns in $f(B)$ and $C$ matrices
-    - `rocblas_int k`: number of columns in $f(A)$ matrix and number of rows in $f(B)$ matrix
+    - `rocblas_int m`: number of rows in $A'$ and $C$ matrices
+    - `rocblas_int n`: number of columns in $B'$ and $C$ matrices
+    - `rocblas_int k`: number of columns in $A'$ matrix and number of rows in $B'$ matrix
     - `const float *alpha`: scalar multiplier of $C$ matrix addition
     - `const float *A`: pointer to the $A$ matrix
     - `rocblas_int lda`: leading dimension of $A$ matrix
diff --git a/Libraries/rocBLAS/level_3/gemm/main.cpp b/Libraries/rocBLAS/level_3/gemm/main.cpp
index 650b07573..8190a7b87 100644
--- a/Libraries/rocBLAS/level_3/gemm/main.cpp
+++ b/Libraries/rocBLAS/level_3/gemm/main.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -40,9 +40,9 @@ int main(const int argc, const char** argv)
     cli::Parser parser(argc, argv);
     parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
     parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
-    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A) and C");
-    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B) and C");
-    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A) and rows of f(B)");
+    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A and C");
+    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B and C");
+    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A and rows of B");
     parser.run_and_exit_if_error();
 
     // Set sizes of matrices.
@@ -73,7 +73,7 @@ int main(const int argc, const char** argv)
     const rocblas_float h_alpha = parser.get<float>("a");
     const rocblas_float h_beta  = parser.get<float>("b");
 
-    // Set GEMM operation as identity operation: $f(X) = X$
+    // Set GEMM operation as identity operation: $X' = X$
     const rocblas_operation trans_a = rocblas_operation_none;
     const rocblas_operation trans_b = rocblas_operation_none;
 
diff --git a/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md b/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md
index 057e0e599..f025c7490 100644
--- a/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md
+++ b/Libraries/rocBLAS/level_3/gemm_strided_batched/README.md
@@ -3,20 +3,21 @@
 ## Description
 This example illustrates the use of the rocBLAS Level 3 Strided Batched General Matrix Multiplication. The rocBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as:
 
-$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$
+$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$
 
-for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following:
-- $f(X) = X$ or
-- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
-- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
+for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $X'$ is one of the following:
+- $X' = X$ or
+- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
+- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
+In this example the identity is used.
 
 $\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that
-$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
+$A[i]'$ is an $m \times k$ matrix, $B[i]'$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
 
 
 ### Application flow
 1. Read in command-line parameters.
-2. Set $f$ operation, set sizes of matrices and get batch count.
+2. Set dimension variables of the matrices and get batch count and stride.
 3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
 4. Initialize gold standard matrix.
 5. Compute CPU reference result with strided batched subvectors.
@@ -33,9 +34,9 @@ The application provides the following optional command line arguments:
 - `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
 - `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
 - `-c` or `--count`. Batch count. Its default value is 3.
-- `-m` or `--m`. The number of rows of matrices $f(A_i)$ and $C_i$, which must be greater than 0. Its default value is 5.
-- `-n` or `--n`. The number of columns of matrices $f(B_i)$ and $C_i$, which must be greater than 0. Its default value is 5.
-- `-k` or `--k`. The number of columns of columns of matrix f(A_i) and rows of f(B_i)
+- `-m` or `--m`. The number of rows of matrices $A_i$ and $C_i$, which must be greater than 0. Its default value is 5.
+- `-n` or `--n`. The number of columns of matrices $B_i$ and $C_i$, which must be greater than 0. Its default value is 5.
+- `-k` or `--k`. The number of columns of matrix $A_i$ and rows of matrix $B_i$. Its default value is 5.
 
 ## Key APIs and Concepts
 - The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], thereby most of the rocBLAS functions have a`_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDP Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extensions.<br/>
@@ -57,9 +58,9 @@ We can apply the same multiplication operator for several matrices if we combine
     - `rocblas_handle handle`
     - `rocblas_operation transA`: transformation operator on $A_i$ matrix
     - `rocblas_operation transB`: transformation operator on $B_i$ matrix
-    - `rocblas_int m`: number of rows in $f(A_i)$ and $C_i$ matrices
-    - `rocblas_int n`: number of columns in $f(B_i)$ and $C_i$ matrices
-    - `rocblas_int k`: number of columns in $f(A_i)$ matrix and number of rows in $f(B_i)$ matrix
+    - `rocblas_int m`: number of rows in $A_i'$ and $C_i$ matrices
+    - `rocblas_int n`: number of columns in $B_i'$ and $C_i$ matrices
+    - `rocblas_int k`: number of columns in $A_i'$ matrix and number of rows in $B_i'$ matrix
     - `const float *alpha`: scalar multiplier of $C_i$ matrix addition
     - `const float *A`: pointer to each $A_i$ matrix
     - `rocblas_int lda`: leading dimension of each $A_i$ matrix
diff --git a/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp b/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp
index 32750eac2..471df167b 100644
--- a/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp
+++ b/Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -42,9 +42,9 @@ int main(const int argc, const char** argv)
     parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
     parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
     parser.set_optional<int>("c", "count", 3, "Batch count");
-    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A_i) and C_i");
-    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B_i) and C_i");
-    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)");
+    parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A_i and C_i");
+    parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B_i and C_i");
+    parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A_i and rows of B_i");
     parser.run_and_exit_if_error();
 
     // Set sizes of matrices.
@@ -84,7 +84,7 @@ int main(const int argc, const char** argv)
     const rocblas_float h_alpha = parser.get<float>("a");
     const rocblas_float h_beta  = parser.get<float>("b");
 
-    // Set GEMM operation as identity operation: $f(X) = X$.
+    // Set GEMM operation as identity operation: $X' = X$.
     const rocblas_operation trans_a = rocblas_operation_none;
     const rocblas_operation trans_b = rocblas_operation_none;
 
diff --git a/Libraries/rocPRIM/block_sum/main.hip b/Libraries/rocPRIM/block_sum/main.hip
index deb70fd81..01e5d6d43 100644
--- a/Libraries/rocPRIM/block_sum/main.hip
+++ b/Libraries/rocPRIM/block_sum/main.hip
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iostream>
 #include <numeric>
 
@@ -28,8 +30,6 @@
 #include <rocprim/block/block_load.hpp>
 #include <rocprim/block/block_reduce.hpp>
 
-#include "example_utils.hpp"
-
 /// \brief Compute the sum of an array on the host CPU
 std::vector<int> reduce_sum_host(const std::vector<int>& data,
                                  const unsigned int      run_size,
diff --git a/Libraries/rocPRIM/device_sum/main.hip b/Libraries/rocPRIM/device_sum/main.hip
index 0d5851766..e67824a05 100644
--- a/Libraries/rocPRIM/device_sum/main.hip
+++ b/Libraries/rocPRIM/device_sum/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,14 +20,14 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iostream>
 #include <vector>
 
 #include <hip/hip_runtime.h>
 #include <rocprim/device/device_reduce.hpp>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // Allocate and initialize data on the host
diff --git a/Libraries/rocThrust/device_ptr/main.hip b/Libraries/rocThrust/device_ptr/main.hip
index f799098ec..9cb90a314 100644
--- a/Libraries/rocThrust/device_ptr/main.hip
+++ b/Libraries/rocThrust/device_ptr/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <cassert>
 #include <iostream>
 #include <sstream>
@@ -33,8 +35,6 @@
 #include <thrust/reduce.h>
 #include <thrust/sequence.h>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // Allocate memory buffer to store 10 integers on the device
diff --git a/Libraries/rocThrust/norm/main.hip b/Libraries/rocThrust/norm/main.hip
index 2b2862740..9566618fa 100644
--- a/Libraries/rocThrust/norm/main.hip
+++ b/Libraries/rocThrust/norm/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <cmath>
 #include <cstddef>
 #include <iostream>
@@ -29,8 +31,6 @@
 #include <thrust/reduce.h>
 #include <thrust/transform_reduce.h>
 
-#include "example_utils.hpp"
-
 // An anonymous namespace sets static linkage to its contents.
 // This means that the contained function definitions will only be visible
 // in the current compilation unit (i.e. cpp source file).
diff --git a/Libraries/rocThrust/reduce_sum/main.hip b/Libraries/rocThrust/reduce_sum/main.hip
index 984ff8f0b..38ec13353 100644
--- a/Libraries/rocThrust/reduce_sum/main.hip
+++ b/Libraries/rocThrust/reduce_sum/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <cstddef>
 #include <iostream>
 
@@ -27,8 +29,6 @@
 #include <thrust/host_vector.h>
 #include <thrust/reduce.h>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // create a host vector with 4 elements
diff --git a/Libraries/rocThrust/remove_points/main.hip b/Libraries/rocThrust/remove_points/main.hip
index 24da8363c..70812169b 100644
--- a/Libraries/rocThrust/remove_points/main.hip
+++ b/Libraries/rocThrust/remove_points/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,13 +20,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <thrust/generate.h>
 #include <thrust/host_vector.h>
 #include <thrust/random.h>
 #include <thrust/remove.h>
 
-#include "example_utils.hpp"
-
 // An anonymous namespace sets static linkage to its contents.
 // This means that the contained function definitions will only be visible
 // in the current compilation unit (i.e. cpp source file).
diff --git a/Libraries/rocThrust/saxpy/main.hip b/Libraries/rocThrust/saxpy/main.hip
index 77478dda3..a82afd161 100644
--- a/Libraries/rocThrust/saxpy/main.hip
+++ b/Libraries/rocThrust/saxpy/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,14 +20,14 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iostream>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 #include <thrust/host_vector.h>
 #include <thrust/transform.h>
 
-#include "example_utils.hpp"
-
 // This example illustrates how to implement the SAXPY operation
 // (Y[i] = a * X[i] + Y[i]) using rocThrust.
 
diff --git a/Libraries/rocThrust/vectors/main.hip b/Libraries/rocThrust/vectors/main.hip
index 74a46cc54..42c3c936c 100644
--- a/Libraries/rocThrust/vectors/main.hip
+++ b/Libraries/rocThrust/vectors/main.hip
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,13 +20,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#include "example_utils.hpp"
+
 #include <iostream>
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 
-#include "example_utils.hpp"
-
 int main()
 {
     // Allocate a resizable vector in host memory.
diff --git a/Scripts/WindowsRunner.ps1 b/Scripts/WindowsRunner.ps1
new file mode 100644
index 000000000..ed96efa4c
--- /dev/null
+++ b/Scripts/WindowsRunner.ps1
@@ -0,0 +1,108 @@
+param(
+    [Parameter(Mandatory)]  # NOTE(review): Mandatory appears to make the "Debug" default below unreachable - confirm which is intended.
+    [string]$Path = "Debug",  # Directory that is searched recursively for examples.
+    [string]$Filter = "*.exe",  # File-name filter handed to Get-ChildItem.
+    [int]$Timeout = 10,  # Per-example limit passed to Wait-Job -Timeout (seconds).
+    [string[]]$Skip = @()  # Wildcard patterns; examples whose file name matches one are not run.
+)
+$Skip = $Skip | ForEach-Object { $_.Trim() }  # Strip surrounding whitespace from every pattern.
+
+Write-Host "Testing all '$Filter' in '$Path' with a timeout of $Timeout"
+Write-Host "Skipping examples that match any of:"
+foreach($item in $Skip) {
+    Write-Host "- $item"
+}
+
+$FailureCount = 0  # Failure tally; checked once at the very end of the script.
+$Results = @()  # One row per example, printed with Format-Table at the end.
+
+function Run-Example {
+    # Runs one example in a background job, waits at most the script-level $Timeout, echoes its output and returns a summary row (Name, State, ExitStatus, Time).
+    param(
+        [System.IO.FileInfo]$FileInfo  # The executable to launch.
+    )
+    $Job = Start-Job -ScriptBlock {
+        param([string]$FullName)
+        $Time = Measure-Command {
+            try {
+                # Capture the example's output and its exit code.
+                $Log = & $FullName
+                $JobExitStatus = $LASTEXITCODE
+            } catch {
+                # The invocation itself threw.
+                $JobExitStatus = "CRASH!"
+            }
+        }
+        return [PSCustomObject]@{
+            ExitStatus = $JobExitStatus
+            Log        = $Log
+            Time       = $Time
+        }
+    } -ArgumentList $FileInfo.FullName
+
+    # Execute the job with a timeout
+    $Job | Wait-Job -TimeOut $Timeout | Out-Null
+
+    # Get the results from the job!
+    $Result = Receive-Job $Job
+    Write-Host $Result.Log
+
+    # A $null ExitStatus (no result object was received) is reported as a timeout.
+    if ($null -ne $Result.ExitStatus) {
+        $TimeSpan   = $Result.Time.toString("mm\:ss\.fff")
+        $ExitStatus = $Result.ExitStatus
+    } else {
+        $ExitStatus = "Timeout!"
+        $TimeSpan   = $null
+    }
+    if ($Result.ExitStatus -eq 0) {
+        # Exited gracefully!
+        $Status = "`e[32mPass`e[0m"
+        $ExitDisplay = "`e[32m$ExitStatus`e[0m"
+    } else {
+        # Otherwise, fail!
+        $ExitDisplay = "`e[31m$ExitStatus`e[0m"
+        $Status = "`e[31m`e[1mFail`e[0m"
+        # A bare '+=' here would only change a function-local copy and the final check would never see a failure.
+        $script:FailureCount += 1
+    }
+
+    # Clean up!
+    Remove-Job -force $Job
+
+    [PSCustomObject]@{
+        Name       = $FileInfo.Name
+        State      = $Status
+        ExitStatus = $ExitDisplay
+        Time       = $TimeSpan
+    }
+}
+
+Get-ChildItem -Recurse -File -Path $Path -Filter $Filter | ForEach-Object {
+    $Example = $_
+    Write-Host ("`e[36m-- {0}`e[0m" -f $Example.Name)
+    # Look for the first skip wildcard that matches this file name.
+    $SkippedBy = $null
+    foreach ($Pattern in $Skip) {
+        if ($Example.Name -like $Pattern) {
+            $SkippedBy = $Pattern
+            break
+        }
+    }
+    # Skipped examples still get a row so that they show up in the final table.
+    if ($null -ne $SkippedBy) {
+        Write-Host "`e[33m`e[1mSkipped by wildcard:`e[0m $SkippedBy"
+        $Results += [PSCustomObject]@{
+            Name       = $Example.Name
+            State      = "`e[33m`e[1mSkip`e[0m"
+            ExitStatus = $null
+            Time       = $null
+        }
+    } else {
+        $Results += Run-Example $Example
+    }
+}
+
+$Results | Format-Table  # Summary of every example that was visited.
+
+if ($FailureCount -gt 0) {  # A non-zero tally ends the script with a terminating error.
+    throw "$FailureCount failed jobs!"
+}