Quad precision (double-double) support in VkFFT

-Added double-double support in VkFFT. Requires CPU initialization in full quad precision, so only supports gcc for now. Potentially possible to add full FP128 support or some other FP128 library (like mpir) in the future. -Data has to be stored in double-double before VkFFT kernel calls (no fp128<->double-double conversion on the GPU yet). -Full 1e-32 precision, but same range as FP64. See Library for Double-Double and Quad-Double Arithmetic by Y Hida for more information on double-double. -Requires FMA contraction to be disabled (due to ab-cd contraction rounding mismatch). Doesn't work on Vulkan as I haven't found how to do that yet. -Fixed warnings (#138) -Added proper check for app to be zero before initializeVkFFT call and zeroing on deletion (#134) -Added an option to provide staging buffer in application and VkGPU handle (#129) -Added guards for build type (#128) -Fixed missing deallocation calls for the inverse Bluestein axes. Fixed the buffer layout size in Vulkan in some cases. -Refactored the code generator and container struct layout for better handling of complex numbers (-5k loc). -Added more precision tests and benchmarks. -Will be merged in the main branch after more testing and update to the documentation.
DTolm · Sep 25, 2023 · ae5a334 · ae5a334
1 parent b4ae141
commit ae5a334
Show file tree

Hide file tree

Showing 57 changed files with 7,602 additions and 9,725 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,10 +1,15 @@
 cmake_minimum_required(VERSION 3.11)
 project(VkFFT_TestSuite)
+
+if(NOT CMAKE_BUILD_TYPE)
 set(CMAKE_CONFIGURATION_TYPES "Release" CACHE STRING "" FORCE)
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
+endif()
+
 if (NOT DEFINED GLSLANG_GIT_TAG)
-    set(GLSLANG_GIT_TAG "origin/main")
+	set(GLSLANG_GIT_TAG "12.3.1")
 endif()
+
 include(FetchContent)
 set(VKFFT_BACKEND 0 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 - Metal")
 
@@ -22,6 +27,7 @@ endif()
 
 option(build_VkFFT_FFTW_precision "Build VkFFT FFTW precision comparison" OFF)
 option(VkFFT_use_FP128_Bluestein_RaderFFT "Use FP128 for Bluestein and Rader FFT kernel calculations. Currently requires FP128 FFT library, like FFTWl" OFF)
+option(VkFFT_use_FP128_double_double "Build VkFFT quad double-double" OFF)
 
 if (MSVC)
 	set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME})
@@ -40,6 +46,7 @@ if(build_VkFFT_FFTW_precision)
 		benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
@@ -49,14 +56,17 @@ if(build_VkFFT_FFTW_precision)
 		benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
-		benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
-		benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp)
+		benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp)
 else()		
 	add_executable(${PROJECT_NAME} VkFFT_TestSuite.cpp
 		benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
@@ -70,15 +80,18 @@ else()
 		benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
-		benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
 		benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
-		benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp)
+		benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
+		benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp)
 endif()
 target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)  
 add_definitions(-DVKFFT_BACKEND=${VKFFT_BACKEND})
@@ -138,7 +151,12 @@ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulk
 if(VkFFT_use_FP128_Bluestein_RaderFFT)
         target_compile_definitions(${PROJECT_NAME} PUBLIC -DVkFFT_use_FP128_Bluestein_RaderFFT)
 endif()  
+if(VkFFT_use_FP128_double_double)
+	target_compile_definitions(${PROJECT_NAME} PUBLIC -DVKFFT_USE_DOUBLEDOUBLE_FP128)
+	target_link_libraries(${PROJECT_NAME} PUBLIC quadmath)
+endif()
 if(${VKFFT_BACKEND} EQUAL 0)
+	set(ENABLE_OPT 0)
 	FetchContent_Declare(
 		glslang-main
 		GIT_TAG ${GLSLANG_GIT_TAG}
@@ -153,12 +171,12 @@ if(${VKFFT_BACKEND} EQUAL 0)
 	add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)
 endif()
 
-target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
 add_library(VkFFT INTERFACE)
+target_include_directories(VkFFT INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
 target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=${VKFFT_BACKEND})
 
-target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/)
 add_library(half INTERFACE)
+target_include_directories(half INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/)
 
 target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/vkFFT_scripts/include/)
 
@@ -168,6 +186,7 @@ elseif(${VKFFT_BACKEND} EQUAL 1)
 	find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64" "${LIBNVRTC_LIBRARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64 /usr/local/cuda/lib64)
 	add_definitions(-DCUDA_TOOLKIT_ROOT_DIR="${CUDA_TOOLKIT_ROOT_DIR}")
 	target_link_libraries(${PROJECT_NAME} PUBLIC ${CUDA_LIBRARIES} cuda ${CUDA_NVRTC_LIB} VkFFT half)
+	target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS})
 elseif(${VKFFT_BACKEND} EQUAL 2)
 	target_link_libraries(${PROJECT_NAME} PUBLIC hip::host VkFFT half)
 elseif(${VKFFT_BACKEND} EQUAL 3)
@@ -197,6 +216,16 @@ if(build_VkFFT_FFTW_precision OR VkFFT_use_FP128_Bluestein_RaderFFT)
 		NO_DEFAULT_PATH
 	  )
         target_include_directories(${PROJECT_NAME} PUBLIC ${FFTW_INCLUDES})
+if(VkFFT_use_FP128_double_double)
+	find_library(
+		FFTWQ_LIB
+		NAMES "libfftw3q" "fftw3q"
+		PATHS ${FFTW3_LIB_DIR}
+		PATH_SUFFIXES "lib" "lib64"
+		NO_DEFAULT_PATH
+	  )
+	  target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWQ_LIB})
+endif()
 if(VkFFT_use_FP128_Bluestein_RaderFFT)        
 	find_library(
 		FFTWL_LIB
@@ -205,10 +234,9 @@ if(VkFFT_use_FP128_Bluestein_RaderFFT)
 		PATH_SUFFIXES "lib" "lib64"
 		NO_DEFAULT_PATH
 	  )
-	target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB} ${FFTWL_LIB})
-else()
-        target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
+	target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWL_LIB})
 endif()
+	target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
 endif()
 
 if(build_VkFFT_cuFFT_benchmark)
@@ -256,6 +284,7 @@ if(build_VkFFT_cuFFT_benchmark)
 	-gencode arch=compute_80,code=compute_80 
 	-gencode arch=compute_86,code=compute_86>")
 	target_include_directories(cuFFT_scripts PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/cuFFT_scripts/include)
+	target_include_directories(cuFFT_scripts PUBLIC ${CUDA_INCLUDE_DIRS})
 	set_target_properties(cuFFT_scripts PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 	set_target_properties(cuFFT_scripts PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 	target_link_libraries(${PROJECT_NAME} PUBLIC cuFFT_scripts)

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
   - Radix-2/3/4/5/7/8/11/13 FFT. Sequences using radix 3, 5, 7, 11 and 13 have comparable performance to that of powers of 2.
   - Rader's FFT algorithm for primes from 17 up to max shared memory length (~10000). Inlined and done without additional memory transfers.
   - Bluestein's FFT algorithm for all other sequences. Full coverage of C2C range, single upload (2^12, 2^12, 2^12) for R2C/C2R/R2R. Optimized to have as few memory transfers as possible by using zero padding and merged convolution support of VkFFT.
-  - Single, double and half precision support. Double precision uses CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data.
+  - Single, double, half and quad (double-double) precision support. Double and quad precision use CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data.
   - All transformations are performed in-place with no performance loss. Out-of-place transforms are supported by selecting different input/output buffers.
   - No additional transposition uploads. Note: Data can be reshuffled after the Four Step FFT algorithm with an additional buffer (for big sequences). Doesn't matter for convolutions - they return to the input ordering (saves memory).
   - Complex to complex (C2C), real to complex (R2C), complex to real (C2R) transformations and real to real (R2R) Discrete Cosine Transformations of types I, II, III and IV. R2R, R2C and C2R are optimized to run up to 2x times faster than C2C and take 2x less memory.
@@ -33,19 +33,19 @@ Include the vkFFT.h file and glslang compiler. Provide the library with correctl
 For single and double precision, Vulkan 1.0 is required. For half precision, Vulkan 1.1 is required.
 
 CUDA/HIP:
-Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\
+Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Provide the library with correctly chosen VKFFT_BACKEND definition.\
 To build CUDA/HIP version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the correct one and optionally enable FFTW. VKFFT_BACKEND=1 for CUDA, VKFFT_BACKEND=2 for HIP.
 
 OpenCL:
-Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\
+Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition.\
 To build OpenCL version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 3 and optionally enable FFTW.
 
 Level Zero:
-Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls. Only single/double precision for now.\
+Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls.\
 To build Level Zero version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 4 and optionally enable FFTW.
 
 Metal:
-Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as a C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp. Only single precision.\
+Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp.\
 To build Metal version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 5 and optionally enable FFTW.
 
 ## Command-line interface

diff --git a/VkFFT_TestSuite.cpp b/VkFFT_TestSuite.cpp
@@ -1,4 +1,4 @@
-#include <vector>
+#include <vector>
 #include <memory>
 #include <string.h>
 #include <chrono>
@@ -62,6 +62,9 @@
 #include "sample_6_benchmark_VkFFT_single_r2c.h"
 #include "sample_7_benchmark_VkFFT_single_Bluestein.h"
 #include "sample_8_benchmark_VkFFT_double_Bluestein.h"
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+#include "sample_9_benchmark_VkFFT_quadDoubleDouble.h"
+#endif
 #include "sample_10_benchmark_VkFFT_single_multipleBuffers.h"
 #ifdef USE_FFTW
 #include "sample_11_precision_VkFFT_single.h"
@@ -72,17 +75,23 @@
 #include "sample_16_precision_VkFFT_single_dct.h"
 #include "sample_17_precision_VkFFT_double_dct.h"
 #include "sample_18_precision_VkFFT_double_nonPow2.h"
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+#include "sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.h"
+#endif
 #endif
 #include "sample_50_convolution_VkFFT_single_1d_matrix.h"
 #include "sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.h"
 #include "sample_52_convolution_VkFFT_single_2d_batched_r2c.h"
 
 #include "sample_100_benchmark_VkFFT_single_nd_dct.h"
 #include "sample_101_benchmark_VkFFT_double_nd_dct.h"
-#include "sample_1000_VkFFT_single_2_4096.h"
+#include "sample_1000_benchmark_VkFFT_single_2_4096.h"
 #include "sample_1001_benchmark_VkFFT_double_2_4096.h"
+#include "sample_1002_benchmark_VkFFT_half_2_4096.h"
 #include "sample_1003_benchmark_VkFFT_single_3d_2_512.h"
-
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+#include "sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.h"
+#endif
 #ifdef USE_cuFFT
 #include "user_benchmark_cuFFT.h"
 #include "sample_0_benchmark_cuFFT_single.h"
@@ -337,6 +346,13 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
         resFFT = sample_8_benchmark_VkFFT_double_Bluestein(vkGPU, file_output, output, isCompilerInitialized);
         break;
     }
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+	case 9:
+	{
+		resFFT = sample_9_benchmark_VkFFT_quadDoubleDouble(vkGPU, file_output, output, isCompilerInitialized);
+		break;
+	}
+#endif
 #if(VKFFT_BACKEND==0)
     case 10:
     {
@@ -385,6 +401,13 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
         resFFT = sample_18_precision_VkFFT_double_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
         break;
     }
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+	case 19:
+    {
+        resFFT = sample_19_precision_VkFFT_quadDoubleDouble_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
+        break;
+    }
+#endif
 #endif
     case 50:
     {
@@ -453,19 +476,31 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
     }
     case 1000:
     {
-        resFFT = sample_1000_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized);
+        resFFT = sample_1000_benchmark_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized);
         break;
     }
     case 1001:
     {
         resFFT = sample_1001_benchmark_VkFFT_double_2_4096(vkGPU, file_output, output, isCompilerInitialized);
         break;
     }
+	case 1002:
+	{
+		resFFT = sample_1002_benchmark_VkFFT_half_2_4096(vkGPU, file_output, output, isCompilerInitialized);
+		break;
+	}
     case 1003:
     {
         resFFT = sample_1003_benchmark_VkFFT_single_3d_2_512(vkGPU, file_output, output, isCompilerInitialized);
         break;
     }
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128	
+	case 1004:
+	{
+		resFFT = sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096(vkGPU, file_output, output, isCompilerInitialized);
+		break;
+	}
+#endif
     }
 #if(VKFFT_BACKEND==0)
 	vkDestroyFence(vkGPU->device, vkGPU->fence, NULL);
@@ -525,7 +560,7 @@ int main(int argc, char* argv[])
 		version_decomposed[0] = version / 10000;
 		version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
 		version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
-		printf("VkFFT v%d.%d.%d (01-08-2023). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
+		printf("VkFFT v%d.%d.%d (25-09-2023). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
 #if (VKFFT_BACKEND==0)
 		printf("Vulkan backend\n");
 #elif (VKFFT_BACKEND==1)
@@ -553,6 +588,9 @@ int main(int argc, char* argv[])
 		printf("		6 - FFT + iFFT R2C / C2R benchmark\n");
 		printf("		7 - FFT + iFFT C2C Bluestein benchmark in single precision\n");
 		printf("		8 - FFT + iFFT C2C Bluestein benchmark in double precision\n");
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+		printf("		9 - FFT + iFFT C2C benchmark 1D batched in quad double-double precision LUT\n");
+#endif
 #if (VKFFT_BACKEND==0)
 		printf("		10 - multiple buffer(4 by default) split version of benchmark 0\n");
 #endif
@@ -566,6 +604,9 @@ int main(int argc, char* argv[])
 		printf("		16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n");
 		printf("		17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n");
 		printf("		18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n");
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+		printf("		19 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in quad double-double precision\n");
+#endif
 #elif USE_rocFFT
 		printf("		11 - VkFFT / rocFFT / FFTW C2C precision test in single precision\n");
 		printf("		12 - VkFFT / rocFFT / FFTW C2C precision test in double precision\n");
@@ -575,6 +616,9 @@ int main(int argc, char* argv[])
 		printf("		16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n");
 		printf("		17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n");
 		printf("		18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n");
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+		printf("		19 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in quad double-double precision\n");
+#endif
 #else
 		printf("		11 - VkFFT / FFTW C2C precision test in single precision\n");
 		printf("		12 - VkFFT / FFTW C2C precision test in double precision\n");
@@ -584,6 +628,9 @@ int main(int argc, char* argv[])
 		printf("		16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n");
 		printf("		17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n");
 		printf("		18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n");
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+		printf("		19 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in quad double-double precision\n");
+#endif
 #endif
 #endif
 		printf("		50 - convolution example with identity kernel\n");
@@ -600,11 +647,15 @@ int main(int argc, char* argv[])
 
 		printf("		1000 - FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096\n");
 		printf("		1001 - FFT + iFFT C2C benchmark 1D batched in double precision: all supported systems from 2 to 4096\n");
+		printf("		1002 - FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096\n");
 		printf("		1003 - FFT + iFFT C2C multidimensional benchmark in single precision: all supported cubes from 2 to 512\n");
+#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
+		printf("		1004 - FFT + iFFT C2C benchmark 1D batched in quad double-double precision: all supported systems from 2 to 4096\n");
+#endif
 		printf("	-benchmark_vkfft: run VkFFT benchmark on a user-defined system:\n\
 		-X uint, -Y uint, -Z uint - FFT dimensions (default Y and Z are 1)\n");
 		printf("\
-		-P uint - precision (0 - single, 1 - double, 2 - half) (default 0)\n");
+		-P uint - precision (0 - single, 1 - double, 2 - half, 3 - quad double-double) (default 0)\n");
 		printf("\
 		-B uint - number of batched systems (default 1)\n\
 		-N uint - number of consecutive FFT+iFFT iterations (default 1)\n\

diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1000_VkFFT_single_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1000_VkFFT_single_2_4096.h
diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1000_benchmark_VkFFT_single_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1000_benchmark_VkFFT_single_2_4096.h
@@ -0,0 +1,4 @@
+#include "vkFFT.h"
+#include "utils_VkFFT.h"
+
+VkFFTResult sample_1000_benchmark_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized);