diff --git a/CMakeLists.txt b/CMakeLists.txt index 141c1f0f..626b4417 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,15 @@ cmake_minimum_required(VERSION 3.11) project(VkFFT_TestSuite) + +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_CONFIGURATION_TYPES "Release" CACHE STRING "" FORCE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) +endif() + +if (NOT DEFINED GLSLANG_GIT_TAG) + set(GLSLANG_GIT_TAG "12.3.1") +endif() + include(FetchContent) set(VKFFT_BACKEND 0 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 - Metal") @@ -18,7 +26,8 @@ else() endif() option(build_VkFFT_FFTW_precision "Build VkFFT FFTW precision comparison" OFF) -option(VkFFT_use_FP128_Bluestein_RaderFFT "Use FP128 for Bluestein and Rader FFT kernel calculations. Currently requires FP128 FFT library, like FFTWl" OFF) +option(VkFFT_use_FP128_Bluestein_RaderFFT "Use LD for Bluestein and Rader FFT kernel calculations. Currently requires LD FFT library, like FFTWl, will be reworked" OFF) +option(VkFFT_use_FP128_double_double "Build VkFFT quad double-double" OFF) if (MSVC) set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME}) @@ -37,6 +46,7 @@ if(build_VkFFT_FFTW_precision) benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp + benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp @@ -46,14 +56,17 @@ if(build_VkFFT_FFTW_precision) benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp + benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp - benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp - benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp) + benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp) else() add_executable(${PROJECT_NAME} VkFFT_TestSuite.cpp benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp @@ -67,15 +80,18 @@ else() benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp 
benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp + benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp - benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp - benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp) + benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp + benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp) endif() target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11) add_definitions(-DVKFFT_BACKEND=${VKFFT_BACKEND}) @@ -135,10 +151,15 @@ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulk if(VkFFT_use_FP128_Bluestein_RaderFFT) target_compile_definitions(${PROJECT_NAME} PUBLIC -DVkFFT_use_FP128_Bluestein_RaderFFT) endif() +if(VkFFT_use_FP128_double_double) + target_compile_definitions(${PROJECT_NAME} PUBLIC -DVKFFT_USE_DOUBLEDOUBLE_FP128) + target_link_libraries(${PROJECT_NAME} PUBLIC quadmath) +endif() if(${VKFFT_BACKEND} EQUAL 0) + set(ENABLE_OPT 0) FetchContent_Declare( glslang-main - GIT_TAG "origin/main" + GIT_TAG ${GLSLANG_GIT_TAG} GIT_REPOSITORY https://github.com/KhronosGroup/glslang SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/glslang-main ) @@ -150,12 +171,12 @@ if(${VKFFT_BACKEND} EQUAL 0) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main) endif() -target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/) add_library(VkFFT INTERFACE) +target_include_directories(VkFFT INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/) target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=${VKFFT_BACKEND}) -target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/) add_library(half INTERFACE) +target_include_directories(half INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/) target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/vkFFT_scripts/include/) @@ -165,6 +186,7 @@ elseif(${VKFFT_BACKEND} EQUAL 1) find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64" "${LIBNVRTC_LIBRARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64 /usr/local/cuda/lib64) add_definitions(-DCUDA_TOOLKIT_ROOT_DIR="${CUDA_TOOLKIT_ROOT_DIR}") target_link_libraries(${PROJECT_NAME} PUBLIC ${CUDA_LIBRARIES} cuda ${CUDA_NVRTC_LIB} VkFFT half) + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}) elseif(${VKFFT_BACKEND} EQUAL 2) target_link_libraries(${PROJECT_NAME} PUBLIC hip::host VkFFT half) elseif(${VKFFT_BACKEND} EQUAL 3) @@ -194,6 +216,16 @@ if(build_VkFFT_FFTW_precision OR VkFFT_use_FP128_Bluestein_RaderFFT) NO_DEFAULT_PATH ) target_include_directories(${PROJECT_NAME} PUBLIC 
${FFTW_INCLUDES}) +if(VkFFT_use_FP128_double_double) + find_library( + FFTWQ_LIB + NAMES "libfftw3q" "fftw3q" + PATHS ${FFTW3_LIB_DIR} + PATH_SUFFIXES "lib" "lib64" + NO_DEFAULT_PATH + ) + target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWQ_LIB}) +endif() if(VkFFT_use_FP128_Bluestein_RaderFFT) find_library( FFTWL_LIB @@ -202,10 +234,9 @@ if(VkFFT_use_FP128_Bluestein_RaderFFT) PATH_SUFFIXES "lib" "lib64" NO_DEFAULT_PATH ) - target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB} ${FFTWL_LIB}) -else() - target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB}) + target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWL_LIB}) endif() + target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB}) endif() if(build_VkFFT_cuFFT_benchmark) @@ -253,6 +284,7 @@ if(build_VkFFT_cuFFT_benchmark) -gencode arch=compute_80,code=compute_80 -gencode arch=compute_86,code=compute_86>") target_include_directories(cuFFT_scripts PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/cuFFT_scripts/include) + target_include_directories(cuFFT_scripts PUBLIC ${CUDA_INCLUDE_DIRS}) set_target_properties(cuFFT_scripts PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(cuFFT_scripts PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(${PROJECT_NAME} PUBLIC cuFFT_scripts) diff --git a/README.md b/README.md index aa1a4cf9..0f3d28b4 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li - Radix-2/3/4/5/7/8/11/13 FFT. Sequences using radix 3, 5, 7, 11 and 13 have comparable performance to that of powers of 2. - Rader's FFT algorithm for primes from 17 up to max shared memory length (~10000). Inlined and done without additional memory transfers. - Bluestein's FFT algorithm for all other sequences. Full coverage of C2C range, single upload (2^12, 2^12, 2^12) for R2C/C2R/R2R. Optimized to have as few memory transfers as possible by using zero padding and merged convolution support of VkFFT. - - Single, double and half precision support. Double precision uses CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data. + - Single, double, half and quad (double-double) precision support. Double and quad precision uses CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data. - All transformations are performed in-place with no performance loss. Out-of-place transforms are supported by selecting different input/output buffers. - No additional transposition uploads. Note: Data can be reshuffled after the Four Step FFT algorithm with an additional buffer (for big sequences). Doesn't matter for convolutions - they return to the input ordering (saves memory). - Complex to complex (C2C), real to complex (R2C), complex to real (C2R) transformations and real to real (R2R) Discrete Cosine Transformations of types I, II, III and IV. R2R, R2C and C2R are optimized to run up to 2x times faster than C2C and take 2x less memory. @@ -33,19 +33,19 @@ Include the vkFFT.h file and glslang compiler. Provide the library with correctl For single and double precision, Vulkan 1.0 is required. For half precision, Vulkan 1.1 is required. CUDA/HIP: -Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\ +Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. 
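A minimal sketch of the include pattern this README section describes, assuming hypothetical consumer code outside this repository; the VKFFT_BACKEND value is normally supplied by the build system (this CMakeLists passes it via add_definitions(-DVKFFT_BACKEND=${VKFFT_BACKEND})):

// Hypothetical application code, not part of this patch.
#ifndef VKFFT_BACKEND
#define VKFFT_BACKEND 1 // 0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 - Metal
#endif
#include "vkFFT.h"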
Provide the library with correctly chosen VKFFT_BACKEND definition.\ To build CUDA/HIP version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the correct one and optionally enable FFTW. VKFFT_BACKEND=1 for CUDA, VKFFT_BACKEND=2 for HIP. OpenCL: -Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\ +Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition.\ To build OpenCL version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 3 and optionally enable FFTW. Level Zero: -Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls. Only single/double precision for now.\ +Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls.\ To build Level Zero version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 4 and optionally enable FFTW. Metal: -Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as a C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp. Only single precision.\ +Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as a C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp.\ To build Metal version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 5 and optionally enable FFTW. ## Command-line interface diff --git a/VkFFT_TestSuite.cpp b/VkFFT_TestSuite.cpp index 54ebb314..6497d633 100644 --- a/VkFFT_TestSuite.cpp +++ b/VkFFT_TestSuite.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -62,6 +62,9 @@ #include "sample_6_benchmark_VkFFT_single_r2c.h" #include "sample_7_benchmark_VkFFT_single_Bluestein.h" #include "sample_8_benchmark_VkFFT_double_Bluestein.h" +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 +#include "sample_9_benchmark_VkFFT_quadDoubleDouble.h" +#endif #include "sample_10_benchmark_VkFFT_single_multipleBuffers.h" #ifdef USE_FFTW #include "sample_11_precision_VkFFT_single.h" @@ -72,6 +75,9 @@ #include "sample_16_precision_VkFFT_single_dct.h" #include "sample_17_precision_VkFFT_double_dct.h" #include "sample_18_precision_VkFFT_double_nonPow2.h" +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 +#include "sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.h" +#endif #endif #include "sample_50_convolution_VkFFT_single_1d_matrix.h" #include "sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.h" @@ -79,10 +85,13 @@ #include "sample_100_benchmark_VkFFT_single_nd_dct.h" #include "sample_101_benchmark_VkFFT_double_nd_dct.h" -#include "sample_1000_VkFFT_single_2_4096.h" +#include "sample_1000_benchmark_VkFFT_single_2_4096.h" #include "sample_1001_benchmark_VkFFT_double_2_4096.h" +#include "sample_1002_benchmark_VkFFT_half_2_4096.h" #include "sample_1003_benchmark_VkFFT_single_3d_2_512.h" - +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 +#include "sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.h" +#endif #ifdef USE_cuFFT #include "user_benchmark_cuFFT.h" #include "sample_0_benchmark_cuFFT_single.h" @@ -337,6 +346,13 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE resFFT = sample_8_benchmark_VkFFT_double_Bluestein(vkGPU, file_output, output, isCompilerInitialized); break; } +#ifdef 
VKFFT_USE_DOUBLEDOUBLE_FP128 + case 9: + { + resFFT = sample_9_benchmark_VkFFT_quadDoubleDouble(vkGPU, file_output, output, isCompilerInitialized); + break; + } +#endif #if(VKFFT_BACKEND==0) case 10: { @@ -385,6 +401,13 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE resFFT = sample_18_precision_VkFFT_double_nonPow2(vkGPU, file_output, output, isCompilerInitialized); break; } +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + case 19: + { + resFFT = sample_19_precision_VkFFT_quadDoubleDouble_nonPow2(vkGPU, file_output, output, isCompilerInitialized); + break; + } +#endif #endif case 50: { @@ -441,19 +464,21 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 4); break; } - case 200: case 201: + case 200: case 201: case 202: { resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams); break; } - case 202: +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + case 203: { resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams); break; } +#endif case 1000: { - resFFT = sample_1000_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized); + resFFT = sample_1000_benchmark_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized); break; } case 1001: @@ -461,11 +486,23 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE resFFT = sample_1001_benchmark_VkFFT_double_2_4096(vkGPU, file_output, output, isCompilerInitialized); break; } + case 1002: + { + resFFT = sample_1002_benchmark_VkFFT_half_2_4096(vkGPU, file_output, output, isCompilerInitialized); + break; + } case 1003: { resFFT = sample_1003_benchmark_VkFFT_single_3d_2_512(vkGPU, file_output, output, isCompilerInitialized); break; } +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + case 1004: + { + resFFT = sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096(vkGPU, file_output, output, isCompilerInitialized); + break; + } +#endif } #if(VKFFT_BACKEND==0) vkDestroyFence(vkGPU->device, vkGPU->fence, NULL); @@ -525,7 +562,7 @@ int main(int argc, char* argv[]) version_decomposed[0] = version / 10000; version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100; version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100); - printf("VkFFT v%d.%d.%d (01-08-2023). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]); + printf("VkFFT v%d.%d.%d (23-10-2023). 
Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]); #if (VKFFT_BACKEND==0) printf("Vulkan backend\n"); #elif (VKFFT_BACKEND==1) @@ -553,6 +590,9 @@ int main(int argc, char* argv[]) printf(" 6 - FFT + iFFT R2C / C2R benchmark\n"); printf(" 7 - FFT + iFFT C2C Bluestein benchmark in single precision\n"); printf(" 8 - FFT + iFFT C2C Bluestein benchmark in double precision\n"); +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + printf(" 9 - FFT + iFFT C2C benchmark 1D batched in double-double emulation of quad precision LUT\n"); +#endif #if (VKFFT_BACKEND==0) printf(" 10 - multiple buffer(4 by default) split version of benchmark 0\n"); #endif @@ -566,6 +606,9 @@ int main(int argc, char* argv[]) printf(" 16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n"); printf(" 17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n"); printf(" 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n"); +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + printf(" 19 - VkFFT / FFTW C2C precision test in double-double emulation of quad precision\n"); +#endif #elif USE_rocFFT printf(" 11 - VkFFT / rocFFT / FFTW C2C precision test in single precision\n"); printf(" 12 - VkFFT / rocFFT / FFTW C2C precision test in double precision\n"); @@ -575,6 +618,9 @@ int main(int argc, char* argv[]) printf(" 16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n"); printf(" 17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n"); printf(" 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n"); +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + printf(" 19 - VkFFT / FFTW C2C precision test in double-double emulation of quad precision\n"); +#endif #else printf(" 11 - VkFFT / FFTW C2C precision test in single precision\n"); printf(" 12 - VkFFT / FFTW C2C precision test in double precision\n"); @@ -584,6 +630,9 @@ int main(int argc, char* argv[]) printf(" 16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single precision\n"); printf(" 17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double precision\n"); printf(" 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision\n"); +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + printf(" 19 - VkFFT / FFTW C2C precision test in double-double emulation of quad precision\n"); +#endif #endif #endif printf(" 50 - convolution example with identity kernel\n"); @@ -600,11 +649,15 @@ int main(int argc, char* argv[]) printf(" 1000 - FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096\n"); printf(" 1001 - FFT + iFFT C2C benchmark 1D batched in double precision: all supported systems from 2 to 4096\n"); + printf(" 1002 - FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096\n"); printf(" 1003 - FFT + iFFT C2C multidimensional benchmark in single precision: all supported cubes from 2 to 512\n"); +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + printf(" 1004 - FFT + iFFT C2C benchmark 1D batched in double-double emulation of quad precision: all supported systems from 2 to 4096\n"); +#endif printf(" -benchmark_vkfft: run VkFFT benchmark on a user-defined system:\n\ -X uint, -Y uint, -Z uint - FFT dimensions (default Y and Z are 1)\n"); printf("\ - -P uint - precision (0 - single, 1 - double, 2 - half) (default 0)\n"); + -P uint - precision (0 - single, 1 - double, 2 - half, 3 - 
double-double) (default 0)\n"); printf("\ -B uint - number of batched systems (default 1)\n\ -N uint - number of consecutive FFT+iFFT iterations (default 1)\n\ diff --git a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_double.cu b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_double.cu index 14fc4630..ec7b52ea 100644 --- a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_double.cu +++ b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_double.cu @@ -27,7 +27,7 @@ void launch_precision_cuFFT_double(void* inputC, void* output_cuFFT, int device_ fprintf(stderr, "Cuda error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch (dims[4]) { case 1: cufftPlan1d(&planZ2Z, dims[0], CUFFT_Z2Z, 1); break; diff --git a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_half.cu b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_half.cu index 42f67756..d3365c95 100644 --- a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_half.cu +++ b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_half.cu @@ -33,7 +33,7 @@ void launch_precision_cuFFT_half(void* inputC, void* output_cuFFT, int device_id cufftResult res = cufftCreate(&planHalf); size_t ws = 0; long long local_dims[3]; - switch (dims[3]) { + switch (dims[4]) { case 1: local_dims[0] = (long long)dims[0]; local_dims[1] = (long long)dims[1]; diff --git a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_r2c.cu b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_r2c.cu index 2e41977b..efb0afad 100644 --- a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_r2c.cu +++ b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_r2c.cu @@ -29,7 +29,7 @@ void launch_precision_cuFFT_r2c(void* inputC, void* output_cuFFT, int device_id, fprintf(stderr, "Cuda error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch (dims[4]) { case 1: cufftPlan1d(&planR2C, dims[0], CUFFT_R2C, 1); break; diff --git a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_single.cu b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_single.cu index 1d418490..059d54cd 100644 --- a/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_single.cu +++ b/benchmark_scripts/cuFFT_scripts/src/precision_cuFFT_single.cu @@ -27,7 +27,7 @@ void launch_precision_cuFFT_single(void* inputC, void* output_cuFFT, int device_ fprintf(stderr, "Cuda error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch (dims[4]) { case 1: cufftPlan1d(&planC2C, dims[0], CUFFT_C2C, 1); break; diff --git a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_double.cpp b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_double.cpp index 8f9711a1..165daa40 100644 --- a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_double.cpp +++ b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_double.cpp @@ -26,7 +26,7 @@ void launch_precision_rocFFT_double(void* inputC, void* output_rocFFT, int devic fprintf(stderr, "ROCM error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch (dims[4]) { case 1: hipfftPlan1d(&planZ2Z, dims[0], HIPFFT_Z2Z, 1); break; diff --git a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_r2c.cpp b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_r2c.cpp index 6b80f2fe..9dd03224 100644 --- a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_r2c.cpp +++ b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_r2c.cpp @@ -28,7 +28,7 @@ void launch_precision_rocFFT_r2c(void* inputC, void* output_rocFFT, int device_i fprintf(stderr, "ROCM error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch 
(dims[4]) { case 1: hipfftPlan1d(&planR2C, dims[0], HIPFFT_R2C, 1); break; diff --git a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_single.cpp b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_single.cpp index b30fe604..cfb57cbb 100644 --- a/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_single.cpp +++ b/benchmark_scripts/rocFFT_scripts/src/precision_rocFFT_single.cpp @@ -26,7 +26,7 @@ void launch_precision_rocFFT_single(void* inputC, void* output_rocFFT, int devic fprintf(stderr, "ROCM error: Failed to allocate\n"); return; } - switch (dims[3]) { + switch (dims[4]) { case 1: hipfftPlan1d(&planC2C, dims[0], HIPFFT_C2C, 1); break; diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1000_VkFFT_single_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1000_VkFFT_single_2_4096.h deleted file mode 100644 index e07c6418..00000000 --- a/benchmark_scripts/vkFFT_scripts/include/sample_1000_VkFFT_single_2_4096.h +++ /dev/null @@ -1,4 +0,0 @@ -#include "vkFFT.h" -#include "utils_VkFFT.h" - -VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); \ No newline at end of file diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1000_benchmark_VkFFT_single_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1000_benchmark_VkFFT_single_2_4096.h new file mode 100644 index 00000000..545b339c --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_1000_benchmark_VkFFT_single_2_4096.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_1000_benchmark_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1002_benchmark_VkFFT_half_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1002_benchmark_VkFFT_half_2_4096.h new file mode 100644 index 00000000..608e5948 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_1002_benchmark_VkFFT_half_2_4096.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_1002_benchmark_VkFFT_half_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.h b/benchmark_scripts/vkFFT_scripts/include/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.h new file mode 100644 index 00000000..84aa6fb9 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.h b/benchmark_scripts/vkFFT_scripts/include/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.h new file mode 100644 index 00000000..4f106420 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_19_precision_VkFFT_quadDoubleDouble_nonPow2(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); \ No newline at end of file diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_9_benchmark_VkFFT_quadDoubleDouble.h 
b/benchmark_scripts/vkFFT_scripts/include/sample_9_benchmark_VkFFT_quadDoubleDouble.h new file mode 100644 index 00000000..668d09c4 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_9_benchmark_VkFFT_quadDoubleDouble.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_9_benchmark_VkFFT_quadDoubleDouble(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); diff --git a/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h b/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h index 13871188..78341521 100644 --- a/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h +++ b/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h @@ -16,6 +16,9 @@ typedef struct { VkFence fence;//a vkGPU->fence used to synchronize dispatches std::vector enabledDeviceExtensions; uint64_t enableValidationLayers; + + VkBuffer* stagingBuffer;//optional pointer to the user defined staging buffer + VkDeviceMemory* stagingBufferMemory;//optional pointer to the user defined staging buffer memory, associated with the stagingBuffer #elif(VKFFT_BACKEND==1) CUdevice device; CUcontext context; diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp similarity index 98% rename from benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp rename to benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp index 00942e59..1f7fef54 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp @@ -47,7 +47,7 @@ #include "vkFFT.h" #include "utils_VkFFT.h" -VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +VkFFTResult sample_1000_benchmark_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp new file mode 100644 index 00000000..98ad9925 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp @@ -0,0 +1,295 @@ +//general parts +#include +#include +#include +#include +#include +#include +#include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#if(VKFFT_BACKEND==0) +#include "vulkan/vulkan.h" +#include "glslang_c_interface.h" +#elif(VKFFT_BACKEND==1) +#include +#include +#include +#include +#include +#elif(VKFFT_BACKEND==2) +#ifndef __HIP_PLATFORM_HCC__ +#define __HIP_PLATFORM_HCC__ +#endif +#include +#include +#include +#include +#elif(VKFFT_BACKEND==3) +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#ifdef __APPLE__ +#include +#else +#include +#endif +#elif(VKFFT_BACKEND==4) +#include +#elif(VKFFT_BACKEND==5) +#include "Foundation/Foundation.hpp" +#include "QuartzCore/QuartzCore.hpp" +#include "Metal/Metal.hpp" +#endif +#include "vkFFT.h" +#include "half.hpp" +#include "utils_VkFFT.h" +using half_float::half; + +VkFFTResult sample_1002_benchmark_VkFFT_half_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +{ + VkFFTResult resFFT = VKFFT_SUCCESS; +#if(VKFFT_BACKEND==0) + VkResult 
res = VK_SUCCESS; +#elif(VKFFT_BACKEND==1) + cudaError_t res = cudaSuccess; +#elif(VKFFT_BACKEND==2) + hipError_t res = hipSuccess; +#elif(VKFFT_BACKEND==3) + cl_int res = CL_SUCCESS; +#elif(VKFFT_BACKEND==4) + ze_result_t res = ZE_RESULT_SUCCESS; +#elif(VKFFT_BACKEND==5) +#endif + if (file_output) + fprintf(output, "1002 - VkFFT FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096\n"); + printf("1002 - VkFFT FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096\n"); + const int num_runs = 3; + double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples + //memory allocated on the CPU once, makes benchmark completion faster + avoids performance issues connected to frequent allocation/deallocation. + float* buffer_input = (float*)malloc((uint64_t)4 * 2 * (uint64_t)pow(2, 27)); + if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t i = 0; i < 2 * (uint64_t)pow(2, 27); i++) { + buffer_input[i] = (float)(2 * ((float)rand()) / RAND_MAX - 1.0); + } + int num_systems = 0; + for (uint64_t n = 1; n < 4097; n++) { + double run_time[num_runs]; + for (uint64_t r = 0; r < num_runs; r++) { + //Configuration + FFT application . + VkFFTConfiguration configuration = {}; + VkFFTApplication app = {}; + //FFT + iFFT sample code. + //Setting up FFT configuration for forward and inverse FFT. + configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.size[0] = n;// 4 * pow(2, n); //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. + if (n == 1) configuration.size[0] = 4096; + uint64_t temp = configuration.size[0]; + configuration.halfPrecision = true; + /*for (uint64_t j = 2; j < 14; j++) + { + if (temp % j == 0) { + temp /= j; + j = 1; + } + } + if (temp != 1) break;*/ + configuration.numberBatches = (uint64_t)pow(2, (uint64_t)log2((uint64_t)64 * 32 * (uint64_t)pow(2, 16) / configuration.size[0])); + if (configuration.numberBatches < 1) configuration.numberBatches = 1; +#if(VKFFT_BACKEND!=5) + if (r==0) configuration.saveApplicationToString = 1; + if (r!=0) configuration.loadApplicationFromString = 1; +#endif + + //After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored. +#if(VKFFT_BACKEND==5) + configuration.device = vkGPU->device; +#else + configuration.device = &vkGPU->device; +#endif +#if(VKFFT_BACKEND==0) + configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers + configuration.fence = &vkGPU->fence; + configuration.commandPool = &vkGPU->commandPool; + configuration.physicalDevice = &vkGPU->physicalDevice; + configuration.isCompilerInitialized = isCompilerInitialized;//compiler can be initialized before VkFFT plan creation. 
if not, VkFFT will create and destroy one after initialization +#elif(VKFFT_BACKEND==3) + configuration.context = &vkGPU->context; +#elif(VKFFT_BACKEND==4) + configuration.context = &vkGPU->context; + configuration.commandQueue = &vkGPU->commandQueue; + configuration.commandQueueID = vkGPU->commandQueueID; +#elif(VKFFT_BACKEND==5) + configuration.queue = vkGPU->queue; +#endif + //Allocate buffer for the input data. + uint64_t bufferSize = (uint64_t)sizeof(half) * 2 * configuration.size[0] * configuration.numberBatches; +#if(VKFFT_BACKEND==0) + VkBuffer buffer = {}; + VkDeviceMemory bufferDeviceMemory = {}; + resFFT = allocateBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==1) + cuFloatComplex* buffer = 0; + res = cudaMalloc((void**)&buffer, bufferSize); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + hipFloatComplex* buffer = 0; + res = hipMalloc((void**)&buffer, bufferSize); + if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==3) + cl_mem buffer = 0; + buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize, 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==4) + void* buffer = 0; + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(half), vkGPU->device, &buffer); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* buffer = 0; + buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); + configuration.buffer = &buffer; +#endif + + configuration.bufferSize = &bufferSize; + + //Fill data on CPU. It is best to perform all operations on GPU after initial upload. + /*float* buffer_input = (float*)malloc(bufferSize); + + for (uint64_t k = 0; k < configuration.size[2]; k++) { + for (uint64_t j = 0; j < configuration.size[1]; j++) { + for (uint64_t i = 0; i < configuration.size[0]; i++) { + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1])] = 2 * ((float)rand()) / RAND_MAX - 1.0; + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1]) + 1] = 2 * ((float)rand()) / RAND_MAX - 1.0; + } + } + } + + */ + //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers. 
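// Sketch of an optional pattern (assumption, not used by this sample): utils_VkFFT.h now
// exposes VkGPU::stagingBuffer / VkGPU::stagingBufferMemory pointers for a user-defined
// staging buffer (Vulkan backend). If the transfer helpers honor them, a caller could
// allocate one staging buffer up front and reuse it across uploads, e.g.:
//   VkBuffer userStaging = {};              // hypothetical, allocated once by the caller
//   VkDeviceMemory userStagingMemory = {};  // hypothetical, bound to userStaging
//   vkGPU->stagingBuffer = &userStaging;
//   vkGPU->stagingBufferMemory = &userStagingMemory;
// Leaving both pointers unset presumably keeps the previous per-transfer staging behavior.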
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + if (configuration.loadApplicationFromString) { + FILE* kernelCache; + uint64_t str_len; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "rb"); + if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE; + fseek(kernelCache, 0, SEEK_END); + str_len = ftell(kernelCache); + fseek(kernelCache, 0, SEEK_SET); + configuration.loadApplicationString = malloc(str_len); + fread(configuration.loadApplicationString, str_len, 1, kernelCache); + fclose(kernelCache); + } + //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. + resFFT = initializeVkFFT(&app, configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + if (configuration.loadApplicationFromString) + free(configuration.loadApplicationString); + + if (configuration.saveApplicationToString) { + FILE* kernelCache; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "wb"); + fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache); + fclose(kernelCache); + } + + //Submit FFT+iFFT. + uint64_t num_iter = (((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize; +#if(VKFFT_BACKEND==0) + if (vkGPU->physicalDeviceProperties.vendorID == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#elif(VKFFT_BACKEND==3) + cl_uint vendorID; + clGetDeviceInfo(vkGPU->device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0); + if (vendorID == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#elif(VKFFT_BACKEND==4) + ze_device_properties_t device_properties; + res = zeDeviceGetProperties(vkGPU->device, &device_properties); + if (res != 0) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; + if (device_properties.vendorId == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#endif + if (num_iter == 0) num_iter = 1; + double totTime = 0; + VkFFTLaunchParams launchParams = {}; + resFFT = performVulkanFFTiFFT(vkGPU, &app, &launchParams, num_iter, &totTime); + if (resFFT != VKFFT_SUCCESS) return resFFT; + run_time[r] = totTime; + if (n > 1) { + if (r == num_runs - 1) { + num_systems++; + double std_error = 0; + double avg_time = 0; + for (uint64_t t = 0; t < num_runs; t++) { + avg_time += run_time[t]; + } + avg_time /= num_runs; + for (uint64_t t = 0; t < num_runs; t++) { + std_error += (run_time[t] - avg_time) * (run_time[t] - avg_time); + } + std_error = sqrt(std_error / num_runs); + uint64_t num_tot_transfers = 0; + for (uint64_t i = 0; i < configuration.FFTdim; i++) + num_tot_transfers += app.localFFTPlan->numAxisUploads[i]; + num_tot_transfers *= 4; + if (file_output) + fprintf(output, "VkFFT System: %" PRIu64 " %" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f num_iter: %" PRIu64 " benchmark: %" PRIu64 " scaled_bandwidth: %0.1f device_bandwidth: %0.1f\n", configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize / 512) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * 4 / avg_time, bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + + printf("VkFFT System: %" PRIu64 " %" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f 
num_iter: %" PRIu64 " benchmark: %" PRIu64 " scaled_bandwidth: %0.1f device_bandwidth: %0.1f\n", configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize / 512) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * 4 / avg_time, bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + benchmark_result += ((double)bufferSize / 512) / avg_time; + } + + + } + +#if(VKFFT_BACKEND==0) + vkDestroyBuffer(vkGPU->device, buffer, NULL); + vkFreeMemory(vkGPU->device, bufferDeviceMemory, NULL); +#elif(VKFFT_BACKEND==1) + cudaFree(buffer); +#elif(VKFFT_BACKEND==2) + hipFree(buffer); +#elif(VKFFT_BACKEND==3) + clReleaseMemObject(buffer); +#elif(VKFFT_BACKEND==4) + zeMemFree(vkGPU->context, buffer); +#elif(VKFFT_BACKEND==5) + buffer->release(); +#endif + + deleteVkFFT(&app); + + } + } + free(buffer_input); + benchmark_result /= (num_systems); + + if (file_output) { + fprintf(output, "Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + fprintf(output, "Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + } + printf("Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + printf("Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + return resFFT; +} diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp new file mode 100644 index 00000000..c1f32005 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp @@ -0,0 +1,294 @@ +//general parts +#include +#include +#include +#include +#include +#include +#include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#if(VKFFT_BACKEND==0) +#include "vulkan/vulkan.h" +#include "glslang_c_interface.h" +#elif(VKFFT_BACKEND==1) +#include +#include +#include +#include +#include +#elif(VKFFT_BACKEND==2) +#ifndef __HIP_PLATFORM_HCC__ +#define __HIP_PLATFORM_HCC__ +#endif +#include +#include +#include +#include +#elif(VKFFT_BACKEND==3) +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#ifdef __APPLE__ +#include +#else +#include +#endif +#elif(VKFFT_BACKEND==4) +#include +#elif(VKFFT_BACKEND==5) +#include "Foundation/Foundation.hpp" +#include "QuartzCore/QuartzCore.hpp" +#include "Metal/Metal.hpp" +#endif +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +{ + VkFFTResult resFFT = VKFFT_SUCCESS; +#if(VKFFT_BACKEND==0) + VkResult res = VK_SUCCESS; +#elif(VKFFT_BACKEND==1) + cudaError_t res = cudaSuccess; +#elif(VKFFT_BACKEND==2) + hipError_t res = hipSuccess; +#elif(VKFFT_BACKEND==3) + cl_int res = CL_SUCCESS; +#elif(VKFFT_BACKEND==4) + ze_result_t res = ZE_RESULT_SUCCESS; +#elif(VKFFT_BACKEND==5) +#endif + if (file_output) + fprintf(output, "1004 - VkFFT FFT + iFFT C2C benchmark 1D batched in 
double-double emulation of quad precision: all supported systems from 2 to 4096\n"); + printf("1004 - VkFFT FFT + iFFT C2C benchmark 1D batched in double-double emulation of quad precision: all supported systems from 2 to 4096\n"); + const int num_runs = 3; + double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples + //memory allocated on the CPU once, makes benchmark completion faster + avoids performance issues connected to frequent allocation/deallocation. + double* buffer_input = (double*)malloc((uint64_t)8 * 2 * (uint64_t)pow(2, 27)); + if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t i = 0; i < 2 * (uint64_t)pow(2, 27); i++) { + buffer_input[i] = (double)(2 * ((double)rand()) / RAND_MAX - 1.0); + } + int num_systems = 0; + for (int n = 1; n < 4097; n++) { + double run_time[num_runs]; + for (uint64_t r = 0; r < num_runs; r++) { + //Configuration + FFT application . + VkFFTConfiguration configuration = {}; + VkFFTApplication app = {}; + //FFT + iFFT sample code. + //Setting up FFT configuration for forward and inverse FFT. + configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.size[0] = n;// 4 * pow(2, n); //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. + if (n == 1) configuration.size[0] = 4096; + uint64_t temp = configuration.size[0]; + + /*for (uint64_t j = 2; j < 14; j++) + { + if (temp % j == 0) { + temp /= j; + j = 1; + } + } + if (temp != 1) break;*/ + configuration.numberBatches = (uint64_t)pow(2, (uint64_t)log2((uint64_t)64 * 32 * (uint64_t)pow(2, 14) / configuration.size[0])); + if (configuration.numberBatches < 1) configuration.numberBatches = 1; + configuration.size[2] = 1; + + configuration.quadDoubleDoublePrecision = true; +#if(VKFFT_BACKEND!=5) + if (r==0) configuration.saveApplicationToString = 1; + if (r!=0) configuration.loadApplicationFromString = 1; +#endif + + //After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored. +#if(VKFFT_BACKEND==5) + configuration.device = vkGPU->device; +#else + configuration.device = &vkGPU->device; +#endif +#if(VKFFT_BACKEND==0) + configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers + configuration.fence = &vkGPU->fence; + configuration.commandPool = &vkGPU->commandPool; + configuration.physicalDevice = &vkGPU->physicalDevice; + configuration.isCompilerInitialized = isCompilerInitialized;//compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization +#elif(VKFFT_BACKEND==3) + configuration.context = &vkGPU->context; +#elif(VKFFT_BACKEND==4) + configuration.context = &vkGPU->context; + configuration.commandQueue = &vkGPU->commandQueue; + configuration.commandQueueID = vkGPU->commandQueueID; +#elif(VKFFT_BACKEND==5) + configuration.queue = vkGPU->queue; +#endif + + //Allocate buffer for the input data. 
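// Note: double-double ("quad") precision stores each real value as an unevaluated sum of
// two doubles (hi + lo), so one complex element needs 2 components * 2 doubles = 32 bytes;
// hence the sizeof(double) * 4 factor in the bufferSize computation below.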
+ uint64_t bufferSize = (uint64_t)sizeof(double) * 4 * configuration.size[0] * configuration.numberBatches; +#if(VKFFT_BACKEND==0) + VkBuffer buffer = {}; + VkDeviceMemory bufferDeviceMemory = {}; + resFFT = allocateBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==1) + cuFloatComplex* buffer = 0; + res = cudaMalloc((void**)&buffer, bufferSize); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + hipFloatComplex* buffer = 0; + res = hipMalloc((void**)&buffer, bufferSize); + if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==3) + cl_mem buffer = 0; + buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize, 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==4) + void* buffer = 0; + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* buffer = 0; + buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); + configuration.buffer = &buffer; +#endif + + configuration.bufferSize = &bufferSize; + //Fill data on CPU. It is best to perform all operations on GPU after initial upload. + /*float* buffer_input = (float*)malloc(bufferSize); + + for (uint64_t k = 0; k < configuration.size[2]; k++) { + for (uint64_t j = 0; j < configuration.size[1]; j++) { + for (uint64_t i = 0; i < configuration.size[0]; i++) { + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1])] = 2 * ((float)rand()) / RAND_MAX - 1.0; + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1]) + 1] = 2 * ((float)rand()) / RAND_MAX - 1.0; + } + } + } + */ + //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers. + resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //free(buffer_input); + + if (configuration.loadApplicationFromString) { + FILE* kernelCache; + uint64_t str_len; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "rb"); + if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE; + fseek(kernelCache, 0, SEEK_END); + str_len = ftell(kernelCache); + fseek(kernelCache, 0, SEEK_SET); + configuration.loadApplicationString = malloc(str_len); + fread(configuration.loadApplicationString, str_len, 1, kernelCache); + fclose(kernelCache); + } + //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. 
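// On the first of the num_runs passes (r == 0) the sample sets saveApplicationToString, so
// initializeVkFFT below also produces a binary blob that is written to "VkFFT_binary" right
// after initialization; the remaining passes set loadApplicationFromString and feed that file
// back in (the block above), so the shaders for the same plan do not have to be recompiled.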
+ resFFT = initializeVkFFT(&app, configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + if (configuration.loadApplicationFromString) + free(configuration.loadApplicationString); + + if (configuration.saveApplicationToString) { + FILE* kernelCache; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "wb"); + fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache); + fclose(kernelCache); + } + + //Submit FFT+iFFT. + uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize; +#if(VKFFT_BACKEND==0) + if (vkGPU->physicalDeviceProperties.vendorID == 0x8086) num_iter /= 4; +#elif(VKFFT_BACKEND==3) + cl_uint vendorID; + clGetDeviceInfo(vkGPU->device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0); + if (vendorID == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#elif(VKFFT_BACKEND==4) + ze_device_properties_t device_properties; + res = zeDeviceGetProperties(vkGPU->device, &device_properties); + if (res != 0) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; + if (device_properties.vendorId == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#endif + if (num_iter == 0) num_iter = 1; + double totTime = 0; + VkFFTLaunchParams launchParams = {}; + resFFT = performVulkanFFTiFFT(vkGPU, &app, &launchParams, num_iter, &totTime); + if (resFFT != VKFFT_SUCCESS) return resFFT; + run_time[r] = totTime; + if (n > 1) { + if (r == num_runs - 1) { + num_systems++; + double std_error = 0; + double avg_time = 0; + for (uint64_t t = 0; t < num_runs; t++) { + avg_time += run_time[t]; + } + avg_time /= num_runs; + for (uint64_t t = 0; t < num_runs; t++) { + std_error += (run_time[t] - avg_time) * (run_time[t] - avg_time); + } + std_error = sqrt(std_error / num_runs); + uint64_t num_tot_transfers = 0; + for (uint64_t i = 0; i < configuration.FFTdim; i++) + num_tot_transfers += app.localFFTPlan->numAxisUploads[i]; + num_tot_transfers *= 4; + if (file_output) + fprintf(output, "VkFFT System: %" PRIu64 " %" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f num_iter: %" PRIu64 " benchmark: %" PRIu64 " scaled_bandwidth: %0.1f device_bandwidth: %0.1f\n", configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * 4 / avg_time, bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + + printf("VkFFT System: %" PRIu64 " %" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f num_iter: %" PRIu64 " benchmark: %" PRIu64 " scaled_bandwidth: %0.1f device_bandwidth: %0.1f\n", configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * 4 / avg_time, bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + benchmark_result += ((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time; + } + + + } + +#if(VKFFT_BACKEND==0) + vkDestroyBuffer(vkGPU->device, buffer, NULL); + vkFreeMemory(vkGPU->device, bufferDeviceMemory, NULL); +#elif(VKFFT_BACKEND==1) + cudaFree(buffer); +#elif(VKFFT_BACKEND==2) + hipFree(buffer); +#elif(VKFFT_BACKEND==3) + clReleaseMemObject(buffer); 
+#elif(VKFFT_BACKEND==4) + zeMemFree(vkGPU->context, buffer); +#elif(VKFFT_BACKEND==5) + buffer->release(); +#endif + deleteVkFFT(&app); + + } + } + free(buffer_input); + benchmark_result /= (num_systems); + if (file_output) { + fprintf(output, "Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + fprintf(output, "Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + } + printf("Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + printf("Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + return resFFT; +} diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp index f753278d..aa862217 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp @@ -195,7 +195,7 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file } //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. resFFT = initializeVkFFT(&app, configuration); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) { + if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R) { if (r == num_runs - 1) { omitted_systems++; if (file_output) diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp index 2867664c..8440a243 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp @@ -196,7 +196,7 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file } //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. 
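// The unsupported-length error for DCT plans is now reported under the R2R naming
// (DCTs are exposed as real-to-real transforms), so the check below tests
// VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R instead of the former _DCT value.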
resFFT = initializeVkFFT(&app, configuration); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) { + if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R) { if (r == num_runs - 1) { omitted_systems++; if (file_output) diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp index 59e8ec9f..8c77f7a5 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp @@ -75,18 +75,18 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output, const int num_benchmark_samples = 63; const int num_runs = 1; - uint64_t benchmark_dimensions[num_benchmark_samples][4] = { {(uint64_t)pow(2,5), 1, 1, 1}, {(uint64_t)pow(2,6), 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1}, - {(uint64_t)pow(2,11), 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1}, - {(uint64_t)pow(2,19), 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1},{(uint64_t)pow(2,25), 1, 1, 1},{(uint64_t)pow(2,26), 1, 1, 1}, + uint64_t benchmark_dimensions[num_benchmark_samples][5] = { {(uint64_t)pow(2,5), 1, 1, 1, 1}, {(uint64_t)pow(2,6), 1, 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1, 1}, + {(uint64_t)pow(2,11), 1, 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1, 1}, + {(uint64_t)pow(2,19), 1, 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1, 1},{(uint64_t)pow(2,25), 1, 1, 1, 1},{(uint64_t)pow(2,26), 1, 1, 1, 1}, - {8, (uint64_t)pow(2,3), 1, 2},{8, (uint64_t)pow(2,4), 1, 2},{8, (uint64_t)pow(2,5), 1, 2},{8, (uint64_t)pow(2,6), 1, 2},{8, (uint64_t)pow(2,7), 1, 2},{8, (uint64_t)pow(2,8), 1, 2},{8, (uint64_t)pow(2,9), 1, 2},{8, (uint64_t)pow(2,10), 1, 2}, - {8, (uint64_t)pow(2,11), 1, 2},{8, (uint64_t)pow(2,12), 1, 2},{8, (uint64_t)pow(2,13), 1, 2},{8, (uint64_t)pow(2,14), 1, 2},{8, (uint64_t)pow(2,15), 1, 2},{8, (uint64_t)pow(2,16), 1, 2},{8, (uint64_t)pow(2,17), 1, 2},{8, (uint64_t)pow(2,18), 1, 2}, - {8, (uint64_t)pow(2,19), 1, 2},{8, (uint64_t)pow(2,20), 1, 2},{8, (uint64_t)pow(2,21), 1, 2},{8, (uint64_t)pow(2,22), 1, 2},{8, (uint64_t)pow(2,23), 1, 2},{8, (uint64_t)pow(2,24), 1, 2}, + {8, (uint64_t)pow(2,3), 1, 1, 2},{8, (uint64_t)pow(2,4), 1, 1, 2},{8, (uint64_t)pow(2,5), 1, 1, 2},{8, (uint64_t)pow(2,6), 1, 1, 2},{8, (uint64_t)pow(2,7), 1, 1, 2},{8, (uint64_t)pow(2,8), 1, 1, 2},{8, (uint64_t)pow(2,9), 1, 1, 2},{8, (uint64_t)pow(2,10), 1, 1, 2}, + {8, (uint64_t)pow(2,11), 1, 1, 2},{8, (uint64_t)pow(2,12), 1, 1, 2},{8, (uint64_t)pow(2,13), 1, 1, 2},{8, (uint64_t)pow(2,14), 1, 1, 2},{8, (uint64_t)pow(2,15), 1, 1, 2},{8, (uint64_t)pow(2,16), 1, 1, 2},{8, (uint64_t)pow(2,17), 1, 1, 2},{8, (uint64_t)pow(2,18), 1, 1, 2}, + {8, (uint64_t)pow(2,19), 1, 
1, 2},{8, (uint64_t)pow(2,20), 1, 1, 2},{8, (uint64_t)pow(2,21), 1, 1, 2},{8, (uint64_t)pow(2,22), 1, 1, 2},{8, (uint64_t)pow(2,23), 1, 1, 2},{8, (uint64_t)pow(2,24), 1, 1, 2}, - { (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 2},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 2}, - { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 2},{ (uint64_t)pow(2,14), (uint64_t)pow(2,13), 1, 2}, + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 1, 2},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 1, 2}, + { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 1, 2},{ (uint64_t)pow(2,14), (uint64_t)pow(2,13), 1, 1, 2}, - { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 3},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), (uint64_t)pow(2,9), 3}, + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 3},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 3}, }; double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples @@ -117,7 +117,7 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output, fftw_complex* output_FFTW = (fftw_complex*)(malloc(sizeof(fftw_complex) * dims[0] * dims[1] * dims[2])); if (!output_FFTW) return VKFFT_ERROR_MALLOC_FAILED; - switch (benchmark_dimensions[n][3]) { + switch (benchmark_dimensions[n][4]) { case 1: p = fftw_plan_dft_1d((int)benchmark_dimensions[n][0], inputC_double, output_FFTW, -1, FFTW_ESTIMATE); break; @@ -148,7 +148,7 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output, VkFFTConfiguration configuration = {}; VkFFTApplication app = {}; - configuration.FFTdim = benchmark_dimensions[n][3]; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.FFTdim = benchmark_dimensions[n][4]; //FFT dimension, 1D, 2D or 3D (default 1). configuration.size[0] = benchmark_dimensions[n][0]; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. 
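For orientation: the precision samples now describe each test case with five fields instead of four, {size_x, size_y, size_z, size_w, FFTdim}, which is why the FFTW switch and configuration.FFTdim read index [4] rather than [3]. A minimal illustrative helper spelling out that layout (not part of the patch; BenchmarkCase and as_case are made-up names):

#include <stdint.h>

typedef struct {
    uint64_t size[4];  // sizes along x, y, z and the new 4th dimension
    uint64_t fft_dim;  // logical dimensionality (1..4), previously stored at index [3], now at [4]
} BenchmarkCase;

static inline BenchmarkCase as_case(const uint64_t row[5]) {
    // row follows the extended layout {x, y, z, w, FFTdim} used by benchmark_dimensions
    BenchmarkCase c = { { row[0], row[1], row[2], row[3] }, row[4] };
    return c;
}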
configuration.size[1] = benchmark_dimensions[n][1]; configuration.size[2] = benchmark_dimensions[n][2]; diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp index a8c53fc0..4fb199ff 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp @@ -74,18 +74,18 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output, const int num_benchmark_samples = 60; const int num_runs = 1; - uint64_t benchmark_dimensions[num_benchmark_samples][4] = { {(uint64_t)pow(2,5), 1, 1, 1},{(uint64_t)pow(2,6), 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1}, - {(uint64_t)pow(2,11), 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1}, - {(uint64_t)pow(2,19), 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1},{(uint64_t)pow(2,25), 1, 1, 1},{(uint64_t)pow(2,26), 1, 1, 1}, + uint64_t benchmark_dimensions[num_benchmark_samples][5] = { {(uint64_t)pow(2,5), 1, 1, 1, 1},{(uint64_t)pow(2,6), 1, 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1, 1}, + {(uint64_t)pow(2,11), 1, 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1, 1}, + {(uint64_t)pow(2,19), 1, 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1, 1},{(uint64_t)pow(2,25), 1, 1, 1, 1},{(uint64_t)pow(2,26), 1, 1, 1, 1}, - {8, (uint64_t)pow(2,3), 1, 2},{8, (uint64_t)pow(2,4), 1, 2},{8, (uint64_t)pow(2,5), 1, 2},{8, (uint64_t)pow(2,6), 1, 2},{8, (uint64_t)pow(2,7), 1, 2},{8, (uint64_t)pow(2,8), 1, 2},{8, (uint64_t)pow(2,9), 1, 2},{8, (uint64_t)pow(2,10), 1, 2}, - {8, (uint64_t)pow(2,11), 1, 2},{8, (uint64_t)pow(2,12), 1, 2},{8, (uint64_t)pow(2,13), 1, 2},{8, (uint64_t)pow(2,14), 1, 2},{8, (uint64_t)pow(2,15), 1, 2},{8, (uint64_t)pow(2,16), 1, 2},{8, (uint64_t)pow(2,17), 1, 2},{8, (uint64_t)pow(2,18), 1, 2}, - {8, (uint64_t)pow(2,19), 1, 2},{8, (uint64_t)pow(2,20), 1, 2},{8, (uint64_t)pow(2,21), 1, 2},{8, (uint64_t)pow(2,22), 1, 2},{8, (uint64_t)pow(2,23), 1, 2}, + {8, (uint64_t)pow(2,3), 1, 1, 2},{8, (uint64_t)pow(2,4), 1, 1, 2},{8, (uint64_t)pow(2,5), 1, 1, 2},{8, (uint64_t)pow(2,6), 1, 1, 2},{8, (uint64_t)pow(2,7), 1, 1, 2},{8, (uint64_t)pow(2,8), 1, 1, 2},{8, (uint64_t)pow(2,9), 1, 1, 2},{8, (uint64_t)pow(2,10), 1, 1, 2}, + {8, (uint64_t)pow(2,11), 1, 1, 2},{8, (uint64_t)pow(2,12), 1, 1, 2},{8, (uint64_t)pow(2,13), 1, 1, 2},{8, (uint64_t)pow(2,14), 1, 1, 2},{8, (uint64_t)pow(2,15), 1, 1, 2},{8, (uint64_t)pow(2,16), 1, 1, 2},{8, (uint64_t)pow(2,17), 1, 1, 2},{8, (uint64_t)pow(2,18), 1, 1, 2}, + {8, (uint64_t)pow(2,19), 1, 1, 2},{8, (uint64_t)pow(2,20), 1, 1, 2},{8, (uint64_t)pow(2,21), 1, 1, 2},{8, (uint64_t)pow(2,22), 1, 1, 2},{8, (uint64_t)pow(2,23), 1, 1, 2}, - { 
(uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 2}, { (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 2}, - { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 2}, + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 1, 2}, { (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 1, 2}, + { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 1, 2}, - { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 3} + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 3} }; double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples @@ -115,7 +115,7 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output, fftw_complex* output_FFTW = (fftw_complex*)(malloc(sizeof(fftw_complex) * dims[0] * dims[1] * dims[2])); if (!output_FFTW) return VKFFT_ERROR_MALLOC_FAILED; - switch (benchmark_dimensions[n][3]) { + switch (benchmark_dimensions[n][4]) { case 1: p = fftw_plan_dft_1d((int)benchmark_dimensions[n][0], inputC_double, output_FFTW, -1, FFTW_ESTIMATE); break; @@ -147,7 +147,7 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output, VkFFTConfiguration configuration = {}; VkFFTApplication app = {}; - configuration.FFTdim = benchmark_dimensions[n][3]; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.FFTdim = benchmark_dimensions[n][4]; //FFT dimension, 1D, 2D or 3D (default 1). configuration.size[0] = benchmark_dimensions[n][0]; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. 
configuration.size[1] = benchmark_dimensions[n][1]; configuration.size[2] = benchmark_dimensions[n][2]; diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp index 8879fc8a..2e5689c8 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp @@ -77,18 +77,18 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F const int num_benchmark_samples = 61; const int num_runs = 1; - uint64_t benchmark_dimensions[num_benchmark_samples][4] = { {(uint64_t)pow(2,5), 1, 1, 1},{(uint64_t)pow(2,6), 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1}, - {(uint64_t)pow(2,11), 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1}, - {(uint64_t)pow(2,19), 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1}, + uint64_t benchmark_dimensions[num_benchmark_samples][5] = { {(uint64_t)pow(2,5), 1, 1, 1, 1},{(uint64_t)pow(2,6), 1, 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1, 1}, + {(uint64_t)pow(2,11), 1, 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1, 1}, + {(uint64_t)pow(2,19), 1, 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1, 1},{(uint64_t)pow(2,24), 1, 1, 1, 1}, - {8, (uint64_t)pow(2,3), 1, 2},{8, (uint64_t)pow(2,4), 1, 2},{8, (uint64_t)pow(2,5), 1, 2},{8, (uint64_t)pow(2,6), 1, 2},{8, (uint64_t)pow(2,7), 1, 2},{8, (uint64_t)pow(2,8), 1, 2},{8, (uint64_t)pow(2,9), 1, 2},{8, (uint64_t)pow(2,10), 1, 2}, - {8, (uint64_t)pow(2,11), 1, 2},{8, (uint64_t)pow(2,12), 1, 2},{8, (uint64_t)pow(2,13), 1, 2},{8, (uint64_t)pow(2,14), 1, 2},{8, (uint64_t)pow(2,15), 1, 2},{8, (uint64_t)pow(2,16), 1, 2},{8, (uint64_t)pow(2,17), 1, 2},{8, (uint64_t)pow(2,18), 1, 2}, - {8, (uint64_t)pow(2,19), 1, 2},{8, (uint64_t)pow(2,20), 1, 2},{8, (uint64_t)pow(2,21), 1, 2},{8, (uint64_t)pow(2,22), 1, 2},{8, (uint64_t)pow(2,23), 1, 2},{8, (uint64_t)pow(2,24), 1, 2}, + {8, (uint64_t)pow(2,3), 1, 1, 2},{8, (uint64_t)pow(2,4), 1, 1, 2},{8, (uint64_t)pow(2,5), 1, 1, 2},{8, (uint64_t)pow(2,6), 1, 1, 2},{8, (uint64_t)pow(2,7), 1, 1, 2},{8, (uint64_t)pow(2,8), 1, 1, 2},{8, (uint64_t)pow(2,9), 1, 1, 2},{8, (uint64_t)pow(2,10), 1, 1, 2}, + {8, (uint64_t)pow(2,11), 1, 1, 2},{8, (uint64_t)pow(2,12), 1, 1, 2},{8, (uint64_t)pow(2,13), 1, 1, 2},{8, (uint64_t)pow(2,14), 1, 1, 2},{8, (uint64_t)pow(2,15), 1, 1, 2},{8, (uint64_t)pow(2,16), 1, 1, 2},{8, (uint64_t)pow(2,17), 1, 1, 2},{8, (uint64_t)pow(2,18), 1, 1, 2}, + {8, (uint64_t)pow(2,19), 1, 1, 2},{8, (uint64_t)pow(2,20), 1, 1, 2},{8, (uint64_t)pow(2,21), 1, 1, 2},{8, (uint64_t)pow(2,22), 1, 1, 2},{8, (uint64_t)pow(2,23), 1, 1, 2},{8, (uint64_t)pow(2,24), 1, 1, 2}, - { (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 2},{ (uint64_t)pow(2,4), 
(uint64_t)pow(2,4), 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 2}, - { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 2},{ (uint64_t)pow(2,14), (uint64_t)pow(2,13), 1, 2}, + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 1, 2},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 1, 2},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 1, 2},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 1, 2},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 1, 2},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 1, 2},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 1, 2}, + { (uint64_t)pow(2,10), (uint64_t)pow(2,10), 1, 1, 2},{ (uint64_t)pow(2,11), (uint64_t)pow(2,11), 1, 1, 2},{ (uint64_t)pow(2,12), (uint64_t)pow(2,12), 1, 1, 2},{ (uint64_t)pow(2,13), (uint64_t)pow(2,13), 1, 1, 2},{ (uint64_t)pow(2,14), (uint64_t)pow(2,13), 1, 1, 2}, - { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 3},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), (uint64_t)pow(2,9), 3}, + { (uint64_t)pow(2,3), (uint64_t)pow(2,3), (uint64_t)pow(2,3), 1, 3},{ (uint64_t)pow(2,4), (uint64_t)pow(2,4), (uint64_t)pow(2,4), 1, 3},{ (uint64_t)pow(2,5), (uint64_t)pow(2,5), (uint64_t)pow(2,5), 1, 3},{ (uint64_t)pow(2,6), (uint64_t)pow(2,6), (uint64_t)pow(2,6), 1, 3},{ (uint64_t)pow(2,7), (uint64_t)pow(2,7), (uint64_t)pow(2,7), 1, 3},{ (uint64_t)pow(2,8), (uint64_t)pow(2,8), (uint64_t)pow(2,8), 1, 3},{ (uint64_t)pow(2,9), (uint64_t)pow(2,9), (uint64_t)pow(2,9), 1, 3}, }; double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples @@ -119,7 +119,7 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F fftw_complex* output_FFTW = (fftw_complex*)(malloc(sizeof(fftw_complex) * dims[0] * dims[1] * dims[2])); if (!output_FFTW) return VKFFT_ERROR_MALLOC_FAILED; - switch (benchmark_dimensions[n][3]) { + switch (benchmark_dimensions[n][4]) { case 1: p = fftw_plan_dft_1d((int)benchmark_dimensions[n][0], inputC_double, output_FFTW, -1, FFTW_ESTIMATE); break; @@ -146,7 +146,7 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F VkFFTConfiguration configuration = {}; VkFFTApplication app = {}; - configuration.FFTdim = benchmark_dimensions[n][3]; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.FFTdim = benchmark_dimensions[n][4]; //FFT dimension, 1D, 2D or 3D (default 1). configuration.size[0] = benchmark_dimensions[n][0]; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. 
configuration.size[1] = benchmark_dimensions[n][1]; configuration.size[2] = benchmark_dimensions[n][2]; diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp index ca3d7193..87e0947a 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp @@ -304,7 +304,7 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out } //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. resFFT = initializeVkFFT(&app, configuration); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) { + if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R) { if (file_output) fprintf(output, "VkFFT DCT-%d System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " - UNSUPPORTED\n", t, dims[0], dims[1], dims[2], dims[3]); printf("VkFFT DCT-%d System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " - UNSUPPORTED\n", t, dims[0], dims[1], dims[2], dims[3]); diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp index d47de3e3..c8c41762 100644 --- a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp @@ -303,7 +303,7 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out } //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. 
resFFT = initializeVkFFT(&app, configuration); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) { + if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R) { if (file_output) fprintf(output, "VkFFT DCT-%d System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " - UNSUPPORTED\n", t, dims[0], dims[1], dims[2], dims[3]); printf("VkFFT DCT-%d System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " - UNSUPPORTED\n", t, dims[0], dims[1], dims[2], dims[3]); diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp new file mode 100644 index 00000000..6db8f4d3 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp @@ -0,0 +1,412 @@ +//general parts +#include +#include +#include +#include +#include +#include +#include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#if(VKFFT_BACKEND==0) +#include "vulkan/vulkan.h" +#include "glslang_c_interface.h" +#elif(VKFFT_BACKEND==1) +#include +#include +#include +#include +#include +#elif(VKFFT_BACKEND==2) +#ifndef __HIP_PLATFORM_HCC__ +#define __HIP_PLATFORM_HCC__ +#endif +#include +#include +#include +#include +#elif(VKFFT_BACKEND==3) +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#ifdef __APPLE__ +#include +#else +#include +#endif +#elif(VKFFT_BACKEND==4) +#include +#elif(VKFFT_BACKEND==5) +#include "Foundation/Foundation.hpp" +#include "QuartzCore/QuartzCore.hpp" +#include "Metal/Metal.hpp" +#endif +#include "vkFFT.h" +#include "utils_VkFFT.h" +#include "fftw3.h" +#ifdef USE_cuFFT +#include "precision_cuFFT_double.h" +#endif +#ifdef USE_rocFFT +#include "precision_rocFFT_double.h" +#endif +#include +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 +#include +#endif +VkFFTResult sample_19_precision_VkFFT_quadDoubleDouble_nonPow2(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +{ + VkFFTResult resFFT = VKFFT_SUCCESS; +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 +#if(VKFFT_BACKEND==0) + VkResult res = VK_SUCCESS; +#elif(VKFFT_BACKEND==1) + cudaError_t res = cudaSuccess; +#elif(VKFFT_BACKEND==2) + hipError_t res = hipSuccess; +#elif(VKFFT_BACKEND==3) + cl_int res = CL_SUCCESS; +#elif(VKFFT_BACKEND==4) + ze_result_t res = ZE_RESULT_SUCCESS; +#elif(VKFFT_BACKEND==5) +#endif + if (file_output) + fprintf(output, "19 - VkFFT/FFTW C2C precision test in double-double emulation of quad precision\n"); + printf("19 - VkFFT/FFTW C2C precision test in double-double emulation of quad precision\n"); + + const int num_benchmark_samples = 349; + const int num_runs = 1; + + uint64_t benchmark_dimensions[num_benchmark_samples][5] = { {(uint64_t)pow(2,5), 1, 1, 1, 1},{(uint64_t)pow(2,6), 1, 1, 1, 1},{(uint64_t)pow(2,7), 1, 1, 1, 1},{(uint64_t)pow(2,8), 1, 1, 1, 1},{(uint64_t)pow(2,9), 1, 1, 1, 1},{(uint64_t)pow(2,10), 1, 1, 1, 1}, + {(uint64_t)pow(2,11), 1, 1, 1, 1},{(uint64_t)pow(2,12), 1, 1, 1, 1},{(uint64_t)pow(2,13), 1, 1, 1, 1},{(uint64_t)pow(2,14), 1, 1, 1, 1},{(uint64_t)pow(2,15), 1, 1, 1, 1},{(uint64_t)pow(2,16), 1, 1, 1, 1},{(uint64_t)pow(2,17), 1, 1, 1, 1},{(uint64_t)pow(2,18), 1, 1, 1, 1}, + {(uint64_t)pow(2,19), 1, 1, 1, 1},{(uint64_t)pow(2,20), 1, 1, 1, 1},{(uint64_t)pow(2,21), 1, 1, 1, 1},{(uint64_t)pow(2,22), 1, 1, 1, 1},{(uint64_t)pow(2,23), 1, 1, 1, 1}, {3, 1, 1, 1, 1}, {5, 1, 1, 1, 1},{6, 1, 1, 1, 1},{7, 1, 1, 1, 1},{9, 1, 1, 1, 1},{10, 1, 1, 1, 1},{11, 1, 1, 1, 1},{12, 1, 
1, 1, 1},{13, 1, 1, 1, 1},{14, 1, 1, 1, 1}, + {15, 1, 1, 1, 1},{17, 1, 1, 1, 1},{19, 1, 1, 1, 1},{21, 1, 1, 1, 1},{22, 1, 1, 1, 1},{23, 1, 1, 1, 1},{24, 1, 1, 1, 1},{25, 1, 1, 1, 1},{26, 1, 1, 1, 1},{27, 1, 1, 1, 1},{28, 1, 1, 1, 1},{29, 1, 1, 1, 1},{30, 1, 1, 1, 1},{31, 1, 1, 1, 1},{33, 1, 1, 1, 1},{35, 1, 1, 1, 1},{37, 1, 1, 1, 1},{39, 1, 1, 1, 1},{41, 1, 1, 1, 1},{43, 1, 1, 1, 1},{42, 1, 1, 1, 1},{44, 1, 1, 1, 1},{45, 1, 1, 1, 1},{47, 1, 1, 1, 1},{49, 1, 1, 1, 1},{52, 1, 1, 1, 1},{53, 1, 1, 1, 1},{55, 1, 1, 1, 1},{56, 1, 1, 1, 1},{59, 1, 1, 1, 1},{60, 1, 1, 1, 1},{61, 1, 1, 1, 1},{65, 1, 1, 1, 1},{66, 1, 1, 1, 1},{67, 1, 1, 1, 1},{71, 1, 1, 1, 1},{73, 1, 1, 1, 1},{79, 1, 1, 1, 1},{81, 1, 1, 1, 1},{83, 1, 1, 1, 1},{89, 1, 1, 1, 1},{97, 1, 1, 1, 1}, + {121, 1, 1, 1, 1},{125, 1, 1, 1, 1},{137, 1, 1, 1, 1},{143, 1, 1, 1, 1},{169, 1, 1, 1, 1},{191, 1, 1, 1, 1},{243, 1, 1, 1, 1},{286, 1, 1, 1, 1},{343, 1, 1, 1, 1},{383, 1, 1, 1, 1},{429, 1, 1, 1, 1},{509, 1, 1, 1, 1},{572, 1, 1, 1, 1},{625, 1, 1, 1, 1},{720, 1, 1, 1, 1},{1080, 1, 1, 1, 1},{1001, 1, 1, 1, 1},{1213, 1, 1, 1, 1},{1287, 1, 1, 1, 1},{1400, 1, 1, 1, 1},{1440, 1, 1, 1, 1},{1920, 1, 1, 1, 1},{2160, 1, 1, 1, 1},{2731, 1, 1, 1, 1},{3024,1,1, 1, 1},{3500,1,1, 1, 1}, + {3840, 1, 1, 1, 1},{4000 , 1, 1, 1, 1},{4050, 1, 1, 1, 1},{4320 , 1, 1, 1, 1},{4391, 1, 1, 1, 1},{7000,1,1, 1, 1},{7680, 1, 1, 1, 1},{7879, 1, 1, 1, 1},{9000, 1, 1, 1, 1},{11587, 1, 1, 1, 1},{7680 * 5, 1, 1, 1, 1}, + {15319, 1, 1, 1, 1},{21269, 1, 1, 1, 1},{27283, 1, 1, 1, 1},{39829, 1, 1, 1, 1},{52733, 1, 1, 1, 1},{2000083, 1, 1, 1, 1},{4000067, 1, 1, 1, 1},{8003869, 1, 1, 1, 1}, + {(uint64_t)pow(3,10), 1, 1, 1, 1},{(uint64_t)pow(3,11), 1, 1, 1, 1},{(uint64_t)pow(3,12), 1, 1, 1, 1},{(uint64_t)pow(3,13), 1, 1, 1, 1},{(uint64_t)pow(3,14), 1, 1, 1, 1},{(uint64_t)pow(3,15), 1, 1, 1, 1}, + {(uint64_t)pow(5,5), 1, 1, 1, 1},{(uint64_t)pow(5,6), 1, 1, 1, 1},{(uint64_t)pow(5,7), 1, 1, 1, 1},{(uint64_t)pow(5,8), 1, 1, 1, 1},{(uint64_t)pow(5,9), 1, 1, 1, 1}, + {(uint64_t)pow(7,4), 1, 1, 1, 1},{(uint64_t)pow(7,5), 1, 1, 1, 1},{(uint64_t)pow(7,6), 1, 1, 1, 1},{(uint64_t)pow(7,7), 1, 1, 1, 1},{(uint64_t)pow(7,8), 1, 1, 1, 1}, + {(uint64_t)pow(11,3), 1, 1, 1, 1},{(uint64_t)pow(11,4), 1, 1, 1, 1},{(uint64_t)pow(11,5), 1, 1, 1, 1},{(uint64_t)pow(11,6), 1, 1, 1, 1}, + {(uint64_t)pow(13,3), 1, 1, 1, 1},{(uint64_t)pow(13,4), 1, 1, 1, 1},{(uint64_t)pow(13,5), 1, 1, 1, 1},{(uint64_t)pow(13,6), 1, 1, 1, 1}, + {8, 3, 1, 1, 2},{8, 5, 1, 1, 2},{8, 6, 1, 1, 2},{8, 7, 1, 1, 2},{8, 9, 1, 1, 2},{8, 10, 1, 1, 2},{8, 11, 1, 1, 2},{8, 12, 1, 1, 2},{8, 13, 1, 1, 2},{8, 14, 1, 1, 2},{8, 15, 1, 1, 2},{8, 17, 1, 1, 2},{8, 19, 1, 1, 2},{8, 21, 1, 1, 2},{8, 22, 1, 1, 2},{8, 23, 1, 1, 2},{8, 24, 1, 1, 2}, + {8, 25, 1, 1, 2},{8, 26, 1, 1, 2},{8, 27, 1, 1, 2},{8, 28, 1, 1, 2},{8, 29, 1, 1, 2},{8, 30, 1, 1, 2},{8, 31, 1, 1, 2},{8, 33, 1, 1, 2},{8, 35, 1, 1, 2},{8, 37, 1, 1, 2},{8, 39, 1, 1, 2},{8, 41, 1, 1, 2},{8, 43, 1, 1, 2},{8, 44, 1, 1, 2},{8, 45, 1, 1, 2},{8, 47, 1, 1, 2},{8, 49, 1, 1, 2},{8, 52, 1, 1, 2},{8, 53, 1, 1, 2},{8, 56, 1, 1, 2},{8, 59, 1, 1, 2},{8, 60, 1, 1, 2},{8, 61, 1, 1, 2},{8, 66, 1, 1, 2},{8, 67, 1, 1, 2},{8, 71, 1, 1, 2},{8, 73, 1, 1, 2},{8, 79, 1, 1, 2},{8, 81, 1, 1, 2},{8, 83, 1, 1, 2},{8, 89, 1, 1, 2},{8, 97, 1, 1, 2},{8, 125, 1, 1, 2},{8, 243, 1, 1, 2},{8, 343, 1, 1, 2}, + {8, 625, 1, 1, 2},{8, 720, 1, 1, 2},{8, 1080, 1, 1, 2},{8, 1287, 1, 1, 2},{8, 1400, 1, 1, 2},{8, 1440, 1, 1, 2},{8, 1920, 1, 1, 2},{8, 2160, 1, 1, 2},{8, 2731, 1, 1, 2},{8, 3024, 1, 1, 2},{8, 3500, 1, 1, 2}, + {8, 
3840, 1, 1, 2},{8, 4000, 1, 1, 2},{8, 4050, 1, 1, 2},{8, 4320, 1, 1, 2},{8, 4391, 1, 1, 2},{8, 7000, 1, 1, 2},{8, 7680, 1, 1, 2},{8, 4050 * 3, 1, 1, 2},{8, 7680 * 5, 1, 1, 2}, {720, 480, 1, 1, 2},{1280, 720, 1, 1, 2},{1920, 1080, 1, 1, 2}, {2560, 1440, 1, 1, 2},{3840, 2160, 1, 1, 2},{7680, 4320, 1, 1, 2}, + {8,15319, 1, 1, 2},{8,21269, 1, 1, 2},{8,27283, 1, 1, 2},{8,39829, 1, 1, 2},{8,52733, 1, 1, 2},{8,2000083, 1, 1, 2},{8,4000067, 1, 1, 2},{8,8003869, 1, 1, 2}, + {8, (uint64_t)pow(3,10), 1, 1, 2}, {8, (uint64_t)pow(3,11), 1, 1, 2}, {8, (uint64_t)pow(3,12), 1, 1, 2}, {8, (uint64_t)pow(3,13), 1, 1, 2}, {8, (uint64_t)pow(3,14), 1, 1, 2}, {8, (uint64_t)pow(3,15), 1, 1, 2}, + {8, (uint64_t)pow(5,5), 1, 1, 2}, {8, (uint64_t)pow(5,6), 1, 1, 2}, {8, (uint64_t)pow(5,7), 1, 1, 2}, {8, (uint64_t)pow(5,8), 1, 1, 2}, {8, (uint64_t)pow(5,9), 1, 1, 2}, + {8, (uint64_t)pow(7,4), 1, 1, 2},{8, (uint64_t)pow(7,5), 1, 1, 2},{8, (uint64_t)pow(7,6), 1, 1, 2},{8, (uint64_t)pow(7,7), 1, 1, 2},{8, (uint64_t)pow(7,8), 1, 1, 2}, + {8, (uint64_t)pow(11,3), 1, 1, 2},{8, (uint64_t)pow(11,4), 1, 1, 2},{8, (uint64_t)pow(11,5), 1, 1, 2},{8, (uint64_t)pow(11,6), 1, 1, 2}, + {8, (uint64_t)pow(13,3), 1, 1, 2},{8, (uint64_t)pow(13,4), 1, 1, 2},{8, (uint64_t)pow(13,5), 1, 1, 2},{8, (uint64_t)pow(13,6), 1, 1, 2}, + {3, 3, 3, 1, 3},{5, 5, 5, 1, 3},{6, 6, 6, 1, 3},{7, 7, 7, 1, 3},{9, 9, 9, 1, 3},{10, 10, 10, 1, 3},{11, 11, 11, 1, 3},{12, 12, 12, 1, 3},{13, 13, 13, 1, 3},{14, 14, 14, 1, 3}, + {15, 15, 15, 1, 3},{17, 17, 17, 1, 3},{21, 21, 21, 1, 3},{22, 22, 22, 1, 3},{23, 23, 23, 1, 3},{24, 24, 24, 1, 3},{25, 25, 25, 1, 3},{26, 26, 26, 1, 3},{27, 27, 27, 1, 3},{28, 28, 28, 1, 3},{29, 29, 29, 1, 3},{30, 30, 30, 1, 3},{31, 31, 31, 1, 3},{33, 33, 33, 1, 3},{35, 35, 35, 1, 3},{37, 37, 37, 1, 3},{39, 39, 39, 1, 3},{41, 41, 41, 1, 3},{42, 42, 42, 1, 3},{43, 43, 43, 1, 3},{44, 44, 44, 1, 3},{45, 45, 45, 1, 3},{47, 47, 47, 1, 3},{49, 49, 49, 1, 3},{52, 52, 52, 1, 3},{53, 53, 53, 1, 3},{56, 56, 56, 1, 3},{59, 59, 59, 1, 3},{60, 60, 60, 1, 3},{61, 61, 61, 1, 3},{81, 81, 81, 1, 3}, + {121, 121, 121, 1, 3},{125, 125, 125, 1, 3},{143, 143, 143, 1, 3},{169, 169, 169, 1, 3},{243, 243, 243, 1, 3}, + {3, 3, 3, 3, 4},{5, 5, 5, 5, 4},{6, 6, 6, 6, 4},{7, 7, 7, 7, 4},{9, 9, 9, 9, 4},{10, 10, 10, 10, 4},{11, 11, 11, 11, 4},{12, 12, 12, 12, 4},{13, 13, 13, 13, 4},{14, 14, 14, 14, 4}, + {15, 15, 15, 15, 4},{17, 17, 17, 17, 4},{21, 21, 21, 21, 4},{22, 22, 22, 22, 4},{23, 23, 23, 23, 4},{24, 24, 24, 24, 4},{25, 25, 25, 25, 4},{26, 26, 26, 26, 4},{27, 27, 27, 27, 4},{28, 28, 28, 28, 4},{29, 29, 29, 29, 4},{30, 30, 30, 30, 4},{31, 31, 31, 31, 4},{33, 33, 33, 33, 4},{35, 35, 35, 35, 4},{37, 37, 37, 37, 4},{39, 39, 39, 39, 4},{41, 41, 41, 41, 4},{42, 42, 42, 42, 4},{43, 43, 43, 43, 4},{44, 44, 44, 44, 4},{45, 45, 45, 45, 4},{47, 47, 47, 47, 4},{49, 49, 49, 49, 4},{52, 52, 52, 52, 4},{53, 53, 53, 53, 4},{56, 56, 56, 56, 4},{59, 59, 59, 59, 4},{60, 60, 60, 60, 4},{61, 61, 61, 61, 4},{81, 81, 81, 81, 4}, + {3, 5, 7, 9, 4},{5, 3, 7, 9, 4},{9, 7, 5, 3, 4},{23, 25, 27, 29, 4},{25, 23, 27, 29, 4},{29, 27, 25, 23, 4},{123, 25, 127, 129, 4},{125, 123, 27, 129, 4},{129, 127, 125, 23, 4}, + {20000, 2, 3, 3, 4},{3, 20000, 2, 3, 4},{3, 2, 3, 20000, 4} + }; + + double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples + for (int n = 0; n < num_benchmark_samples; n++) { + for (int r = 0; r < 1; r++) { + + double* inputC; + fftwq_complex* inputC_quad; + uint64_t dims[4] = { benchmark_dimensions[n][0] , benchmark_dimensions[n][1] , 
benchmark_dimensions[n][2] , benchmark_dimensions[n][3]}; + + inputC = (double*)(calloc(4 * dims[0] * dims[1] * dims[2] * dims[3], sizeof(double))); + if (!inputC) return VKFFT_ERROR_MALLOC_FAILED; + inputC_quad = (fftwq_complex*)(malloc(sizeof(fftwq_complex) * dims[0] * dims[1] * dims[2] * dims[3])); + if (!inputC_quad) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t k = 0; k < dims[3]; k++) { + for (uint64_t l = 0; l < dims[2]; l++) { + for (uint64_t j = 0; j < dims[1]; j++) { + for (uint64_t i = 0; i < dims[0]; i++) { + inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])] = (double)(2 * ((double)rand()) / RAND_MAX - 1.0); + inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+1] = 1e-17 * (double)(2 * ((double)rand()) / RAND_MAX - 1.0); + inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+2] = (double)(2 * ((double)rand()) / RAND_MAX - 1.0); + inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+3] = 1e-17 * (double)(2 * ((double)rand()) / RAND_MAX - 1.0); + inputC_quad[i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2]][0] = (__float128)inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])]+(__float128)inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+1]; + inputC_quad[i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2]][1] = (__float128)inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+2]+(__float128)inputC[4*(i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1] * dims[2])+3]; + } + } + } + } + fftwq_plan p; + + fftwq_complex* output_FFTW = (fftwq_complex*)(malloc(sizeof(fftwq_complex) * dims[0] * dims[1] * dims[2]* dims[3])); + if (!output_FFTW) return VKFFT_ERROR_MALLOC_FAILED; + switch (benchmark_dimensions[n][4]) { + case 1: + p = fftwq_plan_dft_1d((int)benchmark_dimensions[n][0], inputC_quad, output_FFTW, -1, FFTW_ESTIMATE); + break; + case 2: + p = fftwq_plan_dft_2d((int)benchmark_dimensions[n][1], (int)benchmark_dimensions[n][0], inputC_quad, output_FFTW, -1, FFTW_ESTIMATE); + break; + case 3: + p = fftwq_plan_dft_3d((int)benchmark_dimensions[n][2], (int)benchmark_dimensions[n][1], (int)benchmark_dimensions[n][0], inputC_quad, output_FFTW, -1, FFTW_ESTIMATE); + break; + case 4: + fftwq_iodim fftw_iodims[4]; + fftw_iodims[0].n = (int)benchmark_dimensions[n][3]; + fftw_iodims[0].is = (int)(benchmark_dimensions[n][2]*benchmark_dimensions[n][1]*benchmark_dimensions[n][0]); + fftw_iodims[0].os = (int)(benchmark_dimensions[n][2]*benchmark_dimensions[n][1]*benchmark_dimensions[n][0]); + fftw_iodims[1].n = (int)benchmark_dimensions[n][2]; + fftw_iodims[1].is = (int)(benchmark_dimensions[n][1]*benchmark_dimensions[n][0]); + fftw_iodims[1].os = (int)(benchmark_dimensions[n][1]*benchmark_dimensions[n][0]); + fftw_iodims[2].n = (int)benchmark_dimensions[n][1]; + fftw_iodims[2].is = (int)(benchmark_dimensions[n][0]); + fftw_iodims[2].os = (int)(benchmark_dimensions[n][0]); + fftw_iodims[3].n = (int)benchmark_dimensions[n][0]; + fftw_iodims[3].is = 1; + fftw_iodims[3].os = 1; + fftwq_iodim howmany_dims[1]; + howmany_dims[0].n = 1; + howmany_dims[0].is = 1; + howmany_dims[0].os = 1; + + p = fftwq_plan_guru_dft(4, fftw_iodims, 1, howmany_dims, inputC_quad, output_FFTW, -1, FFTW_ESTIMATE); + break; + } + + fftwq_execute(p); + + double totTime = 0; + int num_iter = 1; + + //VkFFT part + + VkFFTConfiguration 
configuration = {}; + VkFFTApplication app = {}; + configuration.FFTdim = benchmark_dimensions[n][4]; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.size[0] = benchmark_dimensions[n][0]; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. + configuration.size[1] = benchmark_dimensions[n][1]; + configuration.size[2] = benchmark_dimensions[n][2]; + configuration.size[3] = benchmark_dimensions[n][3]; + configuration.quadDoubleDoublePrecision = 1; + //configuration.keepShaderCode = 1; + //configuration.printMemoryLayout = 1; + //configuration.disableReorderFourStep = 1; + //After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored. +#if(VKFFT_BACKEND==5) + configuration.device = vkGPU->device; +#else + configuration.device = &vkGPU->device; +#endif +#if(VKFFT_BACKEND==0) + configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers + configuration.fence = &vkGPU->fence; + configuration.commandPool = &vkGPU->commandPool; + configuration.physicalDevice = &vkGPU->physicalDevice; + configuration.isCompilerInitialized = isCompilerInitialized;//compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization +#elif(VKFFT_BACKEND==3) + configuration.context = &vkGPU->context; +#elif(VKFFT_BACKEND==4) + configuration.context = &vkGPU->context; + configuration.commandQueue = &vkGPU->commandQueue; + configuration.commandQueueID = vkGPU->commandQueueID; +#elif(VKFFT_BACKEND==5) + configuration.queue = vkGPU->queue; +#endif + + uint64_t numBuf = 1; + + //Allocate buffers for the input data. 
- we use 4 in this example + uint64_t* bufferSize = (uint64_t*)malloc(sizeof(uint64_t) * numBuf); + if (!bufferSize) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t i = 0; i < numBuf; i++) { + bufferSize[i] = {}; + bufferSize[i] = (uint64_t)sizeof(double) * 2 * 2 * benchmark_dimensions[n][0] * benchmark_dimensions[n][1] * benchmark_dimensions[n][2] * benchmark_dimensions[n][3]/ numBuf; + } +#if(VKFFT_BACKEND==0) + VkBuffer* buffer = (VkBuffer*)malloc(numBuf * sizeof(VkBuffer)); + if (!buffer) return VKFFT_ERROR_MALLOC_FAILED; + VkDeviceMemory* bufferDeviceMemory = (VkDeviceMemory*)malloc(numBuf * sizeof(VkDeviceMemory)); + if (!bufferDeviceMemory) return VKFFT_ERROR_MALLOC_FAILED; +#elif(VKFFT_BACKEND==1) + cuDoubleComplex* buffer = 0; +#elif(VKFFT_BACKEND==2) + hipDoubleComplex* buffer = 0; +#elif(VKFFT_BACKEND==3) + cl_mem buffer = 0; +#elif(VKFFT_BACKEND==4) + void* buffer = 0; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* buffer = 0; +#endif + for (uint64_t i = 0; i < numBuf; i++) { +#if(VKFFT_BACKEND==0) + buffer[i] = {}; + bufferDeviceMemory[i] = {}; + resFFT = allocateBuffer(vkGPU, &buffer[i], &bufferDeviceMemory[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize[i]); + if (resFFT != VKFFT_SUCCESS) return resFFT; +#elif(VKFFT_BACKEND==1) + res = cudaMalloc((void**)&buffer, bufferSize[i]); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; +#elif(VKFFT_BACKEND==2) + res = hipMalloc((void**)&buffer, bufferSize[i]); + if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; +#elif(VKFFT_BACKEND==3) + buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize[i], 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; +#elif(VKFFT_BACKEND==4) + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(double), vkGPU->device, &buffer); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; +#elif(VKFFT_BACKEND==5) + buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate); +#endif + } + + configuration.bufferNum = numBuf; + /* +#if(VKFFT_BACKEND==0) + configuration.buffer = buffer; +#elif(VKFFT_BACKEND==1) + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + configuration.buffer = (void**)&buffer; +#endif + */ // Can specify buffers at launch + configuration.bufferSize = bufferSize; + + //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers. + uint64_t shift = 0; + for (uint64_t i = 0; i < numBuf; i++) { +#if(VKFFT_BACKEND==0) + resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(double)), &buffer[i], bufferSize[i]); +#else + resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(double)), &buffer, bufferSize[i]); +#endif + if (resFFT != VKFFT_SUCCESS) return resFFT; + shift += bufferSize[i]; + } + //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. + resFFT = initializeVkFFT(&app, configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //Submit FFT+iFFT. 
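A side note on the double-double ("quad") storage this sample validates: each complex value is kept as four doubles {re_hi, re_lo, im_hi, im_lo}, and the FFTW reference above rebuilds the __float128 value as hi + lo. The input loop simply seeds the low word with a ~1e-17 perturbation; as an illustrative sketch (not from the patch), the standard error-free two-sum that produces such a hi/lo pair from two doubles looks like this:

// Knuth two-sum: splits a + b exactly into a rounded high part and the rounding error.
static inline void two_sum(double a, double b, double* hi, double* lo) {
    double s  = a + b;                 // rounded sum -> high part
    double bb = s - a;                 // portion of b that actually entered s
    *lo = (a - (s - bb)) + (b - bb);   // exact rounding error -> low part
    *hi = s;                           // hi + lo == a + b exactly
}

With such pairs |lo| is at most half an ulp of hi, giving roughly 106 bits of effective mantissa, which is the scale the avg_eps/max_eps figures computed further down are measured against.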
+ //num_iter = 1; + //specify buffers at launch + VkFFTLaunchParams launchParams = {}; +#if(VKFFT_BACKEND==0) + launchParams.buffer = buffer; +#elif(VKFFT_BACKEND==1) + launchParams.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + launchParams.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==3) + launchParams.buffer = &buffer; +#elif(VKFFT_BACKEND==4) + launchParams.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==5) + launchParams.buffer = &buffer; +#endif + resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter); + if (resFFT != VKFFT_SUCCESS) return resFFT; + double* output_VkFFT = (double*)(malloc(4*sizeof(double) * benchmark_dimensions[n][0] * benchmark_dimensions[n][1] * benchmark_dimensions[n][2] * benchmark_dimensions[n][3])); + if (!output_VkFFT) return VKFFT_ERROR_MALLOC_FAILED; + //Transfer data from GPU using staging buffer. + shift = 0; + for (uint64_t i = 0; i < numBuf; i++) { +#if(VKFFT_BACKEND==0) + resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(double)), &buffer[i],4* sizeof(double) * benchmark_dimensions[n][0] * benchmark_dimensions[n][1] * benchmark_dimensions[n][2] * benchmark_dimensions[n][3]); +#else + resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(double)), &buffer, 4*sizeof(double) * benchmark_dimensions[n][0] * benchmark_dimensions[n][1] * benchmark_dimensions[n][2] * benchmark_dimensions[n][3]); +#endif + if (resFFT != VKFFT_SUCCESS) return resFFT; + shift += bufferSize[i]; + } + __float128 avg_difference[2] = { 0,0 }; + __float128 max_difference[2] = { 0,0 }; + __float128 avg_eps[2] = { 0,0 }; + __float128 max_eps[2] = { 0,0 }; + for (uint64_t k = 0; k < dims[3]; k++) { + for (uint64_t l = 0; l < dims[2]; l++) { + for (uint64_t j = 0; j < dims[1]; j++) { + for (uint64_t i = 0; i < dims[0]; i++) { + uint64_t loc_i = i; + uint64_t loc_j = j; + uint64_t loc_l = l; + + //if (file_output) fprintf(output, "%.2e %.2e - %.2e %.2e \n", output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][0] / N, output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][1] / N, output_VkFFT[(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1])][0], output_VkFFT[(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1])][1]); + //if (i > dims[0] - 10) + /*char buf[128]; + __float128 r; + int n = quadmath_snprintf (buf, sizeof buf, "%+-#*.20Qe", 46, output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][0]); + printf("%s ", buf); + n = quadmath_snprintf (buf, sizeof buf, "%+-#*.20Qe", 46, output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][1]); + printf("%s - ", buf); + r = (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])] + (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+1]; + n = quadmath_snprintf (buf, sizeof buf, "%+-#*.20Qe", 46, r); + printf("%s", buf); + r = (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+2] + (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+3]; + n = quadmath_snprintf (buf, sizeof buf, "%+-#*.20Qe", 46, r); + printf("%s\n", buf);*/ + + //printf("%.17Le %.17Le - %.17Le %.17Le \n", (long double)output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][0] , (long double)output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]][1], (long double)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1])], (long double)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * 
dims[1])+2]); + //printf("%.2e %.2e \n", output_VkFFT[(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1])][0], output_VkFFT[(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1])][1]); + + __float128 current_data_norm = sqrtq(output_FFTW[i + j * dims[0] + l * dims[0] * dims[1] + k * dims[0] * dims[1]* dims[2]][0] * output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2]][0] + output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2]][1] * output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2]][1]); + + __float128 current_diff_x_VkFFT = ((__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])] + (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+1] - output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2]][0]); + __float128 current_diff_y_VkFFT = ((__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+2] + (__float128)output_VkFFT[4*(loc_i + loc_j * dims[0] + loc_l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2])+3] - output_FFTW[i + j * dims[0] + l * dims[0] * dims[1]+ k * dims[0] * dims[1]* dims[2]][1]); + __float128 current_diff_norm_VkFFT = sqrtq(current_diff_x_VkFFT * current_diff_x_VkFFT + current_diff_y_VkFFT * current_diff_y_VkFFT); + if (current_diff_norm_VkFFT > max_difference[1]) max_difference[1] = current_diff_norm_VkFFT; + avg_difference[1] += current_diff_norm_VkFFT; + if ((current_diff_norm_VkFFT / current_data_norm > max_eps[1])) { + max_eps[1] = current_diff_norm_VkFFT / current_data_norm; + } + avg_eps[1] += current_diff_norm_VkFFT / current_data_norm; + } + } + } + } + avg_difference[0] /= (dims[0] * dims[1] * dims[2]* dims[3]); + avg_eps[0] /= (dims[0] * dims[1] * dims[2]* dims[3]); + avg_difference[1] /= (dims[0] * dims[1] * dims[2]* dims[3]); + avg_eps[1] /= (dims[0] * dims[1] * dims[2]* dims[3]); + + if (file_output) + fprintf(output, "VkFFT System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " avg_difference: %.2e max_difference: %.2e avg_eps: %.2e max_eps: %.2e\n", dims[0], dims[1], dims[2],dims[3], (double)avg_difference[1], (double)max_difference[1], (double)avg_eps[1], (double)max_eps[1]); + printf("VkFFT System: %" PRIu64 "x%" PRIu64 "x%" PRIu64 "x%" PRIu64 " avg_difference: %.2e max_difference: %.2e avg_eps: %.2e max_eps: %.2e\n", dims[0], dims[1], dims[2], dims[3], (double)avg_difference[1], (double)max_difference[1], (double)avg_eps[1], (double)max_eps[1]); + free(output_VkFFT); + for (uint64_t i = 0; i < numBuf; i++) { + +#if(VKFFT_BACKEND==0) + vkDestroyBuffer(vkGPU->device, buffer[i], NULL); + vkFreeMemory(vkGPU->device, bufferDeviceMemory[i], NULL); +#elif(VKFFT_BACKEND==1) + cudaFree(buffer); +#elif(VKFFT_BACKEND==2) + hipFree(buffer); +#elif(VKFFT_BACKEND==3) + clReleaseMemObject(buffer); +#elif(VKFFT_BACKEND==4) + zeMemFree(vkGPU->context, buffer); +#elif(VKFFT_BACKEND==5) + buffer->release(); +#endif + + } +#if(VKFFT_BACKEND==0) + free(buffer); + free(bufferDeviceMemory); +#endif + + free(bufferSize); + deleteVkFFT(&app); + free(inputC); + fftwq_destroy_plan(p); + free(inputC_quad); + free(output_FFTW); + } + } +#endif + return resFFT; +} diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp index a6a21656..4bda0a85 100644 
--- a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp @@ -208,7 +208,7 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint convolution_configuration.numberBatches = 1;//one batch - numberKernels convolutions convolution_configuration.numberKernels = configuration.numberBatches;// number of convolutions on a single input //Allocate separate buffer for the input data. - uint64_t inputBufferSize = ((uint64_t)convolution_configuration.coordinateFeatures) * sizeof(float) * 2 * (convolution_configuration.size[0] / 2 + 1) * convolution_configuration.size[1] * convolution_configuration.size[2];; + uint64_t inputBufferSize = ((uint64_t)convolution_configuration.coordinateFeatures) * sizeof(float) * (convolution_configuration.size[0]) * convolution_configuration.size[1] * convolution_configuration.size[2];; uint64_t bufferSize = convolution_configuration.numberKernels * convolution_configuration.coordinateFeatures * sizeof(float) * 2 * (convolution_configuration.size[0] / 2 + 1) * convolution_configuration.size[1] * convolution_configuration.size[2];; convolution_configuration.isInputFormatted = true; //if input is a different buffer, it doesn't have to be zeropadded/R2C padded @@ -282,7 +282,7 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint for (uint64_t k = 0; k < convolution_configuration.size[2]; k++) { for (uint64_t j = 0; j < convolution_configuration.size[1]; j++) { for (uint64_t i = 0; i < convolution_configuration.size[0]; i++) { - buffer_input[i + j * (convolution_configuration.size[0] + 2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]] = 1; + buffer_input[i + j * (convolution_configuration.size[0]) + k * (convolution_configuration.size[0]) * convolution_configuration.size[1] + v * (convolution_configuration.size[0]) * convolution_configuration.size[1] * convolution_configuration.size[2]] = 1; } } } diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp new file mode 100644 index 00000000..c2ac80de --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp @@ -0,0 +1,283 @@ +//general parts +#include +#include +#include +#include +#include +#include +#include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#if(VKFFT_BACKEND==0) +#include "vulkan/vulkan.h" +#include "glslang_c_interface.h" +#elif(VKFFT_BACKEND==1) +#include +#include +#include +#include +#include +#elif(VKFFT_BACKEND==2) +#ifndef __HIP_PLATFORM_HCC__ +#define __HIP_PLATFORM_HCC__ +#endif +#include +#include +#include +#include +#elif(VKFFT_BACKEND==3) +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#ifdef __APPLE__ +#include +#else +#include +#endif +#elif(VKFFT_BACKEND==4) +#include +#elif(VKFFT_BACKEND==5) +#include "Foundation/Foundation.hpp" +#include "QuartzCore/QuartzCore.hpp" +#include "Metal/Metal.hpp" +#endif +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_9_benchmark_VkFFT_quadDoubleDouble(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +{ + VkFFTResult resFFT = 
VKFFT_SUCCESS; +#if(VKFFT_BACKEND==0) + VkResult res = VK_SUCCESS; +#elif(VKFFT_BACKEND==1) + cudaError_t res = cudaSuccess; +#elif(VKFFT_BACKEND==2) + hipError_t res = hipSuccess; +#elif(VKFFT_BACKEND==3) + cl_int res = CL_SUCCESS; +#elif(VKFFT_BACKEND==4) + ze_result_t res = ZE_RESULT_SUCCESS; +#elif(VKFFT_BACKEND==5) +#endif + if (file_output) + fprintf(output, "9 - VkFFT FFT + iFFT C2C benchmark 1D batched in double-double emulation of quad precision LUT\n"); + printf("9 - VkFFT FFT + iFFT C2C benchmark 1D batched in double-double emulation of quad precision LUT\n"); + const int num_runs = 3; + double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples + //memory allocated on the CPU once, makes benchmark completion faster + avoids performance issues connected to frequent allocation/deallocation. + double* buffer_input = (double*)malloc((uint64_t)8 * 2 * (uint64_t)pow(2, 27)); + if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t i = 0; i < 2 * (uint64_t)pow(2, 27); i++) { + buffer_input[i] = (double)2 * ((double)rand()) / RAND_MAX - 1.0; + } + for (uint64_t n = 0; n < 23; n++) { + double run_time[num_runs]; + for (uint64_t r = 0; r < num_runs; r++) { + //Configuration + FFT application . + VkFFTConfiguration configuration = {}; + VkFFTApplication app = {}; + //FFT + iFFT sample code. + //Setting up FFT configuration for forward and inverse FFT. + configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.size[0] = 4 * (uint64_t)pow(2, n); //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. + if (n == 0) configuration.size[0] = 2048; + configuration.numberBatches = (uint64_t)(64 * 16 * (uint64_t)pow(2, 14))/ configuration.size[0]; + if (configuration.numberBatches < 1) configuration.numberBatches = 1; + configuration.size[2] = 1; + + + configuration.quadDoubleDoublePrecision = true; + +#if(VKFFT_BACKEND!=5) + if (r==0) configuration.saveApplicationToString = 1; + if (r!=0) configuration.loadApplicationFromString = 1; +#endif + //After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored. +#if(VKFFT_BACKEND==5) + configuration.device = vkGPU->device; +#else + configuration.device = &vkGPU->device; +#endif +#if(VKFFT_BACKEND==0) + configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers + configuration.fence = &vkGPU->fence; + configuration.commandPool = &vkGPU->commandPool; + configuration.physicalDevice = &vkGPU->physicalDevice; + configuration.isCompilerInitialized = isCompilerInitialized;//compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization +#elif(VKFFT_BACKEND==3) + configuration.context = &vkGPU->context; +#elif(VKFFT_BACKEND==4) + configuration.context = &vkGPU->context; + configuration.commandQueue = &vkGPU->commandQueue; + configuration.commandQueueID = vkGPU->commandQueueID; +#elif(VKFFT_BACKEND==5) + configuration.queue = vkGPU->queue; +#endif + + //Allocate buffer for the input data. 
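In double-double mode one complex element occupies four doubles (32 bytes), so the allocation that follows scales as 32 * size[0] * numberBatches. A small hedged helper (illustrative name only) making that explicit:

// Bytes needed for a batched 1D C2C buffer in double-double (quad) storage:
// {re_hi, re_lo, im_hi, im_lo} per complex value -> 4 * sizeof(double) = 32 bytes.
static inline uint64_t quad_c2c_buffer_bytes(uint64_t fft_size, uint64_t batches) {
    return (uint64_t)sizeof(double) * 4 * fft_size * batches;
}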
+ uint64_t bufferSize = (uint64_t)sizeof(double) * 4 * configuration.size[0] * configuration.numberBatches; +#if(VKFFT_BACKEND==0) + VkBuffer buffer = {}; + VkDeviceMemory bufferDeviceMemory = {}; + resFFT = allocateBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==1) + cuFloatComplex* buffer = 0; + res = cudaMalloc((void**)&buffer, bufferSize); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + hipFloatComplex* buffer = 0; + res = hipMalloc((void**)&buffer, bufferSize); + if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==3) + cl_mem buffer = 0; + buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize, 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==4) + void* buffer = 0; + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* buffer = 0; + buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); + configuration.buffer = &buffer; +#endif + + configuration.bufferSize = &bufferSize; + //Fill data on CPU. It is best to perform all operations on GPU after initial upload. + /*float* buffer_input = (float*)malloc(bufferSize); + + for (uint64_t k = 0; k < configuration.size[2]; k++) { + for (uint64_t j = 0; j < configuration.size[1]; j++) { + for (uint64_t i = 0; i < configuration.size[0]; i++) { + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1])] = 2 * ((float)rand()) / RAND_MAX - 1.0; + buffer_input[2 * (i + j * configuration.size[0] + k * (configuration.size[0]) * configuration.size[1]) + 1] = 2 * ((float)rand()) / RAND_MAX - 1.0; + } + } + } + */ + //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers. + resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //free(buffer_input); + + if (configuration.loadApplicationFromString) { + FILE* kernelCache; + uint64_t str_len; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "rb"); + if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE; + fseek(kernelCache, 0, SEEK_END); + str_len = ftell(kernelCache); + fseek(kernelCache, 0, SEEK_SET); + configuration.loadApplicationString = malloc(str_len); + fread(configuration.loadApplicationString, str_len, 1, kernelCache); + fclose(kernelCache); + } + //Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. 
+ resFFT = initializeVkFFT(&app, configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + if (configuration.loadApplicationFromString) + free(configuration.loadApplicationString); + + if (configuration.saveApplicationToString) { + FILE* kernelCache; + char fname[500]; + int VkFFT_version = VkFFTGetVersion(); + sprintf(fname, "VkFFT_binary"); + kernelCache = fopen(fname, "wb"); + fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache); + fclose(kernelCache); + } + + //Submit FFT+iFFT. + uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize; +#if(VKFFT_BACKEND==0) + if (vkGPU->physicalDeviceProperties.vendorID == 0x8086) num_iter /= 4; +#elif(VKFFT_BACKEND==3) + cl_uint vendorID; + clGetDeviceInfo(vkGPU->device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0); + if (vendorID == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#elif(VKFFT_BACKEND==4) + ze_device_properties_t device_properties; + res = zeDeviceGetProperties(vkGPU->device, &device_properties); + if (res != 0) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; + if (device_properties.vendorId == 0x8086) num_iter /= 4;//smaller benchmark for Intel GPUs +#endif + if (num_iter == 0) num_iter = 1; + double totTime = 0; + VkFFTLaunchParams launchParams = {}; + resFFT = performVulkanFFTiFFT(vkGPU, &app, &launchParams, num_iter, &totTime); + if (resFFT != VKFFT_SUCCESS) return resFFT; + run_time[r] = totTime; + if (n > 0) { + if (r == num_runs - 1) { + double std_error = 0; + double avg_time = 0; + for (uint64_t t = 0; t < num_runs; t++) { + avg_time += run_time[t]; + } + avg_time /= num_runs; + for (uint64_t t = 0; t < num_runs; t++) { + std_error += (run_time[t] - avg_time) * (run_time[t] - avg_time); + } + std_error = sqrt(std_error / num_runs); + uint64_t num_tot_transfers = 0; + for (uint64_t i = 0; i < configuration.FFTdim; i++) + num_tot_transfers += app.localFFTPlan->numAxisUploads[i]; + num_tot_transfers *= 4; + if (file_output) + fprintf(output, "VkFFT System: %" PRIu64 " %" PRIu64 "x%" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f num_iter: %" PRIu64 " benchmark: %" PRIu64 " bandwidth: %0.1f\n", (uint64_t)log2(configuration.size[0]), configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + + printf("VkFFT System: %" PRIu64 " %" PRIu64 "x%" PRIu64 " Buffer: %" PRIu64 " MB avg_time_per_step: %0.3f ms std_error: %0.3f num_iter: %" PRIu64 " benchmark: %" PRIu64 " bandwidth: %0.1f\n", (uint64_t)log2(configuration.size[0]), configuration.size[0], configuration.numberBatches, bufferSize / 1024 / 1024, avg_time, std_error, num_iter, (uint64_t)(((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time), bufferSize / 1024.0 / 1024.0 / 1.024 * num_tot_transfers / avg_time); + benchmark_result += ((double)bufferSize * sizeof(float) / (2*sizeof(double)) / 1024) / avg_time; + } + + + } + +#if(VKFFT_BACKEND==0) + vkDestroyBuffer(vkGPU->device, buffer, NULL); + vkFreeMemory(vkGPU->device, bufferDeviceMemory, NULL); +#elif(VKFFT_BACKEND==1) + cudaFree(buffer); +#elif(VKFFT_BACKEND==2) + hipFree(buffer); +#elif(VKFFT_BACKEND==3) + clReleaseMemObject(buffer); +#elif(VKFFT_BACKEND==4) + zeMemFree(vkGPU->context, buffer); +#elif(VKFFT_BACKEND==5) + 
buffer->release(); +#endif + deleteVkFFT(&app); + + } + } + free(buffer_input); + benchmark_result /= 23; + if (file_output) { + fprintf(output, "Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + fprintf(output, "Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + } + printf("Benchmark score VkFFT: %" PRIu64 "\n", (uint64_t)(benchmark_result)); +#if(VKFFT_BACKEND==0) + printf("Device name: %s API:%d.%d.%d\n", vkGPU->physicalDeviceProperties.deviceName, (vkGPU->physicalDeviceProperties.apiVersion >> 22), ((vkGPU->physicalDeviceProperties.apiVersion >> 12) & 0x3ff), (vkGPU->physicalDeviceProperties.apiVersion & 0xfff)); +#endif + return resFFT; +} diff --git a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp index f6a18089..035d0d3e 100644 --- a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp @@ -77,6 +77,9 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu case 2: storageComplexSize = (2 * 2); break; + case 3: + storageComplexSize = (4 * sizeof(double)); + break; default: storageComplexSize = (2 * sizeof(float)); break; @@ -120,6 +123,7 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu configuration.performDCT = userParams->DCT; if (userParams->P == 1) configuration.doublePrecision = 1; if (userParams->P == 2) configuration.halfPrecision = 1; + if (userParams->P == 3) configuration.quadDoubleDoublePrecision = 1; #if(VKFFT_BACKEND!=5) if (userParams->saveApplicationToString && (n==0) && (r==0)) configuration.saveApplicationToString = 1; if (userParams->loadApplicationFromString || (userParams->saveApplicationToString && ((n != 0) || (r != 0)))) configuration.loadApplicationFromString = 1; diff --git a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp index ee6763cb..217127b2 100644 --- a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp +++ b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp @@ -244,7 +244,7 @@ VkResult createDevice(VkGPU* vkGPU, uint64_t sample_id) { VkDeviceCreateInfo deviceCreateInfo = { VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO }; VkPhysicalDeviceFeatures deviceFeatures = {}; switch (sample_id) { - case 1: case 12: case 17: case 18: case 101: case 201: case 1001: { + case 1: case 9: case 12: case 17: case 18: case 19: case 101: case 201: case 203: case 1001: case 1004: { deviceFeatures.shaderFloat64 = true; deviceCreateInfo.enabledExtensionCount = (uint32_t)vkGPU->enabledDeviceExtensions.size(); deviceCreateInfo.ppEnabledExtensionNames = vkGPU->enabledDeviceExtensions.data(); @@ -257,7 +257,7 @@ VkResult createDevice(VkGPU* vkGPU, uint64_t sample_id) { break; } #if (VK_API_VERSION>10) - case 2: case 102: { + case 2: case 102: case 202: case 1002: { VkPhysicalDeviceFeatures2 deviceFeatures2 = {}; VkPhysicalDevice16BitStorageFeatures shaderFloat16 = {}; shaderFloat16.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES; @@ -364,10 +364,17 @@ VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* cpu_arr, void* output_buffer, VkResult res = VK_SUCCESS; VkBuffer* buffer = (VkBuffer*)output_buffer; uint64_t stagingBufferSize = transferSize; - VkBuffer 
stagingBuffer = { 0 }; - VkDeviceMemory stagingBufferMemory = { 0 }; - resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; + VkBuffer* stagingBuffer = {0}; + VkDeviceMemory* stagingBufferMemory = {0}; + if (!vkGPU->stagingBuffer){ + stagingBuffer = (VkBuffer*)calloc(1, sizeof(VkBuffer)); + stagingBufferMemory = (VkDeviceMemory*)calloc(1, sizeof(VkDeviceMemory)); + resFFT = allocateBuffer(vkGPU, stagingBuffer, stagingBufferMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + }else{ + stagingBuffer = vkGPU->stagingBuffer; + stagingBufferMemory = vkGPU->stagingBufferMemory; + } VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = vkGPU->commandPool; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; @@ -383,7 +390,7 @@ VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* cpu_arr, void* output_buffer, copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; - vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer, 1, ©Region); + vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer[0], 1, ©Region); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; @@ -397,12 +404,16 @@ VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* cpu_arr, void* output_buffer, if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(vkGPU->device, vkGPU->commandPool, 1, &commandBuffer); void* data; - res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data); + res = vkMapMemory(vkGPU->device, stagingBufferMemory[0], 0, stagingBufferSize, 0, &data); if (resFFT != VKFFT_SUCCESS) return resFFT; memcpy(cpu_arr, data, stagingBufferSize); - vkUnmapMemory(vkGPU->device, stagingBufferMemory); - vkDestroyBuffer(vkGPU->device, stagingBuffer, NULL); - vkFreeMemory(vkGPU->device, stagingBufferMemory, NULL); + vkUnmapMemory(vkGPU->device, stagingBufferMemory[0]); + if (!vkGPU->stagingBuffer){ + vkDestroyBuffer(vkGPU->device, stagingBuffer[0], 0); + vkFreeMemory(vkGPU->device, stagingBufferMemory[0], 0); + free(stagingBuffer); + free(stagingBufferMemory); + } #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; void* buffer = ((void**)output_buffer)[0]; @@ -477,15 +488,22 @@ VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* cpu_arr, void* input_buffer, VkResult res = VK_SUCCESS; VkBuffer* buffer = (VkBuffer*)input_buffer; uint64_t stagingBufferSize = transferSize; - VkBuffer stagingBuffer = { 0 }; - VkDeviceMemory stagingBufferMemory = { 0 }; - resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; + VkBuffer* stagingBuffer = {0}; + VkDeviceMemory* stagingBufferMemory = {0}; + if (!vkGPU->stagingBuffer){ + stagingBuffer = (VkBuffer*)calloc(1, sizeof(VkBuffer)); + stagingBufferMemory = (VkDeviceMemory*)calloc(1, sizeof(VkDeviceMemory)); + 
resFFT = allocateBuffer(vkGPU, stagingBuffer, stagingBufferMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + }else{ + stagingBuffer = vkGPU->stagingBuffer; + stagingBufferMemory = vkGPU->stagingBufferMemory; + } void* data; - res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data); + res = vkMapMemory(vkGPU->device, stagingBufferMemory[0], 0, stagingBufferSize, 0, &data); if (resFFT != VKFFT_SUCCESS) return resFFT; memcpy(data, cpu_arr, stagingBufferSize); - vkUnmapMemory(vkGPU->device, stagingBufferMemory); + vkUnmapMemory(vkGPU->device, stagingBufferMemory[0]); VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = vkGPU->commandPool; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; @@ -501,7 +519,7 @@ VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* cpu_arr, void* input_buffer, copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; - vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, ©Region); + vkCmdCopyBuffer(commandBuffer, stagingBuffer[0], buffer[0], 1, ©Region); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; @@ -514,8 +532,12 @@ VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* cpu_arr, void* input_buffer, res = vkResetFences(vkGPU->device, 1, &vkGPU->fence); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(vkGPU->device, vkGPU->commandPool, 1, &commandBuffer); - vkDestroyBuffer(vkGPU->device, stagingBuffer, NULL); - vkFreeMemory(vkGPU->device, stagingBufferMemory, NULL); + if (!vkGPU->stagingBuffer){ + vkDestroyBuffer(vkGPU->device, stagingBuffer[0], 0); + vkFreeMemory(vkGPU->device, stagingBufferMemory[0], 0); + free(stagingBuffer); + free(stagingBufferMemory); + } return resFFT; #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; diff --git a/documentation/VkFFT_API_guide.lyx b/documentation/VkFFT_API_guide.lyx index 59ef258b..18feefd0 100644 --- a/documentation/VkFFT_API_guide.lyx +++ b/documentation/VkFFT_API_guide.lyx @@ -192,7 +192,7 @@ vspace{1cm} { \backslash -large July 2023, version 1.3.0 +large October 2023, version 1.3.2 \backslash par} \end_layout @@ -526,7 +526,7 @@ end{mdframed} \begin_layout Enumerate CUDA API: CUDA and NVRTC. - Sample CMakeLists can look like this: + Sample CMakeLists can look like this: \begin_inset ERT status open @@ -1144,8 +1144,8 @@ Fourier Transform types and their definitions \begin_layout Standard VkFFT supports commonly used Complex to complex (C2C), real to complex (R2C), - complex to real (C2R) transformations and real to real (R2R) Discrete Cosine - Transformations of types II, III and IV. + complex to real (C2R) transformations and real to real (R2R) Discrete Cosine/Si +ne Transformations of types I, II, III and IV. VkFFT uses the same definitions as FFTW, except for the multidimensional FFT axis ordering: in FFTW dimensions are ordered with the decrease in consecutive elements stride, while VkFFT does the opposite - the first @@ -1178,8 +1178,8 @@ VkFFT assumes that complex numbers are stored consecutively in memory: RIRIRI... 
where R denotes the real part of the complex number and I denotes the imaginary part. There is no difference between using a float2/double2/half2 container or - access memory as float/double/half as long as the byte order remains the - same. + access memory as float/double/half/double-double as long as the byte order + remains the same. \end_layout \begin_layout Standard @@ -1274,7 +1274,7 @@ R2R (DCT) transforms \begin_layout Standard R2R transforms in VkFFT are implemented in the form of Discrete cosine transform -s of types I, II, III and IV. +s and Discrete sine transforms of types I, II, III and IV. Their definitions and transforms results match FFTW: \end_layout @@ -1288,7 +1288,7 @@ DCT-I: \begin_layout Enumerate DCT-II: -\begin_inset Formula $X_{k}=2\stackrel[n=1]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})k)$ +\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})k)$ \end_inset , inverse of DCT-III @@ -1310,6 +1310,38 @@ DCT-IV: , inverse of DCT-IV (itself) \end_layout +\begin_layout Enumerate +DST-I: +\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N+1}(n+1)(k+1))$ +\end_inset + +, inverse of DST-I (itself) +\end_layout + +\begin_layout Enumerate +DST-II: +\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N}(n+\frac{1}{2})(k+1))$ +\end_inset + +, inverse of DST-III +\end_layout + +\begin_layout Enumerate +DST-III: +\begin_inset Formula $X_{k}=(-1)^{k}x_{N-1}+2\stackrel[n=0]{N-2}{\sum}x_{n}sin(\frac{\pi}{N}(n+1)(k+\frac{1}{2}))$ +\end_inset + +, inverse of DST-II +\end_layout + +\begin_layout Enumerate +DST-IV: +\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N}(n+\frac{1}{2})(k+\frac{1}{2}))$ +\end_inset + +, inverse of DST-IV (itself) +\end_layout + \begin_layout Standard R2R transforms are performed by redefinition of them to the C2C transforms (internal C2C sequence length can be different from the input R2R sequence @@ -1450,6 +1482,7 @@ For an out-of-place R2C FFT, there is no need to pad buffer with real numbers, \end_inset complex numbers for the frequency space. + \end_layout \begin_layout Standard @@ -1482,6 +1515,11 @@ literal "false" ) \end_layout +\begin_layout Standard +By default, R2C/C2R will assume the complex-padded real strides in the in-place + mode and non-padded real strides in out-of-place mode. +\end_layout + \begin_layout Subsection VkFFT algorithms \end_layout @@ -1632,7 +1670,8 @@ ate FFT results out-of-place. To estimate if your sequence size is single upload or not, divide the amount of available shared memory (48KB - Nvidia GPUs with Vulkan/OpenCL API, 64KB - AMD GPUs, 100KB - Nvidia GPUs in CUDA API) by the complex size used - for calculations (8 byte - single precision, 16 byte - double precision). + for calculations (4 byte - half precision, 8 byte - single precision, 16 + byte - double precision, 32 byte - double-double). For 64KB of shared memory, we get 8192 as max single upload single-precision non-strided FFT, 4096 for double precision. For strided axes (H and D parts of the layout) these numbers have to be @@ -1695,11 +1734,12 @@ For even sequences there exists an easy mapping between R2C/C2R FFTs and \end_layout \begin_layout Subsubsection -R2R Discrete Cosine Transforms +R2R Discrete Cosine/Sine Transforms \end_layout \begin_layout Standard There exist many different mappings between DCT and FFT. + DSTs are reformulated as DCTs inside the VkFFT, so they use the same algorithms. 
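As an illustration of the DST definitions added earlier in this section (an editorial sketch with the same FFTW-style normalization, not VkFFT code), DST-II can be evaluated directly as:

    #include <cmath>
    #include <vector>

    // Direct O(N^2) evaluation of DST-II: X_k = 2 * sum_{n=0}^{N-1} x_n * sin(pi/N * (n + 1/2) * (k + 1))
    std::vector<double> dst2_reference(const std::vector<double>& x) {
        const double pi = 3.14159265358979323846;
        const std::size_t N = x.size();
        std::vector<double> X(N, 0.0);
        for (std::size_t k = 0; k < N; k++)
            for (std::size_t n = 0; n < N; n++)
                X[k] += 2.0 * x[n] * std::sin(pi / (double)N * (n + 0.5) * (k + 1));
        return X;
    }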
As of now, VkFFT has the following algorithms implemented (all single-upload for now): \end_layout @@ -1789,15 +1829,16 @@ VkFFT accuracy \end_layout \begin_layout Standard -To measure how VkFFT (single/double/half precision) results compare to cuFFT/roc -FFT (single/double/half precision) and FFTW (double precision), multiple - sets of systems covering full supported C2C/R2C+C2R/R2R FFT range are filled - with random complex data on the scale of [-1,1] and one transform was performed - on each system. +To measure how VkFFT (single/double/half/double-double precision) results + compare to cuFFT/rocFFT (single/double/half precision) and FFTW (double/quad + precision), multiple sets of systems covering full supported C2C/R2C+C2R/R2R + FFT range are filled with random complex data on the scale of [-1,1] and + one transform was performed on each system. Samples 11(single), 12(double), 13(half), 14(non-power of 2 C2C, single), 15(R2C+C2R, single), 16(DCT-I/II/III/IV, single), 17(DCT-I/II/III/IV, double), - 18(non-power of 2 C2C, double) are available in VkFFT Benchmark Suite to - perform VkFFT verification on any of the target platforms. + 18(non-power of 2 C2C, double), 19(double-double emulation of quad precision, + C2C) are available in VkFFT Benchmark Suite to perform VkFFT verification + on any of the target platforms. Overall, the Cooley-Tukey algorithm (Stockham autosort) exhibits logarithmic relative error scaling, similar to those of other GPU FFT libraries. Typically, the more computationally expensive algorithm is - the worse @@ -1823,15 +1864,18 @@ Double precision in VkFFT also supports two modes of calculation - by using \end_layout \begin_layout Standard -Half precision is currently only supported in the Vulkan backend and is - often experiencing precision problems with the first number of the resulting - FFT sequence, which is the sum of all input numbers. +Half precision is supported in all backends. Half precision is implemented only as a memory trick - all on-chip computations - are done in single precision, but this doesn't help with the first number - problem. + are done in single precision. Half precision can use SFU or LUT as well. \end_layout +\begin_layout Standard +Double-double emulation of quad precision uses precomputed on CPU in quad + precision twiddle factors and implements only additions and multiplications + on GPU (for now). +\end_layout + \begin_layout Standard VkFFT also supports mixed-precision operations, where memory storing is done at lower precision, compared to the on-chip calculations. @@ -1869,8 +1913,8 @@ phase vectors used in the Four Step FFT algorithm between stages \end_layout \begin_layout Itemize -phase vectors used in DCT-II/III/IV to perform a mapping between R2R and - C2C +phase vectors used in DCT/DST-II/III/IV to perform a mapping between R2R + and C2C \end_layout \begin_layout Itemize @@ -1940,6 +1984,58 @@ To do Bluestein's FFT algorithm, precomputed sequences of this size. \end_layout +\begin_layout Subsection +VkFFT support for double-double emulation of quad precision +\end_layout + +\begin_layout Standard +Since VkFFT 1.3.2 experimental support for double-double emulation of quad + precision has been added. + Double-double number is defined as an unevaluated sum of two double numbers, + second one being on the order of ~1 ULP of the first one (like 1 and 1e-16). + This boosts the significand from 53 to 106 bits and leaves the exponent + the same (11bits). 
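To make the "unevaluated sum of two doubles" concrete, here is a minimal editorial sketch of the error-free addition such formats build on (illustrative only, not VkFFT's kernel code):

    // Knuth's two-sum: produces s and e with s + e == a + b exactly; e is the rounding error of s.
    inline void twoSum(double a, double b, double& s, double& e) {
        s = a + b;
        double v = s - a;
        e = (a - (s - v)) + (b - v);
    }
    // A double-double value stores such a (hi, lo) pair, which is what extends the significand
    // to ~106 bits while keeping the 11-bit FP64 exponent described above.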
+ For example, true quad precision would have 15 bits of exponent and 113 + bits significand. + The range of double-double stays the same as in double precision (1e308 + vs 1e4932), while precision boosts from ~1e-16 to ~1e-32 (vs ~1e-34 of + true quad precision). + The double-double format of VkFFT is equivalent to __ibm128 type in gcc. + VkFFT uses quadmath library to precompute twiddle factors and initialize + kernels for double-double in the true quad precision. + Currently, conversion between quad and double-double is only possible on + CPU in true quad (GPU can only do operations on data in double-double format). + This will be changed later. +\end_layout + +\begin_layout Standard +To enable double-double support, link against quadmath and set VkFFT_use_FP128_d +ouble_double compile flag to true. +\end_layout + +\begin_layout Standard +The performance impact is rather harsh - the code becomes compute-bound + even on modern HPC clusters. + The simple radix algorithms work well and are usable (~2x impact compared + to double precision), while Bluestein and big primes Rader algorithms incur + a high toll on operations count. + This can be improved in the future, but probably not to the extent of being + VRAM bandwidth-bound on modern accelerators. +\end_layout + +\begin_layout Standard +The main usage for the double-double precision should be mixed-precision + calculations, where most of the time is spent in regular FP64 and only + some parts that hit precision boundaries are calculated in double-double. + For example, in VkFFT it is possible to perform a double precision FFT + with all on-chip calculations done in double-double. + This way the final errors should be on the scale of 1 ULP. +\end_layout + +\begin_layout Standard + +\end_layout + \begin_layout Standard \begin_inset Newpage newpage \end_inset @@ -2195,6 +2291,13 @@ I have also explored other ways to implement these container abstractions \end_layout +\begin_layout Standard +VkFFT supports two base number modes (used for all FP calculations on CPU) + - long double or quad (__float128). + The latter is needed to calculate the twiddle factors in double-double + emulation of quad precision. +\end_layout + \begin_layout Standard \begin_inset Newpage newpage \end_inset @@ -2297,6 +2400,12 @@ ENSIONS \begin_layout Plain Layout +VKFFT_ERROR_NONZERO_APP_INITIALIZATION = 8, // The app pointer is not pointing + to the zero-filled memory block +\end_layout + +\begin_layout Plain Layout + VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001, // No physical device is provided (Vulkan API) \end_layout @@ -2438,6 +2547,11 @@ ern is enabled \begin_layout Plain Layout +VKFFT_ERROR_EMPTY_app = 2015, // app pointer is zero +\end_layout + +\begin_layout Plain Layout + VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, // VkFFT has encountered unsupported radix (more than 13) during decomposition and Bluestein's FFT fallback did not work @@ -2457,8 +2571,8 @@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, // VkFFT can not do this \begin_layout Plain Layout -VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004, // VkFFT can not do this - sequence length currently - multi-upload R2R transforms, odd DCT-IV transforms +VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004, // VkFFT can not do this + sequence length currently - multi-upload R2R transforms \end_layout \begin_layout Plain Layout @@ -2781,7 +2895,7 @@ VkFFT has a unified plan management model - all different transform types/ while the next one will go into how to configure VkFFTConfiguration correctly. 
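(Editorial sketch assembled from the samples in this patch, not a new API: the unified plan lifecycle referred to here.)

    VkFFTConfiguration configuration = {};  // must be zero-initialized (see below)
    VkFFTApplication app = {};              // must be zero-initialized (see below)
    configuration.FFTdim = 1;
    configuration.size[0] = 4096;
    // ... fill backend handles (device, queue, ...) and buffer pointers/sizes as in sample_9 above ...
    VkFFTResult resFFT = initializeVkFFT(&app, configuration); // compiles or loads kernels, builds the plan
    // ... dispatch transforms, e.g. via the performVulkanFFTiFFT helper used in the benchmarks ...
    deleteVkFFT(&app);                      // frees everything the plan allocated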
All of the functions operate on VkFFTApplication and VkFFTConfiguration assuming they have been zero-initialized before usage, so do not forget - to do this when initializing: + to do this when initializing (otherwise VkFFT will throw an error): \end_layout \begin_layout Standard @@ -3146,6 +3260,7 @@ This section will cover all the parameters that can be specified in the VkFFTConfiguration struct. It will start with a short description of the struct (intended to be used as a cheat sheet), then go for each field in detail. + Here pfUINT and pfINT are defined as uint64_t and int64_t respectively. \end_layout \begin_layout Standard @@ -3178,12 +3293,12 @@ typedef struct { \begin_layout Plain Layout -uint64_t FFTdim; // FFT dimensionality (1, 2 or 3) +pfUINT FFTdim; // FFT dimensionality (1, 2 or 3) \end_layout \begin_layout Plain Layout -uint64_t size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD+ - system dimensions +pfUINT size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD+ - system dimensions \end_layout \begin_layout Plain Layout @@ -3220,8 +3335,8 @@ VkFence* fence; // Pointer to Vulkan fence, created with vkCreateFence \begin_layout Plain Layout -uint64_t isCompilerInitialized; // Specify if glslang compiler has been - intialized before (0 - off, 1 - on). +pfUINT isCompilerInitialized; // Specify if glslang compiler has been intialized + before (0 - off, 1 - on). Default 0 \end_layout @@ -3232,7 +3347,7 @@ uint64_t isCompilerInitialized; // Specify if glslang compiler has been \begin_layout Plain Layout -CUdevice* device; // Pointer to CUDA device, obtained from cuDeviceGet +CUdevice* device; // Pointer to CUDA device, obtained from cuDeviceGet \end_layout \begin_layout Plain Layout @@ -3244,9 +3359,10 @@ cudaStream_t* stream; // Pointer to streams (can be more than 1), where \begin_layout Plain Layout -uint64_t num_streams; // Try to submit CUDA kernels in multiple streams - for asynchronous execution. - Default 1 +pfUINT num_streams; // Try to submit CUDA kernels in multiple streams for + asynchronous execution. + Default 0, set to >=1 if you pass values in the stream pointer. + \end_layout \begin_layout Plain Layout @@ -3268,9 +3384,9 @@ hipStream_t* stream; // Pointer to streams (can be more than 1), where to \begin_layout Plain Layout -uint64_t num_streams; // Try to submit HIP kernels in multiple streams for +pfUINT num_streams; // Try to submit HIP kernels in multiple streams for asynchronous execution. - Default 1 + Default 0, set to >=1 if you pass values in the stream pointer. \end_layout \begin_layout Plain Layout @@ -3355,7 +3471,7 @@ mmandQueue() \begin_layout Plain Layout -uint64_t userTempBuffer; // Buffer allocated by app automatically if needed +pfUINT userTempBuffer; // Buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on) @@ -3363,13 +3479,13 @@ uint64_t userTempBuffer; // Buffer allocated by app automatically if needed \begin_layout Plain Layout -uint64_t bufferNum; // Multiple buffer sequence storage is Vulkan only. +pfUINT bufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1 \end_layout \begin_layout Plain Layout -uint64_t tempBufferNum; // Multiple buffer sequence storage is Vulkan only. +pfUINT tempBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. 
Setting to non zero value enables manual user allocation @@ -3377,31 +3493,30 @@ uint64_t tempBufferNum; // Multiple buffer sequence storage is Vulkan only. \begin_layout Plain Layout -uint64_t inputBufferNum; // Multiple buffer sequence storage is Vulkan only. +pfUINT inputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled \end_layout \begin_layout Plain Layout -uint64_t outputBufferNum; // Multiple buffer sequence storage is Vulkan - only. +pfUINT outputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled \end_layout \begin_layout Plain Layout -uint64_t kernelNum; // Multiple buffer sequence storage is Vulkan only. +pfUINT kernelNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled \end_layout \begin_layout Plain Layout -uint64_t* bufferSize; // Array of buffers sizes in bytes +pfUINT* bufferSize; // Array of buffers sizes in bytes \end_layout \begin_layout Plain Layout -uint64_t* tempBufferSize; // Array of temp buffers sizes in bytes. +pfUINT* tempBufferSize; // Array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation @@ -3409,20 +3524,20 @@ uint64_t* tempBufferSize; // Array of temp buffers sizes in bytes. \begin_layout Plain Layout -uint64_t* inputBufferSize; // Array of input buffers sizes in bytes, if - isInputFormatted is enabled +pfUINT* inputBufferSize; // Array of input buffers sizes in bytes, if isInputFor +matted is enabled \end_layout \begin_layout Plain Layout -uint64_t* outputBufferSize; // Array of output buffers sizes in bytes, if +pfUINT* outputBufferSize; // Array of output buffers sizes in bytes, if isOutputFormatted is enabled \end_layout \begin_layout Plain Layout -uint64_t* kernelSize; // Array of kernel buffers sizes in bytes, if performConvo -lution is enabled +pfUINT* kernelSize; // Array of kernel buffers sizes in bytes, if performConvolu +tion is enabled \end_layout \begin_layout Plain Layout @@ -3655,7 +3770,7 @@ MTL::Buffer** kernel; // Pointer to device buffer used to read kernel data \begin_layout Plain Layout -uint64_t bufferOffset; // Specify if VkFFT has to offset the first element +pfUINT bufferOffset; // Specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 @@ -3663,7 +3778,7 @@ uint64_t bufferOffset; // Specify if VkFFT has to offset the first element \begin_layout Plain Layout -uint64_t tempBufferOffset; // Specify if VkFFT has to offset the first element +pfUINT tempBufferOffset; // Specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 @@ -3671,23 +3786,23 @@ uint64_t tempBufferOffset; // Specify if VkFFT has to offset the first element \begin_layout Plain Layout -uint64_t inputBufferOffset; // Specify if VkFFT has to offset the first - element position inside the input buffer. +pfUINT inputBufferOffset; // Specify if VkFFT has to offset the first element + position inside the input buffer. In bytes. Default 0 \end_layout \begin_layout Plain Layout -uint64_t outputBufferOffset; // Specify if VkFFT has to offset the first - element position inside the output buffer. +pfUINT outputBufferOffset; // Specify if VkFFT has to offset the first element + position inside the output buffer. In bytes. 
Default 0 \end_layout \begin_layout Plain Layout -uint64_t kernelOffset; // Specify if VkFFT has to offset the first element +pfUINT kernelOffset; // Specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0 @@ -3695,8 +3810,8 @@ uint64_t kernelOffset; // Specify if VkFFT has to offset the first element \begin_layout Plain Layout -uint64_t specifyOffsetsAtLaunch; // Specify if offsets will be selected - with launch parameters VkFFTLaunchParams (0 - off, 1 - on). +pfUINT specifyOffsetsAtLaunch; // Specify if offsets will be selected with + launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0 \end_layout @@ -3711,7 +3826,7 @@ uint64_t specifyOffsetsAtLaunch; // Specify if offsets will be selected \begin_layout Plain Layout -uint64_t coalescedMemory; // In bytes, for Nvidia and AMD is equal to 32, +pfUINT coalescedMemory; // In bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Going to work regardless, but if specified by user correctly, the performance will be higher. @@ -3719,38 +3834,38 @@ uint64_t coalescedMemory; // In bytes, for Nvidia and AMD is equal to 32, \begin_layout Plain Layout -uint64_t aimThreads; // Aim at this many threads per block. +pfUINT aimThreads; // Aim at this many threads per block. Default 128 \end_layout \begin_layout Plain Layout -uint64_t numSharedBanks; // How many banks shared memory has. +pfUINT numSharedBanks; // How many banks shared memory has. Default 32 \end_layout \begin_layout Plain Layout -uint64_t inverseReturnToInputBuffer; // return data to the input buffer - in inverse transform (0 - off, 1 - on). +pfUINT inverseReturnToInputBuffer; // return data to the input buffer in + inverse transform (0 - off, 1 - on). isInputFormatted must be enabled \end_layout \begin_layout Plain Layout -uint64_t numberBatches; // N - used to perform multiple batches of initial +pfUINT numberBatches; // N - used to perform multiple batches of initial data. Default 1 \end_layout \begin_layout Plain Layout -uint64_t useUint64; // Use 64-bit addressing mode in generated kernels +pfUINT useUint64; // Use 64-bit addressing mode in generated kernels \end_layout \begin_layout Plain Layout -uint64_t omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this +pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C first axis for now. @@ -3759,10 +3874,10 @@ uint64_t omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this \begin_layout Plain Layout -uint64_t performBandwidthBoost; // Try to reduce coalsesced number by a - factor of X to get bigger sequence in one upload for strided axes. - Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise - +pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor + of X to get bigger sequence in one upload for strided axes. + Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), + 0 otherwise \end_layout \begin_layout Plain Layout @@ -3771,19 +3886,32 @@ uint64_t performBandwidthBoost; // Try to reduce coalsesced number by a \begin_layout Plain Layout -uint64_t doublePrecision; // Perform calculations in double precision (0 - - off, 1 - on). +pfUINT doublePrecision; // Perform calculations in double precision (0 - + off, 1 - on). 
+\end_layout + +\begin_layout Plain Layout + +pfUINT quadDoubleDoublePrecision; // Perform calculations in double-double + emulation of quad precision (0 - off, 1 - on). +\end_layout + +\begin_layout Plain Layout + +pfUINT quadDoubleDoublePrecisionDoubleMemory; // Perform calculations in + double-double emulation of quad precision, while all memory storage is + done in FP64. \end_layout \begin_layout Plain Layout -uint64_t halfPrecision; // Perform calculations in half precision (0 - off, +pfUINT halfPrecision; // Perform calculations in half precision (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t halfPrecisionMemoryOnly; // Use half precision only as input/output +pfUINT halfPrecisionMemoryOnly; // Use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out-of-place mode only). @@ -3792,65 +3920,82 @@ uint64_t halfPrecisionMemoryOnly; // Use half precision only as input/output \begin_layout Plain Layout -uint64_t doublePrecisionFloatMemory; // Use FP64 precision for all calculations, +pfUINT doublePrecisionFloatMemory; // Use FP64 precision for all calculations, while all memory storage is done in FP32. \end_layout \begin_layout Plain Layout -uint64_t performR2C; // Perform R2C/C2R decomposition (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t performDCT; // Perform DCT transformation (X - DCT type, 1-4) +pfUINT performR2C; // Perform R2C/C2R decomposition (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t disableMergeSequencesR2C; // Disable merging of two real sequences +pfUINT performDCT; // Perform DCT transformation (X - DCT type, 1-4) +\end_layout + +\begin_layout Plain Layout + +pfUINT performDST; // Perform DST transformation (X - DST type, 1-4) +\end_layout + +\begin_layout Plain Layout + +pfUINT disableMergeSequencesR2C; // Disable merging of two real sequences to reduce calculations (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t normalize; // Normalize inverse transform (0 - off, 1 - on) +pfUINT normalize; // Normalize inverse transform (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t disableReorderFourStep; // Disables unshuffling of Four step algorithm. +pfUINT disableReorderFourStep; // Disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t useLUT; // Switches from calculating sincos to using precomputed - LUT tables (0 - off, 1 - on). +pfINT useLUT; // Switches from calculating sincos to using precomputed LUT + tables (0 - off, 1 - on). Configured by initialization routine \end_layout \begin_layout Plain Layout -uint64_t makeForwardPlanOnly; // Generate code only for forward FFT (0 - - off, 1 - on) +pfINT useLUT_4step; // Switches from calculating sincos to using precomputed + LUT tables for intermediate roots of 1 in the Four-step FFT algorithm. + (-1 - off, 0 - auto, 1 - on). 
+ Configured by initialization routine \end_layout \begin_layout Plain Layout -uint64_t makeInversePlanOnly; // Generate code only for inverse FFT (0 - - off, 1 - on) +pfUINT makeForwardPlanOnly; // Generate code only for forward FFT (0 - off, + 1 - on) \end_layout \begin_layout Plain Layout -uint64_t bufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Buffer strides - default +pfUINT makeInversePlanOnly; // Generate code only for inverse FFT (0 - off, + 1 - on) +\end_layout + +\begin_layout Plain Layout + +pfUINT bufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Buffer strides - default set to x - x*y - x*y*z values \end_layout \begin_layout Plain Layout -uint64_t isInputFormatted; // Specify if input buffer is padded - 0 - padded, +pfUINT isInputFormatted; // Specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) @@ -3858,43 +4003,64 @@ uint64_t isInputFormatted; // Specify if input buffer is padded - 0 - padded, \begin_layout Plain Layout -uint64_t isOutputFormatted; // Specify if output buffer is padded - 0 - - padded, 1 - not padded. +pfUINT isOutputFormatted; // Specify if output buffer is padded - 0 - padded, + 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) \end_layout \begin_layout Plain Layout -uint64_t inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Input buffer strides. +pfUINT inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values \end_layout \begin_layout Plain Layout -uint64_t outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Output buffer - strides. +pfUINT outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values \end_layout \begin_layout Plain Layout -uint64_t considerAllAxesStrided; // Will create plan for non-strided axis +pfUINT swapTo2Stage4Step; // Specify at which number to switch from 1 upload + to 2 upload 4-step FFT, in case if making max sequence size lower than + coalesced sequence helps to combat TLB misses. + Default 0 - disabled. + +\end_layout + +\begin_layout Plain Layout + +pfUINT swapTo3Stage4Step; // Specify at which power of 2 to switch from + 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower + than coalesced sequence helps to combat TLB misses. + Default 0 - disabled. 
+ Must be at least 17 +\end_layout + +\begin_layout Plain Layout + +pfUINT considerAllAxesStrided; // Will create plan for non-strided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t keepShaderCode; // Will keep shader code and print all executed - shaders during the plan execution in order (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t printMemoryLayout; // Will print order of buffers used in shaders +pfUINT keepShaderCode; // Will keep shader code and print all executed shaders + during the plan execution in order (0 - off, 1 - on) +\end_layout + +\begin_layout Plain Layout + +pfUINT printMemoryLayout; // Will print order of buffers used in shaders (0 - off, 1 - on) \end_layout @@ -3904,9 +4070,9 @@ uint64_t printMemoryLayout; // Will print order of buffers used in shaders \begin_layout Plain Layout -uint64_t saveApplicationToString; // Will save all compiled binaries to - VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated - with deleteVkFFT call). +pfUINT saveApplicationToString; // Will save all compiled binaries to VkFFTAppli +cation.saveApplicationString (will be allocated by VkFFT, deallocated with + deleteVkFFT call). VkFFTApplication.applicationStringSize will contain size of binary in bytes. Currently disabled in Metal backend. (0 - off, 1 - on) @@ -3914,8 +4080,8 @@ uint64_t saveApplicationToString; // Will save all compiled binaries to \begin_layout Plain Layout -uint64_t loadApplicationFromString; // Will load all binaries from loadApplicati -onString instead of recompiling them (must be allocated by user, must contain +pfUINT loadApplicationFromString; // Will load all binaries from loadApplication +String instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveA pplicationString). Currently disabled in Metal backend. @@ -3937,8 +4103,8 @@ void* loadApplicationString; // Memory binary array through which user can \begin_layout Plain Layout -uint64_t disableSetLocale; // disables all VkFFT attempts to set locale - to C - user must ensure that VkFFT has C locale during the plan initialization. +pfUINT disableSetLocale; // disables all VkFFT attempts to set locale to + C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0. @@ -3950,12 +4116,12 @@ uint64_t disableSetLocale; // disables all VkFFT attempts to set locale \begin_layout Plain Layout -//optional Bluestein optimizations: (default 0 if not stated otherwise) +// Optional Bluestein optimizations: (default 0 if not stated otherwise) \end_layout \begin_layout Plain Layout -uint64_t fixMaxRadixBluestein; // controls the padding of sequences in Bluestein +pfUINT fixMaxRadixBluestein; // controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. @@ -3966,14 +4132,14 @@ uint64_t fixMaxRadixBluestein; // controls the padding of sequences in Bluestein \begin_layout Plain Layout -uint64_t forceBluesteinSequenceSize; // force the sequence size to pad to +pfUINT forceBluesteinSequenceSize; // force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13. 
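(Editorial illustration of the constraint just stated; the values are examples, not VkFFT defaults.)

    // For a prime length N = 89, a forced Bluestein size must satisfy M >= 2*89 - 1 = 177
    // and factor into primes 2-13; M = 192 = 2^6 * 3 is one valid choice.
    configuration.forceBluesteinSequenceSize = 192;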
\end_layout \begin_layout Plain Layout -uint64_t useCustomBluesteinPaddingPattern; // force the sequence sizes to +pfUINT useCustomBluesteinPaddingPattern; // force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. @@ -3989,12 +4155,12 @@ uint64_t useCustomBluesteinPaddingPattern; // force the sequence sizes to \begin_layout Plain Layout -uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern +pfUINT* primeSizes; // described in useCustomBluesteinPaddingPattern \end_layout \begin_layout Plain Layout -uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern +pfUINT* paddedSizes; // described in useCustomBluesteinPaddingPattern \end_layout \begin_layout Plain Layout @@ -4003,7 +4169,7 @@ uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern \begin_layout Plain Layout -uint64_t fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm +pfUINT fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. @@ -4013,8 +4179,8 @@ uint64_t fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm \begin_layout Plain Layout -uint64_t fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for - radix primes from this number. +pfUINT fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for radix + primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40) @@ -4026,8 +4192,8 @@ uint64_t fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for \begin_layout Plain Layout -uint64_t fixMinRaderPrimeFFT; // start FFT convolution version of Rader - for radix primes from this number. +pfUINT fixMinRaderPrimeFFT; // start FFT convolution version of Rader for + radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. @@ -4037,7 +4203,7 @@ uint64_t fixMinRaderPrimeFFT; // start FFT convolution version of Rader \begin_layout Plain Layout -uint64_t fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix +pfUINT fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory. 
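(Editorial summary of the four Rader/Bluestein thresholds above, simplified by assuming fixMaxRaderPrimeMult equals fixMinRaderPrimeFFT; this is an interpretation of the field comments, not a statement of VkFFT internals.)

    // For a prime radix P encountered during decomposition:
    //   P <  fixMinRaderPrimeMult                        -> built-in radix kernels (primes up to 13)
    //   fixMinRaderPrimeMult <= P < fixMinRaderPrimeFFT  -> direct-multiplication Rader kernels
    //   fixMinRaderPrimeFFT  <= P < fixMaxRaderPrimeFFT  -> FFT-convolution Rader kernels
    //   P >= fixMaxRaderPrimeFFT (or P does not fit in shared memory) -> Bluestein's algorithm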
@@ -4054,26 +4220,26 @@ uint64_t fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix \begin_layout Plain Layout -uint64_t performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // Don't read some +pfUINT performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // Don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS]; // Specify start boundary +pfUINT fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS]; // Specify start boundary of zero block in the system for each axis \end_layout \begin_layout Plain Layout -uint64_t fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS]; // Specify end boundary +pfUINT fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS]; // Specify end boundary of zero block in the system for each axis \end_layout \begin_layout Plain Layout -uint64_t frequencyZeroPadding; // Set to 1 if zeropadding of frequency domain, +pfUINT frequencyZeroPadding; // Set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding \end_layout @@ -4088,41 +4254,40 @@ uint64_t frequencyZeroPadding; // Set to 1 if zeropadding of frequency domain, \begin_layout Plain Layout -uint64_t performConvolution; // Perform convolution in this application - (0 - off, 1 - on). +pfUINT performConvolution; // Perform convolution in this application (0 + - off, 1 - on). Disables reorderFourStep parameter \end_layout \begin_layout Plain Layout -uint64_t coordinateFeatures; // C - coordinate, or dimension of features - vector. +pfUINT coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of a vector \end_layout \begin_layout Plain Layout -uint64_t matrixConvolution; // If equal to 2 perform 2x2, if equal to 3 - perform 3x3 matrix-vector convolution. +pfUINT matrixConvolution; // If equal to 2 perform 2x2, if equal to 3 perform + 3x3 matrix-vector convolution. Overrides coordinateFeatures \end_layout \begin_layout Plain Layout -uint64_t symmetricKernel; // Specify if kernel in 2x2 or 3x3 matrix convolution +pfUINT symmetricKernel; // Specify if kernel in 2x2 or 3x3 matrix convolution is symmetric \end_layout \begin_layout Plain Layout -uint64_t numberKernels; // N - only used in convolution step - specify how +pfUINT numberKernels; // N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output \end_layout \begin_layout Plain Layout -uint64_t kernelConvolution; // Specify if this application is used to create +pfUINT kernelConvolution; // Specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation \end_layout @@ -4138,10 +4303,10 @@ uint64_t kernelConvolution; // Specify if this application is used to create \begin_layout Plain Layout -uint64_t registerBoost; // Specify if register file size is bigger than - shared memory and can be used to extend it X times (on Nvidia 256KB register - file can be used instead of 32KB of shared memory, set this constant to - 4 to emulate 128KB of shared memory). +pfUINT registerBoost; // Specify if register file size is bigger than shared + memory and can be used to extend it X times (on Nvidia 256KB register file + can be used instead of 32KB of shared memory, set this constant to 4 to + emulate 128KB of shared memory). 
Defaults: Nvidia - 4 in Vulkan/OpenCL, 1 in CUDA backend; AMD - 2 if shared memory >= 64KB, else 4 in Vulkan/OpenCL backend, 1 in HIP backend; Intel - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends, @@ -4150,40 +4315,35 @@ uint64_t registerBoost; // Specify if register file size is bigger than \begin_layout Plain Layout -uint64_t registerBoostNonPow2; // Specify if register overutilization should +pfUINT registerBoostNonPow2; // Specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on) \end_layout \begin_layout Plain Layout -uint64_t registerBoost4Step; // Specify if register file overutilization - should be used in big sequences (>2^14), same definition as registerBoost. +pfUINT registerBoost4Step; // Specify if register file overutilization should + be used in big sequences (>2^14), same definition as registerBoost. Default 1 \end_layout \begin_layout Plain Layout -//not used techniques: \end_layout \begin_layout Plain Layout -uint64_t swapTo3Stage4Step; // Specify at which power of 2 to switch from - 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower - than coalesced sequence helps to combat TLB misses. - Default 0 - disabled. - Must be at least 17 +//not used techniques: \end_layout \begin_layout Plain Layout -uint64_t devicePageSize; // In KB, the size of a page on the GPU. +pfUINT devicePageSize; // In KB, the size of a page on the GPU. Setting to 0 disables local buffer split in pages \end_layout \begin_layout Plain Layout -uint64_t localPageSize; // In KB, the size to split page into if sequence +pfUINT localPageSize; // In KB, the size to split page into if sequence spans multiple devicePageSize pages \end_layout @@ -4199,77 +4359,77 @@ uint64_t localPageSize; // In KB, the size to split page into if sequence \begin_layout Plain Layout -uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device +pfUINT computeCapabilityMajor; // CUDA/HIP compute capability of the device \end_layout \begin_layout Plain Layout -uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device +pfUINT computeCapabilityMinor; // CUDA/HIP compute capability of the device \end_layout \begin_layout Plain Layout -uint64_t maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGr -oupCount from VkPhysicalDeviceLimits +pfUINT maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGrou +pCount from VkPhysicalDeviceLimits \end_layout \begin_layout Plain Layout -uint64_t maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGro -upCount from VkPhysicalDeviceLimits +pfUINT maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroup +Count from VkPhysicalDeviceLimits \end_layout \begin_layout Plain Layout -uint64_t maxThreadsNum; // Max number of threads from VkPhysicalDeviceLimits +pfUINT maxThreadsNum; // Max number of threads from VkPhysicalDeviceLimits \end_layout \begin_layout Plain Layout -uint64_t sharedMemorySizeStatic; // Available for static allocation shared +pfUINT sharedMemorySizeStatic; // Available for static allocation shared memory size, in bytes \end_layout \begin_layout Plain Layout -uint64_t sharedMemorySize; // Available for allocation shared memory size, +pfUINT sharedMemorySize; // Available for allocation shared memory size, in bytes \end_layout \begin_layout Plain Layout -uint64_t sharedMemorySizePow2; // Power of 2 which is less or equal to sharedMem -orySize, in bytes +pfUINT sharedMemorySizePow2; // Power of 2 
which is less or equal to sharedMemor +ySize, in bytes \end_layout \begin_layout Plain Layout -uint64_t warpSize; // Number of threads per warp/wavefront. +pfUINT warpSize; // Number of threads per warp/wavefront. \end_layout \begin_layout Plain Layout -uint64_t halfThreads; // Intel fix +pfUINT halfThreads; // Intel fix \end_layout \begin_layout Plain Layout -uint64_t allocateTempBuffer; // Buffer allocated by app automatically if - needed to reorder Four step algorithm. +pfUINT allocateTempBuffer; // Buffer allocated by app automatically if needed + to reorder Four step algorithm. Parameter to check if it has been allocated \end_layout \begin_layout Plain Layout -uint64_t reorderFourStep; // Unshuffle Four step algorithm. +pfUINT reorderFourStep; // Unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1. \end_layout \begin_layout Plain Layout -int64_t maxCodeLength; // Specify how big can be buffer used for code generation +pfINT maxCodeLength; // Specify how big can be buffer used for code generation (in char). Default 1000000 chars. @@ -4277,7 +4437,7 @@ int64_t maxCodeLength; // Specify how big can be buffer used for code generation \begin_layout Plain Layout -int64_t maxTempLength; // Specify how big can be buffer used for intermediate +pfINT maxTempLength; // Specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number. @@ -4285,19 +4445,19 @@ int64_t maxTempLength; // Specify how big can be buffer used for intermediate \begin_layout Plain Layout -uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluest -einPaddingPattern +pfUINT autoCustomBluesteinPaddingPattern; // default value for useCustomBluestei +nPaddingPattern \end_layout \begin_layout Plain Layout -uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow +pfUINT useRaderUintLUT; // allocate additional LUT to store g_pow \end_layout \begin_layout Plain Layout -uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - - AMD, etc +pfUINT vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, + etc \end_layout \begin_layout Plain Layout @@ -4332,12 +4492,12 @@ cudaEvent_t* stream_event; // Filled at app creation \begin_layout Plain Layout -uint64_t streamCounter; // Filled at app creation +pfUINT streamCounter; // Filled at app creation \end_layout \begin_layout Plain Layout -uint64_t streamID; // Filled at app creation +pfUINT streamID; // Filled at app creation \end_layout \begin_layout Plain Layout @@ -4352,12 +4512,12 @@ hipEvent_t* stream_event; // Filled at app creation \begin_layout Plain Layout -uint64_t streamCounter; // Filled at app creation +pfUINT streamCounter; // Filled at app creation \end_layout \begin_layout Plain Layout -uint64_t streamID; // Filled at app creation +pfUINT streamID; // Filled at app creation \end_layout \begin_layout Plain Layout @@ -4490,7 +4650,8 @@ cudaStream_t* stream - Pointer to streams (can be more than 1), where to \begin_layout Itemize uint64_t num_streams - Try to submit CUDA kernels in multiple streams for asynchronous execution. - Default 1 + Default 0, set to >=1 if you pass values in the stream pointer. + \end_layout \begin_layout Standard @@ -4513,7 +4674,8 @@ hipStream_t* stream - Pointer to streams (can be more than 1), where to \begin_layout Itemize uint64_t num_streams - Try to submit HIP kernels in multiple streams for asynchronous execution. 
- Default 1 + Default 0, set to >=1 if you pass values in the stream pointer. + \end_layout \begin_layout Standard @@ -4719,18 +4881,25 @@ uint64_t numberBatches - N parameter of the transform. \begin_layout Standard uint64_t performR2C - perform R2C/C2R decomposition. - performDCT must be set to 0. + performDCT and performDST must be set to 0. Default 0, set to 1 to enable. Optional parameter. \end_layout \begin_layout Standard uint64_t performDCT - perform DCT transformation. - performR2C must be set to 0. + performR2C and performDST must be set to 0. Default 0, set to X for DCT-X (currently supported X: 1, 2, 3 and 4). Optional parameter. \end_layout +\begin_layout Standard +uint64_t performDST - perform DST transformation. + performR2C and performDCT must be set to 0. + Default 0, set to X for DST-X (currently supported X: 1, 2, 3 and 4). + Optional parameter. +\end_layout + \begin_layout Standard uint64_t normalize - enabling this parameter will make the inverse transform divide the result by the FFT length. @@ -4762,18 +4931,30 @@ uint64_t doublePrecisionFloatMemory - perform calculations in double precision, In Vulkan/OpenCL/Level Zero your device must support double precision functiona lity. Metal API does not support double precision. - Experimental feature. Optional parameter. \end_layout +\begin_layout Standard +uint64_t quadDoubleDoublePrecision - perform calculations in double-double + emulation of quad precision (0 - off, 1 - on). + Optional parameter. + Requires quadmath library (for now). +\end_layout + +\begin_layout Standard +uint64_t quadDoubleDoublePrecisionDoubleMemory - perform calculations in + double-double emulation of quad precision, while all memory storage is + done in FP64. + Optional parameter. + Requires quadmath library (for now). +\end_layout + \begin_layout Standard uint64_t halfPrecision - half-precision in VkFFT is implemented only as memory optimization. All calculations are done in single precision (similar way as doublePrecisionFl oatMemory works for double and single precision). Default 0, set to 1 to enable. - Works only in Vulkan API now, experimental feature (half precision seems - to have bad precision for the first FFT element). Optional parameter. \end_layout @@ -4789,7 +4970,6 @@ uint64_t halfPrecisionMemoryOnly - another way of performing half-precision the base halfPrecision case. halfPrecision must be set to 1. Default 0, set to 1 to enable. - Works only in Vulkan API now, experimental feature. Optional parameter. \end_layout @@ -4879,8 +5059,8 @@ uint64_t useUint64 - forces 64-bit addressing in generated kernels. \begin_layout Standard uint64_t performBandwidthBoost - try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. 
- Default: -1(inf) for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 - otherwise + Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm (or -1 if + DCT and DST), 0 otherwise \end_layout \begin_layout Standard @@ -5401,6 +5581,11 @@ Sample 7 - FFT + iFFT C2C Bluestein benchmark in single precision Sample 8 - FFT + iFFT C2C Bluestein benchmark in double precision \end_layout +\begin_layout Itemize +Sample 9 - FFT + iFFT C2C benchmark 1D batched in double-double emulation + of quad precision +\end_layout + \begin_layout Itemize Sample 10 - multiple buffers (4 by default) split version of benchmark 0 \end_layout @@ -5445,6 +5630,11 @@ Sample 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision \end_layout +\begin_layout Itemize +Sample 19 - VkFFT / FFTW C2C precision test in double-double emulation of + quad precision +\end_layout + \begin_layout Itemize Sample 50 - convolution example with identity kernel \end_layout @@ -5473,7 +5663,12 @@ Sample 1000 - FFT + iFFT C2C benchmark 1D batched in single precision: all \end_layout \begin_layout Itemize -Sample 1001 - FFT + iFFT C2C benchmark 1D batched in single precision: all +Sample 1001 - FFT + iFFT C2C benchmark 1D batched in double precision: all + supported systems from 2 to 4096 +\end_layout + +\begin_layout Itemize +Sample 1002 - FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096 \end_layout @@ -5482,6 +5677,11 @@ Sample 1003 - FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096 \end_layout +\begin_layout Itemize +Sample 1004 - FFT + iFFT C2C benchmark 1D batched in double-double emulation + of quad precision: all supported systems from 2 to 4096 +\end_layout + \begin_layout Subsection utils_VkFFT helper routines \end_layout @@ -7154,6 +7354,11 @@ if (P == 2) configuration.halfPrecision = 1; \begin_layout Plain Layout +if (P == 3) configuration.quadDoubleDoublePrecision = 1; +\end_layout + +\begin_layout Plain Layout + \end_layout \begin_layout Plain Layout diff --git a/documentation/VkFFT_API_guide.pdf b/documentation/VkFFT_API_guide.pdf index c855ec43..8001ed18 100644 Binary files a/documentation/VkFFT_API_guide.pdf and b/documentation/VkFFT_API_guide.pdf differ diff --git a/documentation/VkFFT_API_guide.tex b/documentation/VkFFT_API_guide.tex index bcbd7d21..d71eccd7 100644 --- a/documentation/VkFFT_API_guide.tex +++ b/documentation/VkFFT_API_guide.tex @@ -44,7 +44,7 @@ {\Large Dmitrii Tolmachev\par} \vspace{1cm} -{\large July 2023, version 1.3.0\par} +{\large October 2023, version 1.3.2\par} \end{titlepage} \newpage{} @@ -128,7 +128,7 @@ \subsection{Installing VkFFT} target_link_libraries(${PROJECT_NAME} PUBLIC SPIRV glslang Vulkan::Vulkan VkFFT) \end{minted} \end{mdframed} -\item CUDA API: CUDA and NVRTC. Sample CMakeLists can look like this:\begin{mdframed}[backgroundcolor=bg] +\item CUDA API: CUDA and NVRTC. Sample CMakeLists can look like this: \begin{mdframed}[backgroundcolor=bg] \begin{minted}[tabsize=4,obeytabs,breaklines]{cmake} find_package(CUDA 9.0 REQUIRED) enable_language(CUDA) @@ -264,14 +264,14 @@ \subsection{Fourier Transform types and their definitions} VkFFT supports commonly used Complex to complex (C2C), real to complex (R2C), complex to real (C2R) transformations and real to real (R2R) -Discrete Cosine Transformations of types II, III and IV. 
VkFFT uses -the same definitions as FFTW, except for the multidimensional FFT -axis ordering: in FFTW dimensions are ordered with the decrease in -consecutive elements stride, while VkFFT does the opposite - the first -axis is the non-strided axis (the one that has elements located consecutively -in memory with no gaps, usually named as the X-axis). So, in FFTW -dimensions are specified as ZYX and in VkFFT as XYZ. This felt more -logical to me - no matter if there are 1, 2, 3 or more dimensions, +Discrete Cosine/Sine Transformations of types I, II, III and IV. VkFFT +uses the same definitions as FFTW, except for the multidimensional +FFT axis ordering: in FFTW dimensions are ordered with the decrease +in consecutive elements stride, while VkFFT does the opposite - the +first axis is the non-strided axis (the one that has elements located +consecutively in memory with no gaps, usually named as the X-axis). +So, in FFTW dimensions are specified as ZYX and in VkFFT as XYZ. This +felt more logical to me - no matter if there are 1, 2, 3 or more dimensions, the user can always find the axis with the same stride at the same position. This choice doesn't require any modification in the user's data management - just provide the FFT dimensions in the reverse order @@ -291,7 +291,7 @@ \subsection{Fourier Transform types and their definitions} VkFFT assumes that complex numbers are stored consecutively in memory: RIRIRI... where R denotes the real part of the complex number and I denotes the imaginary part. There is no difference between using -a float2/double2/half2 container or access memory as float/double/half +a float2/double2/half2 container or access memory as float/double/half/double-double as long as the byte order remains the same. This section and the next one will cover the basics of VkFFT data @@ -326,17 +326,25 @@ \subsubsection{R2C/C2R transforms} \subsubsection{R2R (DCT) transforms} R2R transforms in VkFFT are implemented in the form of Discrete cosine -transforms of types I, II, III and IV. Their definitions and transforms -results match FFTW: +transforms and Discrete sine transforms of types I, II, III and IV. +Their definitions and transforms results match FFTW: \begin{enumerate} \item DCT-I: $X_{k}=x_{0}+(-1)^{k}x_{N-1}+2\stackrel[n=1]{N-2}{\sum}x_{n}cos(\frac{\pi}{N-1}nk)$, inverse of DCT-I (itself) -\item DCT-II: $X_{k}=2\stackrel[n=1]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})k)$, +\item DCT-II: $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})k)$, inverse of DCT-III \item DCT-III: $X_{k}=x_{0}+2\stackrel[n=1]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}n(k+\frac{1}{2}))$, inverse of DCT-II \item DCT-IV: $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})(k+\frac{1}{2}))$, inverse of DCT-IV (itself) +\item DST-I: $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N+1}(n+1)(k+1))$, +inverse of DST-I (itself) +\item DST-II: $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N}(n+\frac{1}{2})(k+1))$, +inverse of DST-III +\item DST-III: $X_{k}=(-1)^{k}x_{N-1}+2\stackrel[n=0]{N-2}{\sum}x_{n}sin(\frac{\pi}{N}(n+1)(k+\frac{1}{2}))$, +inverse of DST-II +\item DST-IV: $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}sin(\frac{\pi}{N}(n+\frac{1}{2})(k+\frac{1}{2}))$, +inverse of DST-IV (itself) \end{enumerate} R2R transforms are performed by redefinition of them to the C2C transforms (internal C2C sequence length can be different from the input R2R @@ -423,7 +431,7 @@ \subsubsection{VkFFT buffers strides. 
A special case of R2C/C2R transforms} For an out-of-place R2C FFT, there is no need to pad buffer with real numbers, but user must specify H stride there (as it differs to default one) - $N_{x}$ real elements for real space and $floor(\frac{N_{x}}{2})+1$ -complex numbers for the frequency space. +complex numbers for the frequency space. An out-of-place C2R FFT is a more tricky transform. In the multidimensional case, the main buffer will be written to and read from multiple times. @@ -440,6 +448,9 @@ \subsubsection{VkFFT buffers strides. A special case of R2C/C2R transforms} buffer; C2R will modify the buffer it reads from in some cases (see issue \href{https://github.com/DTolm/VkFFT/issues/58\#issuecomment-1007205682}{\#58}) +By default, R2C/C2R will assume the complex-padded real strides in +the in-place mode and non-padded real strides in out-of-place mode. + \subsection{VkFFT algorithms} VkFFT implements a wide range of algorithms to compute different types @@ -513,13 +524,14 @@ \subsubsection{The Four-Step FFT algorithm} To estimate if your sequence size is single upload or not, divide the amount of available shared memory (48KB - Nvidia GPUs with Vulkan/OpenCL API, 64KB - AMD GPUs, 100KB - Nvidia GPUs in CUDA API) by the complex -size used for calculations (8 byte - single precision, 16 byte - double -precision). For 64KB of shared memory, we get 8192 as max single upload -single-precision non-strided FFT, 4096 for double precision. For strided -axes (H and D parts of the layout) these numbers have to be divided -by 4 and 2 respectively to achieve coalescing, resulting in 2048 length -for single upload in both precisions. For more information on coalescing -see: coalescing API reference. +size used for calculations (4 byte - half precision, 8 byte - single +precision, 16 byte - double precision, 32 byte - double-double). For +64KB of shared memory, we get 8192 as max single upload single-precision +non-strided FFT, 4096 for double precision. For strided axes (H and +D parts of the layout) these numbers have to be divided by 4 and 2 +respectively to achieve coalescing, resulting in 2048 length for single +upload in both precisions. For more information on coalescing see: +coalescing API reference. In the case of the Four-Step FFT algorithm, tempBuffer size has to be at least the same as the default main buffer size. It does not @@ -560,10 +572,11 @@ \subsubsection{R2C/C2R multi-upload FFT algorithm} done with the help of the Four-Step FFT algorithm. When FFT is done, separate post-processing for R2C/pre-processing for C2R is applied. -\subsubsection{R2R Discrete Cosine Transforms} +\subsubsection{R2R Discrete Cosine/Sine Transforms} -There exist many different mappings between DCT and FFT. As of now, -VkFFT has the following algorithms implemented (all single-upload +There exist many different mappings between DCT and FFT. DSTs are +reformulated as DCTs inside the VkFFT, so they use the same algorithms. +As of now, VkFFT has the following algorithms implemented (all single-upload for now): \begin{itemize} \item DCT-I - mapping between R2R and C2C of the $2N-2$ length. 
For non-strided @@ -615,16 +628,17 @@ \subsubsection{Convolution and cross-correlation support} \subsection{VkFFT accuracy} -To measure how VkFFT (single/double/half precision) results compare -to cuFFT/rocFFT (single/double/half precision) and FFTW (double precision), -multiple sets of systems covering full supported C2C/R2C+C2R/R2R FFT -range are filled with random complex data on the scale of {[}-1,1{]} -and one transform was performed on each system. Samples 11(single), -12(double), 13(half), 14(non-power of 2 C2C, single), 15(R2C+C2R, -single), 16(DCT-I/II/III/IV, single), 17(DCT-I/II/III/IV, double), -18(non-power of 2 C2C, double) are available in VkFFT Benchmark Suite -to perform VkFFT verification on any of the target platforms. Overall, -the Cooley-Tukey algorithm (Stockham autosort) exhibits logarithmic +To measure how VkFFT (single/double/half/double-double precision) +results compare to cuFFT/rocFFT (single/double/half precision) and +FFTW (double/quad precision), multiple sets of systems covering full +supported C2C/R2C+C2R/R2R FFT range are filled with random complex +data on the scale of {[}-1,1{]} and one transform was performed on +each system. Samples 11(single), 12(double), 13(half), 14(non-power +of 2 C2C, single), 15(R2C+C2R, single), 16(DCT-I/II/III/IV, single), +17(DCT-I/II/III/IV, double), 18(non-power of 2 C2C, double), 19(double-double +emulation of quad precision, C2C) are available in VkFFT Benchmark +Suite to perform VkFFT verification on any of the target platforms. +Overall, the Cooley-Tukey algorithm (Stockham autosort) exhibits logarithmic relative error scaling, similar to those of other GPU FFT libraries. Typically, the more computationally expensive algorithm is - the worse its precision is. So, Bluestein's algorithm has lower accuracy than @@ -642,12 +656,13 @@ \subsection{VkFFT accuracy} one, as polynomial sincos approximation is too compute-heavy for modern GPUs. It is selected by default on all devices. -Half precision is currently only supported in the Vulkan backend and -is often experiencing precision problems with the first number of -the resulting FFT sequence, which is the sum of all input numbers. -Half precision is implemented only as a memory trick - all on-chip -computations are done in single precision, but this doesn't help with -the first number problem. Half precision can use SFU or LUT as well. +Half precision is supported in all backends. Half precision is implemented +only as a memory trick - all on-chip computations are done in single +precision. Half precision can use SFU or LUT as well. + +Double-double emulation of quad precision uses precomputed on CPU +in quad precision twiddle factors and implements only additions and +multiplications on GPU (for now). VkFFT also supports mixed-precision operations, where memory storing is done at lower precision, compared to the on-chip calculations. 
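The precision switches above map one-to-one onto configuration fields. As a minimal, hedged sketch (a fragment to place inside the user's setup routine; backend device/queue pointers and buffer parameters are omitted, and it assumes the initializeVkFFT entry point from the application-management section of this guide), requesting the new double-double mode is a single extra field in the zero-initialized configuration:

\begin{mdframed}[backgroundcolor=bg]
\begin{minted}[tabsize=4,obeytabs,breaklines]{C}
#include "vkFFT.h"

VkFFTConfiguration configuration = {0}; /* VkFFT requires zero-initialization */
VkFFTApplication app = {0};             /* same requirement for the application */
configuration.FFTdim = 1;               /* 1D transform */
configuration.size[0] = 4096;           /* sequence length */
/* Full double-double mode: storage and on-chip math in emulated quad precision. */
configuration.quadDoubleDoublePrecision = 1;
/* Mixed alternative: keep memory in FP64, do only on-chip math in double-double. */
/* configuration.quadDoubleDoublePrecisionDoubleMemory = 1; */
/* ... fill in backend device pointers and buffer parameters here ... */
VkFFTResult res = initializeVkFFT(&app, configuration);
\end{minted}
\end{mdframed}

Both quad modes currently require building with the VkFFT\_use\_FP128\_double\_double CMake option and linking against quadmath, as described earlier.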
@@ -668,8 +683,8 @@ \subsubsection{LUT allocations} \begin{itemize} \item twiddle factors for each radix stage of Stockham FFT calculation \item phase vectors used in the Four Step FFT algorithm between stages -\item phase vectors used in DCT-II/III/IV to perform a mapping between R2R -and C2C +\item phase vectors used in DCT/DST-II/III/IV to perform a mapping between +R2R and C2C \item phase vectors used in post-processing for R2C/pre-processing for C2R for even length sequences as C2C of half size \end{itemize} @@ -706,6 +721,41 @@ \subsubsection{Bluestein's buffers allocation} the biggest size needed among axes and allocated tempBuffer of this size. +\subsection{VkFFT support for double-double emulation of quad precision} + +Since VkFFT 1.3.2 experimental support for double-double emulation +of quad precision has been added. Double-double number is defined +as an unevaluated sum of two double numbers, second one being on the +order of \textasciitilde 1 ULP of the first one (like 1 and 1e-16). +This boosts the significand from 53 to 106 bits and leaves the exponent +the same (11bits). For example, true quad precision would have 15 +bits of exponent and 113 bits significand. The range of double-double +stays the same as in double precision (1e308 vs 1e4932), while precision +boosts from \textasciitilde 1e-16 to \textasciitilde 1e-32 (vs \textasciitilde 1e-34 +of true quad precision). The double-double format of VkFFT is equivalent +to \_\_ibm128 type in gcc. VkFFT uses quadmath library to precompute +twiddle factors and initialize kernels for double-double in the true +quad precision. Currently, conversion between quad and double-double +is only possible on CPU in true quad (GPU can only do operations on +data in double-double format). This will be changed later. + +To enable double-double support, link against quadmath and set VkFFT\_use\_FP128\_double\_double +compile flag to true. + +The performance impact is rather harsh - the code becomes compute-bound +even on modern HPC clusters. The simple radix algorithms work well +and are usable (\textasciitilde 2x impact compared to double precision), +while Bluestein and big primes Rader algorithms incur a high toll +on operations count. This can be improved in the future, but probably +not to the extent of being VRAM bandwidth-bound on modern accelerators. + +The main usage for the double-double precision should be mixed-precision +calculations, where most of the time is spent in regular FP64 and +only some parts that hit precision boundaries are calculated in double-double. +For example, in VkFFT it is possible to perform a double precision +FFT with all on-chip calculations done in double-double. This way +the final errors should be on the scale of 1 ULP. + \newpage{} \section{Runtime code optimization platform\label{sec:Modern-GPU-architecture.}} @@ -866,6 +916,11 @@ \subsection{Container abstractions} as a bonus. All this is subject to change in the future if people present convincing arguments. +VkFFT supports two base number modes (used for all FP calculations +on CPU) - long double or quad (\_\_float128). The latter is needed +to calculate the twiddle factors in double-double emulation of quad +precision. 
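To make this concrete, here is a hedged illustration of what the quad base mode is used for; this is not VkFFT's internal code, and the dd\_t struct and make\_twiddle\_dd helper are names invented for this example. The twiddle factor is evaluated with quadmath in \_\_float128 and then divided into the hi/lo pair that forms a double-double value:

\begin{mdframed}[backgroundcolor=bg]
\begin{minted}[tabsize=4,obeytabs,breaklines]{C}
#include <stdint.h>
#include <quadmath.h>

typedef struct { double hi; double lo; } dd_t; /* illustrative only */

/* Evaluate exp(-2*pi*i*k/N) in __float128, then store each component as an
   unevaluated sum hi + lo, where |lo| is on the order of 1 ULP of hi.
   Together hi and lo carry ~106 significand bits, as described above. */
static void make_twiddle_dd(uint64_t k, uint64_t N, dd_t* re, dd_t* im) {
	__float128 angle = (-2.0Q * M_PIq * (__float128)k) / (__float128)N;
	__float128 c = cosq(angle);
	__float128 s = sinq(angle);
	re->hi = (double)c; re->lo = (double)(c - (__float128)re->hi);
	im->hi = (double)s; im->lo = (double)(s - (__float128)im->hi);
}
\end{minted}
\end{mdframed}

With gcc this sketch needs -lquadmath, matching the quadmath dependency of the double-double mode stated earlier.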
+ \newpage{} \section{VkFFT API Reference} @@ -892,6 +947,7 @@ \subsection{Return value VkFFTResult} VKFFT_ERROR_NULL_TEMP_PASSED = 5, // Internal kernel generation error VKFFT_ERROR_MATH_FAILED = 6, // Math instruction in code generator failed VKFFT_ERROR_FFTdim_GT_MAX_FFT_DIMENSIONS = 7, // User specified a number of dimensions higher than the code is compiled to handle with VKFFT_MAX_FFT_DIMENSIONS +VKFFT_ERROR_NONZERO_APP_INITIALIZATION = 8, // The app pointer is not pointing to the zero-filled memory block VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001, // No physical device is provided (Vulkan API) VKFFT_ERROR_INVALID_DEVICE = 1002, // No device is provided (All APIs) VKFFT_ERROR_INVALID_QUEUE = 1003, // No queue is provided (Vulkan API) @@ -916,10 +972,11 @@ \subsection{Return value VkFFTResult} VKFFT_ERROR_EMPTY_kernel = 2012, // Same error as VKFFT_ERROR_EMPTY_buffer if performConvolution is enabled VKFFT_ERROR_EMPTY_applicationString = 2013, // loadApplicationString is zero when loadApplicationFromString is enabled VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays = 2014, // pointers to primeSizes or paddedSizes arrays are zero when useCustomBluesteinPaddingPattern is enabled +VKFFT_ERROR_EMPTY_app = 2015, // app pointer is zero VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, // VkFFT has encountered unsupported radix (more than 13) during decomposition and Bluestein's FFT fallback did not work VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002, // VkFFT can not do this sequence length currently - it requires mor than three-upload Four step FFT VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, // VkFFT can not do this sequence length currently - odd multi-upload R2C/C2R FFTs -VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004, // VkFFT can not do this sequence length currently - multi-upload R2R transforms, odd DCT-IV transforms +VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004, // VkFFT can not do this sequence length currently - multi-upload R2R transforms VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005, // VkFFT can not omit sequences in convolution calculations and R2C/C2R case VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001, // VkFFT failed to allocate GPU memory VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002, // 4002-4052 are handlers for errors of used backend APIs. They may indicate a driver failure. If they are thrown - report to the GitHub repo @@ -989,7 +1046,7 @@ \subsection{VkFFT application management functions} one will go into how to configure VkFFTConfiguration correctly. All of the functions operate on VkFFTApplication and VkFFTConfiguration assuming they have been zero-initialized before usage, so do not forget -to do this when initializing: +to do this when initializing (otherwise VkFFT will throw an error): \begin{mdframed}[backgroundcolor=bg] \begin{minted}[tabsize=4,obeytabs,breaklines]{C} @@ -1107,29 +1164,30 @@ \subsection{VkFFT configuration} This section will cover all the parameters that can be specified in the VkFFTConfiguration struct. It will start with a short description of the struct (intended to be used as a cheat sheet), then go for -each field in detail. +each field in detail. Here pfUINT and pfINT are defined as uint64\_t +and int64\_t respectively. 
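For reference, these aliases are introduced in the vkFFT.h changes at the end of this diff; only the relevant defines are excerpted here:

\begin{mdframed}[backgroundcolor=bg]
\begin{minted}[tabsize=4,obeytabs,breaklines]{C}
#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128
#define pfLD __float128 /* CPU base type when double-double support is compiled in */
#else
#define pfLD long double
#endif
#define pfUINT uint64_t
#define pfINT int64_t
\end{minted}
\end{mdframed}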
\begin{mdframed}[backgroundcolor=bg] \begin{minted}[tabsize=4,obeytabs,breaklines]{C} typedef struct { // Required parameters: -uint64_t FFTdim; // FFT dimensionality (1, 2 or 3) -uint64_t size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD+ - system dimensions +pfUINT FFTdim; // FFT dimensionality (1, 2 or 3) +pfUINT size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD+ - system dimensions #if(VKFFT_BACKEND==0) //Vulkan API VkPhysicalDevice* physicalDevice; // Pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices VkDevice* device; // Pointer to Vulkan device, created with vkCreateDevice VkQueue* queue; // Pointer to Vulkan queue, created with vkGetDeviceQueue VkCommandPool* commandPool; // Pointer to Vulkan command pool, created with vkCreateCommandPool VkFence* fence; // Pointer to Vulkan fence, created with vkCreateFence -uint64_t isCompilerInitialized; // Specify if glslang compiler has been intialized before (0 - off, 1 - on). Default 0 +pfUINT isCompilerInitialized; // Specify if glslang compiler has been intialized before (0 - off, 1 - on). Default 0 #elif(VKFFT_BACKEND==1) //CUDA API -CUdevice* device; // Pointer to CUDA device, obtained from cuDeviceGet +CUdevice* device; // Pointer to CUDA device, obtained from cuDeviceGet cudaStream_t* stream; // Pointer to streams (can be more than 1), where to execute the kernels. Deafult 0 -uint64_t num_streams; // Try to submit CUDA kernels in multiple streams for asynchronous execution. Default 1 +pfUINT num_streams; // Try to submit CUDA kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer. #elif(VKFFT_BACKEND==2) //HIP API hipDevice_t* device; // Pointer to HIP device, obtained from hipDeviceGet hipStream_t* stream; // Pointer to streams (can be more than 1), where to execute the kernels. Deafult 0 -uint64_t num_streams; // Try to submit HIP kernels in multiple streams for asynchronous execution. Default 1 +pfUINT num_streams; // Try to submit HIP kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer. #elif(VKFFT_BACKEND==3) //OpenCL API cl_platform_id* platform; // NOT REQUIRED cl_device_id* device; // Pointer to OpenCL device, obtained from clGetDeviceIDs @@ -1145,17 +1203,17 @@ \subsection{VkFFT configuration} #endif // Data parameters (buffers can be specified at launch): -uint64_t userTempBuffer; // Buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on) -uint64_t bufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1 -uint64_t tempBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation -uint64_t inputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled -uint64_t outputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled -uint64_t kernelNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled -uint64_t* bufferSize; // Array of buffers sizes in bytes -uint64_t* tempBufferSize; // Array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. 
Setting to non zero value enables manual user allocation -uint64_t* inputBufferSize; // Array of input buffers sizes in bytes, if isInputFormatted is enabled -uint64_t* outputBufferSize; // Array of output buffers sizes in bytes, if isOutputFormatted is enabled -uint64_t* kernelSize; // Array of kernel buffers sizes in bytes, if performConvolution is enabled +pfUINT userTempBuffer; // Buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on) +pfUINT bufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1 +pfUINT tempBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation +pfUINT inputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled +pfUINT outputBufferNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled +pfUINT kernelNum; // Multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled +pfUINT* bufferSize; // Array of buffers sizes in bytes +pfUINT* tempBufferSize; // Array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation +pfUINT* inputBufferSize; // Array of input buffers sizes in bytes, if isInputFormatted is enabled +pfUINT* outputBufferSize; // Array of output buffers sizes in bytes, if isOutputFormatted is enabled +pfUINT* kernelSize; // Array of kernel buffers sizes in bytes, if performConvolution is enabled #if(VKFFT_BACKEND==0) //Vulkan API VkBuffer* buffer; // Pointer to array of buffers (or one buffer) used for computations VkBuffer* tempBuffer; // Needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation @@ -1193,116 +1251,124 @@ \subsection{VkFFT configuration} MTL::Buffer** outputBuffer; // Pointer to device buffer used to read data from if isOutputFormatted is enabled MTL::Buffer** kernel; // Pointer to device buffer used to read kernel data from if performConvolution is enabled #endif -uint64_t bufferOffset; // Specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 -uint64_t tempBufferOffset; // Specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 -uint64_t inputBufferOffset; // Specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 -uint64_t outputBufferOffset; // Specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 -uint64_t kernelOffset; // Specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0 -uint64_t specifyOffsetsAtLaunch; // Specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0 +pfUINT bufferOffset; // Specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 +pfUINT tempBufferOffset; // Specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. 
Default 0 +pfUINT inputBufferOffset; // Specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 +pfUINT outputBufferOffset; // Specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 +pfUINT kernelOffset; // Specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0 +pfUINT specifyOffsetsAtLaunch; // Specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0 // Optional: (default 0 if not stated otherwise) -uint64_t coalescedMemory; // In bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Going to work regardless, but if specified by user correctly, the performance will be higher. -uint64_t aimThreads; // Aim at this many threads per block. Default 128 -uint64_t numSharedBanks; // How many banks shared memory has. Default 32 -uint64_t inverseReturnToInputBuffer; // return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled -uint64_t numberBatches; // N - used to perform multiple batches of initial data. Default 1 -uint64_t useUint64; // Use 64-bit addressing mode in generated kernels -uint64_t omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C first axis for now. Doesn't work with convolutions. -uint64_t performBandwidthBoost; // Try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise - -uint64_t doublePrecision; // Perform calculations in double precision (0 - off, 1 - on). -uint64_t halfPrecision; // Perform calculations in half precision (0 - off, 1 - on) -uint64_t halfPrecisionMemoryOnly; // Use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out-of-place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on) -uint64_t doublePrecisionFloatMemory; // Use FP64 precision for all calculations, while all memory storage is done in FP32. -uint64_t performR2C; // Perform R2C/C2R decomposition (0 - off, 1 - on) -uint64_t performDCT; // Perform DCT transformation (X - DCT type, 1-4) -uint64_t disableMergeSequencesR2C; // Disable merging of two real sequences to reduce calculations (0 - off, 1 - on) -uint64_t normalize; // Normalize inverse transform (0 - off, 1 - on) -uint64_t disableReorderFourStep; // Disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on) -uint64_t useLUT; // Switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine -uint64_t makeForwardPlanOnly; // Generate code only for forward FFT (0 - off, 1 - on) -uint64_t makeInversePlanOnly; // Generate code only for inverse FFT (0 - off, 1 - on) -uint64_t bufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Buffer strides - default set to x - x*y - x*y*z values -uint64_t isInputFormatted; // Specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) -uint64_t isOutputFormatted; // Specify if output buffer is padded - 0 - padded, 1 - not padded. 
For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) -uint64_t inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values -uint64_t outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values -uint64_t considerAllAxesStrided; // Will create plan for non-strided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on) -uint64_t keepShaderCode; // Will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on) -uint64_t printMemoryLayout; // Will print order of buffers used in shaders (0 - off, 1 - on) - -uint64_t saveApplicationToString; // Will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). VkFFTApplication.applicationStringSize will contain size of binary in bytes. Currently disabled in Metal backend. (0 - off, 1 - on) -uint64_t loadApplicationFromString; // Will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). Currently disabled in Metal backend. (0 - off, 1 - on). Mutually exclusive with saveApplicationToString +pfUINT coalescedMemory; // In bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Going to work regardless, but if specified by user correctly, the performance will be higher. +pfUINT aimThreads; // Aim at this many threads per block. Default 128 +pfUINT numSharedBanks; // How many banks shared memory has. Default 32 +pfUINT inverseReturnToInputBuffer; // return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled +pfUINT numberBatches; // N - used to perform multiple batches of initial data. Default 1 +pfUINT useUint64; // Use 64-bit addressing mode in generated kernels +pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C first axis for now. Doesn't work with convolutions. +pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), 0 otherwise + +pfUINT doublePrecision; // Perform calculations in double precision (0 - off, 1 - on). +pfUINT quadDoubleDoublePrecision; // Perform calculations in double-double emulation of quad precision (0 - off, 1 - on). +pfUINT quadDoubleDoublePrecisionDoubleMemory; // Perform calculations in double-double emulation of quad precision, while all memory storage is done in FP64. +pfUINT halfPrecision; // Perform calculations in half precision (0 - off, 1 - on) +pfUINT halfPrecisionMemoryOnly; // Use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out-of-place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on) +pfUINT doublePrecisionFloatMemory; // Use FP64 precision for all calculations, while all memory storage is done in FP32. 
+ +pfUINT performR2C; // Perform R2C/C2R decomposition (0 - off, 1 - on) +pfUINT performDCT; // Perform DCT transformation (X - DCT type, 1-4) +pfUINT performDST; // Perform DST transformation (X - DST type, 1-4) +pfUINT disableMergeSequencesR2C; // Disable merging of two real sequences to reduce calculations (0 - off, 1 - on) +pfUINT normalize; // Normalize inverse transform (0 - off, 1 - on) +pfUINT disableReorderFourStep; // Disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on) +pfINT useLUT; // Switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine +pfINT useLUT_4step; // Switches from calculating sincos to using precomputed LUT tables for intermediate roots of 1 in the Four-step FFT algorithm. (-1 - off, 0 - auto, 1 - on). Configured by initialization routine +pfUINT makeForwardPlanOnly; // Generate code only for forward FFT (0 - off, 1 - on) +pfUINT makeInversePlanOnly; // Generate code only for inverse FFT (0 - off, 1 - on) +pfUINT bufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Buffer strides - default set to x - x*y - x*y*z values +pfUINT isInputFormatted; // Specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) +pfUINT isOutputFormatted; // Specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) +pfUINT inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values +pfUINT outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS]; // Output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values +pfUINT swapTo2Stage4Step; // Specify at which number to switch from 1 upload to 2 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. +pfUINT swapTo3Stage4Step; // Specify at which power of 2 to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 17 +pfUINT considerAllAxesStrided; // Will create plan for non-strided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on) + +pfUINT keepShaderCode; // Will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on) +pfUINT printMemoryLayout; // Will print order of buffers used in shaders (0 - off, 1 - on) + +pfUINT saveApplicationToString; // Will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). VkFFTApplication.applicationStringSize will contain size of binary in bytes. Currently disabled in Metal backend. (0 - off, 1 - on) +pfUINT loadApplicationFromString; // Will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). Currently disabled in Metal backend. (0 - off, 1 - on). 
Mutually exclusive with saveApplicationToString void* loadApplicationString; // Memory binary array through which user can load VkFFT binaries, must be provided by user if loadApplicationFromString = 1. Use rb/wb flags to load/save. -uint64_t disableSetLocale; // disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0. +pfUINT disableSetLocale; // disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0. -//optional Bluestein optimizations: (default 0 if not stated otherwise) -uint64_t fixMaxRadixBluestein; // controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13. -uint64_t forceBluesteinSequenceSize; // force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13. -uint64_t useCustomBluesteinPaddingPattern; // force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc. paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern. -uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern -uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern +// Optional Bluestein optimizations: (default 0 if not stated otherwise) +pfUINT fixMaxRadixBluestein; // controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13. +pfUINT forceBluesteinSequenceSize; // force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13. +pfUINT useCustomBluesteinPaddingPattern; // force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc. paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern. +pfUINT* primeSizes; // described in useCustomBluesteinPaddingPattern +pfUINT* paddedSizes; // described in useCustomBluesteinPaddingPattern -uint64_t fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. 
Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader. -uint64_t fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40) +pfUINT fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader. +pfUINT fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40) -uint64_t fixMinRaderPrimeFFT; // start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Deafult 29 on AMD and 17 on other GPUs. -uint64_t fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory. +pfUINT fixMinRaderPrimeFFT; // start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Deafult 29 on AMD and 17 on other GPUs. +pfUINT fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory. // Optional zero padding control parameters: (default 0 if not stated otherwise) -uint64_t performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // Don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on) -uint64_t fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS]; // Specify start boundary of zero block in the system for each axis -uint64_t fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS]; // Specify end boundary of zero block in the system for each axis -uint64_t frequencyZeroPadding; // Set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding +pfUINT performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // Don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on) +pfUINT fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS]; // Specify start boundary of zero block in the system for each axis +pfUINT fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS]; // Specify end boundary of zero block in the system for each axis +pfUINT frequencyZeroPadding; // Set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding // Optional convolution control parameters: (default 0 if not stated otherwise) -uint64_t performConvolution; // Perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter -uint64_t coordinateFeatures; // C - coordinate, or dimension of features vector. 
In matrix convolution - size of a vector -uint64_t matrixConvolution; // If equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures -uint64_t symmetricKernel; // Specify if kernel in 2x2 or 3x3 matrix convolution is symmetric -uint64_t numberKernels; // N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output -uint64_t kernelConvolution; // Specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation +pfUINT performConvolution; // Perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter +pfUINT coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of a vector +pfUINT matrixConvolution; // If equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures +pfUINT symmetricKernel; // Specify if kernel in 2x2 or 3x3 matrix convolution is symmetric +pfUINT numberKernels; // N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output +pfUINT kernelConvolution; // Specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation // Register overutilization (experimental): (default 0 if not stated otherwise) -uint64_t registerBoost; // Specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Defaults: Nvidia - 4 in Vulkan/OpenCL, 1 in CUDA backend; AMD - 2 if shared memory >= 64KB, else 4 in Vulkan/OpenCL backend, 1 in HIP backend; Intel - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends, 1 in Metal; Default 1 -uint64_t registerBoostNonPow2; // Specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on) -uint64_t registerBoost4Step; // Specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1 +pfUINT registerBoost; // Specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Defaults: Nvidia - 4 in Vulkan/OpenCL, 1 in CUDA backend; AMD - 2 if shared memory >= 64KB, else 4 in Vulkan/OpenCL backend, 1 in HIP backend; Intel - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends, 1 in Metal; Default 1 +pfUINT registerBoostNonPow2; // Specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on) +pfUINT registerBoost4Step; // Specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1 + //not used techniques: -uint64_t swapTo3Stage4Step; // Specify at which power of 2 to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 17 -uint64_t devicePageSize; // In KB, the size of a page on the GPU. 
Setting to 0 disables local buffer split in pages -uint64_t localPageSize; // In KB, the size to split page into if sequence spans multiple devicePageSize pages +pfUINT devicePageSize; // In KB, the size of a page on the GPU. Setting to 0 disables local buffer split in pages +pfUINT localPageSize; // In KB, the size to split page into if sequence spans multiple devicePageSize pages // Automatically filled based on device info (still can be reconfigured by user): -uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device -uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device -uint64_t maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits -uint64_t maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits -uint64_t maxThreadsNum; // Max number of threads from VkPhysicalDeviceLimits -uint64_t sharedMemorySizeStatic; // Available for static allocation shared memory size, in bytes -uint64_t sharedMemorySize; // Available for allocation shared memory size, in bytes -uint64_t sharedMemorySizePow2; // Power of 2 which is less or equal to sharedMemorySize, in bytes -uint64_t warpSize; // Number of threads per warp/wavefront. -uint64_t halfThreads; // Intel fix -uint64_t allocateTempBuffer; // Buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated -uint64_t reorderFourStep; // Unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1. -int64_t maxCodeLength; // Specify how big can be buffer used for code generation (in char). Default 1000000 chars. -int64_t maxTempLength; // Specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number. -uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern -uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow -uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc +pfUINT computeCapabilityMajor; // CUDA/HIP compute capability of the device +pfUINT computeCapabilityMinor; // CUDA/HIP compute capability of the device +pfUINT maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits +pfUINT maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits +pfUINT maxThreadsNum; // Max number of threads from VkPhysicalDeviceLimits +pfUINT sharedMemorySizeStatic; // Available for static allocation shared memory size, in bytes +pfUINT sharedMemorySize; // Available for allocation shared memory size, in bytes +pfUINT sharedMemorySizePow2; // Power of 2 which is less or equal to sharedMemorySize, in bytes +pfUINT warpSize; // Number of threads per warp/wavefront. +pfUINT halfThreads; // Intel fix +pfUINT allocateTempBuffer; // Buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated +pfUINT reorderFourStep; // Unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1. +pfINT maxCodeLength; // Specify how big can be buffer used for code generation (in char). Default 1000000 chars. +pfINT maxTempLength; // Specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. 
If code segfaults for some reason - try increasing this number. +pfUINT autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern +pfUINT useRaderUintLUT; // allocate additional LUT to store g_pow +pfUINT vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc #if(VKFFT_BACKEND==0) //Vulkan API VkDeviceMemory tempBufferDeviceMemory; // Filled at app creation VkCommandBuffer* commandBuffer; // Filled at app execution VkMemoryBarrier* memory_barrier; // Filled at app creation #elif(VKFFT_BACKEND==1) //CUDA API cudaEvent_t* stream_event; // Filled at app creation -uint64_t streamCounter; // Filled at app creation -uint64_t streamID; // Filled at app creation +pfUINT streamCounter; // Filled at app creation +pfUINT streamID; // Filled at app creation #elif(VKFFT_BACKEND==2) //HIP API hipEvent_t* stream_event; // Filled at app creation -uint64_t streamCounter; // Filled at app creation -uint64_t streamID; // Filled at app creation +pfUINT streamCounter; // Filled at app creation +pfUINT streamID; // Filled at app creation #elif(VKFFT_BACKEND==3) //OpenCL API cl_command_queue* commandQueue; // Filled at app creation #elif(VKFFT_BACKEND==4) @@ -1345,7 +1411,8 @@ \subsubsection{Driver API parameters} with the provided device. There is no real benefit in having more than one, however. \item uint64\_t num\_streams - Try to submit CUDA kernels in multiple streams -for asynchronous execution. Default 1 +for asynchronous execution. Default 0, set to >=1 if you pass values +in the stream pointer. \end{itemize} HIP API will need the following information: \begin{itemize} @@ -1355,7 +1422,8 @@ \subsubsection{Driver API parameters} with the provided device. There is no real benefit in having more than one, however. \item uint64\_t num\_streams - Try to submit HIP kernels in multiple streams -for asynchronous execution. Default 1 +for asynchronous execution. Default 0, set to >=1 if you pass values +in the stream pointer. \end{itemize} OpenCL API will need the following information: \begin{itemize} @@ -1473,12 +1541,17 @@ \subsubsection{General FFT parameters } uint64\_t numberBatches - N parameter of the transform. By default, it is set to 1. Optional parameter. -uint64\_t performR2C - perform R2C/C2R decomposition. performDCT must -be set to 0. Default 0, set to 1 to enable. Optional parameter. +uint64\_t performR2C - perform R2C/C2R decomposition. performDCT and +performDST must be set to 0. Default 0, set to 1 to enable. Optional +parameter. -uint64\_t performDCT - perform DCT transformation. performR2C must -be set to 0. Default 0, set to X for DCT-X (currently supported X: -1, 2, 3 and 4). Optional parameter. +uint64\_t performDCT - perform DCT transformation. performR2C and +performDST must be set to 0. Default 0, set to X for DCT-X (currently +supported X: 1, 2, 3 and 4). Optional parameter. + +uint64\_t performDST - perform DST transformation. performR2C and +performDCT must be set to 0. Default 0, set to X for DST-X (currently +supported X: 1, 2, 3 and 4). Optional parameter. uint64\_t normalize - enabling this parameter will make the inverse transform divide the result by the FFT length. Default 0, set to 1 @@ -1497,15 +1570,21 @@ \subsubsection{Precision parameters (and some things that can affect it):} set to 0. This option increases precision, but not that much to be recommended for actual use. Default 0, set to 1 to enable. In Vulkan/OpenCL/Level Zero your device must support double precision functionality. 
Metal -API does not support double precision. Experimental feature. Optional -parameter. +API does not support double precision. Optional parameter. + +uint64\_t quadDoubleDoublePrecision - perform calculations in double-double +emulation of quad precision (0 - off, 1 - on). Optional parameter. +Requires quadmath library (for now). + +uint64\_t quadDoubleDoublePrecisionDoubleMemory - perform calculations +in double-double emulation of quad precision, while all memory storage +is done in FP64. Optional parameter. Requires quadmath library (for +now). uint64\_t halfPrecision - half-precision in VkFFT is implemented only as memory optimization. All calculations are done in single precision (similar way as doublePrecisionFloatMemory works for double and single -precision). Default 0, set to 1 to enable. Works only in Vulkan API -now, experimental feature (half precision seems to have bad precision -for the first FFT element). Optional parameter. +precision). Default 0, set to 1 to enable. Optional parameter. uint64\_t halfPrecisionMemoryOnly - another way of performing half-precision in VkFFT, it will use half-precision only for initial and final memory @@ -1515,8 +1594,7 @@ \subsubsection{Precision parameters (and some things that can affect it):} So, for example, intermediate storage between axes FFTs in the multidimensional case will be done in single precision, as opposed to half-precision in the base halfPrecision case. halfPrecision must be set to 1. Default -0, set to 1 to enable. Works only in Vulkan API now, experimental -feature. Optional parameter. +0, set to 1 to enable. Optional parameter. int64\_t useLUT - switches from calculating sines and cosines (via special function units in single precision or as a polynomial approximation @@ -1569,8 +1647,8 @@ \subsubsection{Advanced parameters (code will work fine without using them)} uint64\_t performBandwidthBoost - try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided -axes. Default: -1(inf) for DCT, 2 for Bluestein's algorithm (or -1 -if DCT), 0 otherwise +axes. Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm +(or -1 if DCT and DST), 0 otherwise uint64\_t disableMergeSequencesR2C - disable the optimization that performs merging of two real sequences to reduce calculations (in @@ -1869,6 +1947,8 @@ \section{VkFFT Benchmark/Precision Suite and utils\_VkFFT helper routines} \item Sample 6 - FFT + iFFT R2C / C2R benchmark, in-place. 
\item Sample 7 - FFT + iFFT C2C Bluestein benchmark in single precision \item Sample 8 - FFT + iFFT C2C Bluestein benchmark in double precision +\item Sample 9 - FFT + iFFT C2C benchmark 1D batched in double-double emulation +of quad precision \item Sample 10 - multiple buffers (4 by default) split version of benchmark 0 \item Sample 11 - VkFFT / xFFT / FFTW C2C precision test in single precision @@ -1886,6 +1966,8 @@ \section{VkFFT Benchmark/Precision Suite and utils\_VkFFT helper routines} in double precision \item Sample 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision test in double precision +\item Sample 19 - VkFFT / FFTW C2C precision test in double-double emulation +of quad precision \item Sample 50 - convolution example with identity kernel \item Sample 51 - zero padding convolution example with identity kernel \item Sample 52 - batched convolution example with identity kernel @@ -1895,10 +1977,14 @@ \section{VkFFT Benchmark/Precision Suite and utils\_VkFFT helper routines} double precision \item Sample 1000 - FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096 -\item Sample 1001 - FFT + iFFT C2C benchmark 1D batched in single precision: +\item Sample 1001 - FFT + iFFT C2C benchmark 1D batched in double precision: +all supported systems from 2 to 4096 +\item Sample 1002 - FFT + iFFT C2C benchmark 1D batched in half precision: all supported systems from 2 to 4096 \item Sample 1003 - FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096 +\item Sample 1004 - FFT + iFFT C2C benchmark 1D batched in double-double +emulation of quad precision: all supported systems from 2 to 4096 \end{itemize} \subsection{utils\_VkFFT helper routines} @@ -2260,6 +2346,7 @@ \subsection{Advanced FFT application example: ND, C2C/R2C/R2R, different precisi configuration.performDCT = DCT; if (P == 1) configuration.doublePrecision = 1; if (P == 2) configuration.halfPrecision = 1; +if (P == 3) configuration.quadDoubleDoublePrecision = 1; uint64_t bufferSize = 0; diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h index 98af2986..493c5fc2 100644 --- a/vkFFT/vkFFT.h +++ b/vkFFT/vkFFT.h @@ -71,12 +71,42 @@ #define VKFFT_MAX_FFT_DIMENSIONS 4 #endif +#ifdef VKFFT_USE_DOUBLEDOUBLE_FP128 + +#define VKFFT_USE_QUADMATH_FP128 // for now the only implementation, but defining these functions as mpfr should also be possible +#include <quadmath.h> +//#define pfQ __float128 +#define pfLD __float128 +#define pfUINT uint64_t +#define pfINT int64_t +#define pfsin sinq +#define pfcos cosq +#define pfceil ceilq +#define pffloor floorq +#define pfsqrt sqrtq + +#define pfFPinit(x) strtoflt128(x, 0) + +#else +#define pfLD long double +#define pfUINT uint64_t +#define pfINT int64_t +#define pfsin sin +#define pfcos cos +#define pfceil ceil +#define pffloor floor +#define pfsqrt sqrt + +#define pfFPinit(x) strtold(x, 0) + +#endif + #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_RunApp.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" static inline int VkFFTGetVersion() { - return 10301; //X.XX.XX format + return 10302; //X.XX.XX format } -#endif \ No newline at end of file +#endif diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h index d082c48d..34a34772 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h @@ -34,7 +34,7
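A minimal host-side sketch of enabling the new quad double-double mode (assuming the same VkFFTConfiguration object as in the advanced example above, and a build linked against the quadmath library as noted in the parameter description):

configuration.quadDoubleDoublePrecision = 1;                 // compute and store in double-double
// or, to keep buffers in FP64 while computing in double-double:
// configuration.quadDoubleDoublePrecisionDoubleMemory = 1;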
@@ static inline void deleteVkFFT(VkFFTApplication* app) { #elif(VKFFT_BACKEND==1) if (app->configuration.num_streams > 1) { cudaError_t res_t = cudaSuccess; - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { + for (pfUINT i = 0; i < app->configuration.num_streams; i++) { if (app->configuration.stream_event[i] != 0) { res_t = cudaEventDestroy(app->configuration.stream_event[i]); if (res_t == cudaSuccess) app->configuration.stream_event[i] = 0; @@ -48,7 +48,7 @@ static inline void deleteVkFFT(VkFFTApplication* app) { #elif(VKFFT_BACKEND==2) if (app->configuration.num_streams > 1) { hipError_t res_t = hipSuccess; - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { + for (pfUINT i = 0; i < app->configuration.num_streams; i++) { if (app->configuration.stream_event[i] != 0) { res_t = hipEventDestroy(app->configuration.stream_event[i]); if (res_t == hipSuccess) app->configuration.stream_event[i] = 0; @@ -61,7 +61,7 @@ static inline void deleteVkFFT(VkFFTApplication* app) { } #endif if (app->numRaderFFTPrimes) { - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { + for (pfUINT i = 0; i < app->numRaderFFTPrimes; i++) { free(app->raderFFTkernel[i]); app->raderFFTkernel[i] = 0; } @@ -117,9 +117,9 @@ static inline void deleteVkFFT(VkFFTApplication* app) { app->configuration.tempBufferSize = 0; } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->configuration.useRaderUintLUT) { - for (uint64_t j = 0; j < 4; j++) { + for (pfUINT j = 0; j < 4; j++) { if (app->bufferRaderUintLUT[i][j]) { #if(VKFFT_BACKEND==0) vkDestroyBuffer(app->configuration.device[0], app->bufferRaderUintLUT[i][j], 0); @@ -255,14 +255,19 @@ static inline void deleteVkFFT(VkFFTApplication* app) { } if (!app->configuration.makeInversePlanOnly) { if (app->localFFTPlan != 0) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->localFFTPlan->numAxisUploads[i] > 0) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) - deleteAxis(app, &app->localFFTPlan->axes[i][j]); + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) + deleteAxis(app, &app->localFFTPlan->axes[i][j], 0); + } + if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { + deleteAxis(app, &app->localFFTPlan->inverseBluesteinAxes[i][j], 1); + } } } if (app->localFFTPlan->multiUploadR2C) { - deleteAxis(app, &app->localFFTPlan->R2Cdecomposition); + deleteAxis(app, &app->localFFTPlan->R2Cdecomposition, 0); } if (app->localFFTPlan != 0) { free(app->localFFTPlan); @@ -272,14 +277,19 @@ static inline void deleteVkFFT(VkFFTApplication* app) { } if (!app->configuration.makeForwardPlanOnly) { if (app->localFFTPlan_inverse != 0) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->localFFTPlan_inverse->numAxisUploads[i] > 0) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) - deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j]); + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) + deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j], 0); + } + if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + deleteAxis(app, 
&app->localFFTPlan_inverse->inverseBluesteinAxes[i][j], 1); + } } } if (app->localFFTPlan_inverse->multiUploadR2C) { - deleteAxis(app, &app->localFFTPlan_inverse->R2Cdecomposition); + deleteAxis(app, &app->localFFTPlan_inverse->R2Cdecomposition, 0); } if (app->localFFTPlan_inverse != 0) { free(app->localFFTPlan_inverse); @@ -292,7 +302,7 @@ static inline void deleteVkFFT(VkFFTApplication* app) { free(app->saveApplicationString); app->saveApplicationString = 0; } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->applicationBluesteinString[i] != 0) { free(app->applicationBluesteinString[i]); app->applicationBluesteinString[i] = 0; @@ -309,5 +319,6 @@ static inline void deleteVkFFT(VkFFTApplication* app) { app->configuration.paddedSizes = 0; } } + memset(app, 0, sizeof(VkFFTApplication)); } -#endif \ No newline at end of file +#endif diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h index afec1b90..278f2b86 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h @@ -34,7 +34,7 @@ static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) if (!app->configuration.useCustomBluesteinPaddingPattern) { switch (app->configuration.vendorID) { case 0x10DE://NVIDIA - if (app->configuration.doublePrecision) { + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { app->configuration.autoCustomBluesteinPaddingPattern = 48; } else { @@ -42,7 +42,7 @@ static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) } break; default: //have not done a test run for Intel, so everything else uses AMD profile - if (app->configuration.doublePrecision) { + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { app->configuration.autoCustomBluesteinPaddingPattern = 54; } else { @@ -50,13 +50,13 @@ static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) } break; } - app->configuration.primeSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); + app->configuration.primeSizes = (pfUINT*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(pfUINT)); if (!app->configuration.primeSizes) return VKFFT_ERROR_MALLOC_FAILED; - app->configuration.paddedSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); + app->configuration.paddedSizes = (pfUINT*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(pfUINT)); if (!app->configuration.paddedSizes) return VKFFT_ERROR_MALLOC_FAILED; switch (app->configuration.vendorID) { case 0x10DE://Nvidia - if (app->configuration.doublePrecision) { + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; @@ -248,7 +248,7 @@ static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) } break; default: //have not done a test run for 
Intel, so everything else uses AMD profile - if (app->configuration.doublePrecision) { + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; @@ -425,9 +425,13 @@ static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) } static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration) { VkFFTResult resFFT = VKFFT_SUCCESS; - //app->configuration = {};// inputLaunchConfiguration; + //app->configuration = {};// inputLaunchConfiguration; if (inputLaunchConfiguration.doublePrecision != 0) app->configuration.doublePrecision = inputLaunchConfiguration.doublePrecision; if (inputLaunchConfiguration.doublePrecisionFloatMemory != 0) app->configuration.doublePrecisionFloatMemory = inputLaunchConfiguration.doublePrecisionFloatMemory; + + if (inputLaunchConfiguration.quadDoubleDoublePrecision != 0) app->configuration.quadDoubleDoublePrecision = inputLaunchConfiguration.quadDoubleDoublePrecision; + if (inputLaunchConfiguration.quadDoubleDoublePrecisionDoubleMemory != 0) app->configuration.quadDoubleDoublePrecisionDoubleMemory = inputLaunchConfiguration.quadDoubleDoublePrecisionDoubleMemory; + if (inputLaunchConfiguration.halfPrecision != 0) app->configuration.halfPrecision = inputLaunchConfiguration.halfPrecision; if (inputLaunchConfiguration.halfPrecisionMemoryOnly != 0) app->configuration.halfPrecisionMemoryOnly = inputLaunchConfiguration.halfPrecisionMemoryOnly; if (inputLaunchConfiguration.useCustomBluesteinPaddingPattern != 0) { @@ -484,14 +488,14 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.maxComputeWorkGroupSize[2] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2]; //if ((physicalDeviceProperties.vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; app->configuration.sharedMemorySize = physicalDeviceProperties.limits.maxComputeSharedMemorySize; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(physicalDeviceProperties.limits.maxComputeSharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(physicalDeviceProperties.limits.maxComputeSharedMemorySize)); app->configuration.vendorID = physicalDeviceProperties.vendorID; if (inputLaunchConfiguration.pipelineCache != 0) app->configuration.pipelineCache = inputLaunchConfiguration.pipelineCache; app->configuration.useRaderUintLUT = 1; switch (physicalDeviceProperties.vendorID) { case 0x10DE://NVIDIA app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 
1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 4; @@ -505,25 +509,25 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; case 0x1002://AMD app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.warpSize = 64; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 2 : 4; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; default: app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; } #elif(VKFFT_BACKEND==1) @@ -620,7 +624,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf } app->configuration.useLUT_4step = (value <= 4) ? 
-1 : 1; //we don't need this in CUDA - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 0; if (app->configuration.num_streams > 1) { app->configuration.stream_event = (cudaEvent_t*)malloc(app->configuration.num_streams * sizeof(cudaEvent_t)); @@ -628,7 +632,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { + for (pfUINT i = 0; i < app->configuration.num_streams; i++) { res_t = cudaEventCreate(&app->configuration.stream_event[i]); if (res_t != cudaSuccess) { deleteVkFFT(app); @@ -638,11 +642,11 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf } app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 4194305 : 4194305; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 4194305 : 4194305; app->configuration.vendorID = 0x10DE; #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; @@ -726,7 +730,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.warpSize = value; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 0; if (app->configuration.num_streams > 1) { app->configuration.stream_event = (hipEvent_t*)malloc(app->configuration.num_streams * sizeof(hipEvent_t)); @@ -734,7 +738,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { + for (pfUINT i = 0; i < app->configuration.num_streams; i++) { res = hipEventCreate(&app->configuration.stream_event[i]); if (res != hipSuccess) { deleteVkFFT(app); @@ -743,12 +747,12 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf } } app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 
1 : -1; app->configuration.useLUT_4step = -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 1048576 : 2097152; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1048576 : 2097152; app->configuration.vendorID = 0x1002; #elif(VKFFT_BACKEND==3) cl_int res = 0; @@ -810,20 +814,20 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.sharedMemorySize = sharedMemorySize; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(sharedMemorySize)); app->configuration.vendorID = vendorID; app->configuration.useRaderUintLUT = 1; switch (vendorID) { case 0x10DE://NVIDIA app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 4; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 4194305 : 4194305; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 4194305 : 4194305; app->configuration.sharedMemorySize -= 0x10;//reserved by system - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(app->configuration.sharedMemorySize)); break; case 0x8086://INTEL app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; @@ -832,25 +836,25 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; case 0x1002://AMD app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 
1 : -1; app->configuration.warpSize = 64; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 2 : 4; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; default: app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; break; } #elif(VKFFT_BACKEND==4) @@ -888,7 +892,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.maxComputeWorkGroupCount[2] = compute_properties.maxGroupCountZ; //if ((vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; app->configuration.sharedMemorySize = compute_properties.maxSharedLocalMemory; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(app->configuration.sharedMemorySize)); app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = 1; @@ -896,7 +900,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (app->configuration.sharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; app->configuration.vendorID = 0x8086; app->configuration.useRaderUintLUT = 1; #elif(VKFFT_BACKEND==5) @@ -948,15 +952,15 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.warpSize = dummy_state->threadExecutionWidth(); - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); + app->configuration.sharedMemorySizePow2 = (pfUINT)pow(2, (pfUINT)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 1; app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64;//the coalesced memory is equal to 64 bytes between L2 and VRAM. 
- app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; + app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 1 : -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; + app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) ? 262144 : 524288; app->configuration.vendorID = 0x1027f00; dummy_state->release(); @@ -987,7 +991,9 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf deleteVkFFT(app); return VKFFT_ERROR_EMPTY_size; } - + app->configuration.isInputFormatted = inputLaunchConfiguration.isInputFormatted; + app->configuration.isOutputFormatted = inputLaunchConfiguration.isOutputFormatted; + app->configuration.size[0] = inputLaunchConfiguration.size[0]; if (inputLaunchConfiguration.bufferStride[0] == 0) { @@ -1000,7 +1006,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.bufferStride[0] = inputLaunchConfiguration.bufferStride[0]; if (inputLaunchConfiguration.inputBufferStride[0] == 0) { - if (inputLaunchConfiguration.performR2C) + if (inputLaunchConfiguration.performR2C && (!app->configuration.isInputFormatted)) app->configuration.inputBufferStride[0] = app->configuration.size[0] + 2; else app->configuration.inputBufferStride[0] = app->configuration.size[0]; @@ -1009,14 +1015,14 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.inputBufferStride[0] = inputLaunchConfiguration.inputBufferStride[0]; if (inputLaunchConfiguration.outputBufferStride[0] == 0) { - if (inputLaunchConfiguration.performR2C) + if (inputLaunchConfiguration.performR2C && (!app->configuration.isOutputFormatted)) app->configuration.outputBufferStride[0] = app->configuration.size[0] + 2; else app->configuration.outputBufferStride[0] = app->configuration.size[0]; } else app->configuration.outputBufferStride[0] = inputLaunchConfiguration.outputBufferStride[0]; - for (uint64_t i = 1; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { + for (pfUINT i = 1; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { if (inputLaunchConfiguration.size[i] == 0) app->configuration.size[i] = 1; else @@ -1038,8 +1044,6 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.outputBufferStride[i] = inputLaunchConfiguration.outputBufferStride[i]; } - app->configuration.isInputFormatted = inputLaunchConfiguration.isInputFormatted; - app->configuration.isOutputFormatted = inputLaunchConfiguration.isOutputFormatted; app->configuration.performConvolution = inputLaunchConfiguration.performConvolution; if (inputLaunchConfiguration.bufferNum == 0) app->configuration.bufferNum = 1; @@ -1052,7 +1056,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf #endif app->configuration.bufferSize = inputLaunchConfiguration.bufferSize; if (app->configuration.bufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { if (app->configuration.bufferSize[i] == 
0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_bufferSize; @@ -1074,7 +1078,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf #endif app->configuration.tempBufferSize = inputLaunchConfiguration.tempBufferSize; if (app->configuration.tempBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.tempBufferNum; i++) { if (app->configuration.tempBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_tempBufferSize; @@ -1085,7 +1089,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf } else { app->configuration.tempBufferNum = 1; - app->configuration.tempBufferSize = (uint64_t*)malloc(sizeof(uint64_t)); + app->configuration.tempBufferSize = (pfUINT*)malloc(sizeof(pfUINT)); if (!app->configuration.tempBufferSize) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; @@ -1105,7 +1109,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf #endif app->configuration.inputBufferSize = inputLaunchConfiguration.inputBufferSize; if (app->configuration.inputBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { if (app->configuration.inputBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_inputBufferSize; @@ -1132,7 +1136,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf #endif app->configuration.outputBufferSize = inputLaunchConfiguration.outputBufferSize; if (app->configuration.outputBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { if (app->configuration.outputBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_outputBufferSize; @@ -1158,7 +1162,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf #endif app->configuration.kernelSize = inputLaunchConfiguration.kernelSize; if (app->configuration.kernelSize != 0) { - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { + for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { if (app->configuration.kernelSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_kernelSize; @@ -1175,8 +1179,8 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset; if (inputLaunchConfiguration.specifyOffsetsAtLaunch != 0) app->configuration.specifyOffsetsAtLaunch = inputLaunchConfiguration.specifyOffsetsAtLaunch; //set optional parameters: - uint64_t checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + pfUINT checkBufferSizeFor64BitAddressing = 0; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { if (app->configuration.bufferSize) checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i]; else { @@ -1184,42 +1188,43 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures; if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches; if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels; - if 
(app->configuration.doublePrecision) checkBufferSizeFor64BitAddressing *= 2; + if (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) checkBufferSizeFor64BitAddressing *= 2; + if (app->configuration.quadDoubleDoublePrecision) checkBufferSizeFor64BitAddressing *= 4; } } #if(VKFFT_BACKEND==2) app->configuration.useStrict32BitAddress = 0; - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; #endif - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { if (app->configuration.inputBufferSize) checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i]; } #if(VKFFT_BACKEND==2) - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; #endif - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { if (app->configuration.outputBufferSize) checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i]; } - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { + for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { if (app->configuration.kernelSize) checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i]; } #if(VKFFT_BACKEND==2) - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; // No reason was found to disable strict 32 bit addressing, so enable it if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1; #endif - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64; #if(VKFFT_BACKEND==2) if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress; @@ -1244,46 +1249,61 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* 
app, VkFFTConf } if (app->configuration.useLUT == -1) app->configuration.useLUT_4step = -1; + app->configuration.swapTo2Stage4Step = app->configuration.swapTo3Stage4Step; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory){ + app->configuration.useLUT_4step = 1; + app->configuration.useLUT = 1; + app->configuration.swapTo3Stage4Step = 524288; + } if (inputLaunchConfiguration.fixMaxRadixBluestein != 0) app->configuration.fixMaxRadixBluestein = inputLaunchConfiguration.fixMaxRadixBluestein; if (inputLaunchConfiguration.forceBluesteinSequenceSize != 0) app->configuration.forceBluesteinSequenceSize = inputLaunchConfiguration.forceBluesteinSequenceSize; - app->configuration.fixMinRaderPrimeMult = 17; - switch (app->configuration.vendorID) { - case 0x10DE://NVIDIA - app->configuration.fixMaxRaderPrimeMult = 89; - break; - case 0x1002://AMD profile - app->configuration.fixMaxRaderPrimeMult = 89; - break; - default: - app->configuration.fixMaxRaderPrimeMult = 17; - break; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory){ + app->configuration.fixMinRaderPrimeMult = 11; + app->configuration.fixMaxRaderPrimeMult = 29; + } else{ + app->configuration.fixMinRaderPrimeMult = 17; + switch (app->configuration.vendorID) { + case 0x10DE://NVIDIA + app->configuration.fixMaxRaderPrimeMult = 89; + break; + case 0x1002://AMD profile + app->configuration.fixMaxRaderPrimeMult = 89; + break; + default: + app->configuration.fixMaxRaderPrimeMult = 17; + break; + } + if (inputLaunchConfiguration.fixMinRaderPrimeMult != 0) app->configuration.fixMinRaderPrimeMult = inputLaunchConfiguration.fixMinRaderPrimeMult; } - if (inputLaunchConfiguration.fixMinRaderPrimeMult != 0) app->configuration.fixMinRaderPrimeMult = inputLaunchConfiguration.fixMinRaderPrimeMult; if (inputLaunchConfiguration.fixMaxRaderPrimeMult != 0) app->configuration.fixMaxRaderPrimeMult = inputLaunchConfiguration.fixMaxRaderPrimeMult; switch (app->configuration.vendorID) { case 0x1002://AMD profile - if (app->configuration.doublePrecision) - app->configuration.fixMinRaderPrimeFFT = 29; - else - app->configuration.fixMinRaderPrimeFFT = 17; - break; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) + app->configuration.fixMinRaderPrimeFFT = 19; + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) + app->configuration.fixMinRaderPrimeFFT = 29; + else + app->configuration.fixMinRaderPrimeFFT = 17; + break; default: - app->configuration.fixMinRaderPrimeFFT = 17; - break; + app->configuration.fixMinRaderPrimeFFT = 17; + break; } app->configuration.fixMaxRaderPrimeFFT = 16384; if (inputLaunchConfiguration.fixMinRaderPrimeFFT != 0) app->configuration.fixMinRaderPrimeFFT = inputLaunchConfiguration.fixMinRaderPrimeFFT; if (inputLaunchConfiguration.fixMaxRaderPrimeFFT != 0) app->configuration.fixMaxRaderPrimeFFT = inputLaunchConfiguration.fixMaxRaderPrimeFFT; - if (inputLaunchConfiguration.performR2C != 0) { app->configuration.performR2C = inputLaunchConfiguration.performR2C; } if (inputLaunchConfiguration.performDCT != 0) { app->configuration.performDCT = inputLaunchConfiguration.performDCT; } + if (inputLaunchConfiguration.performDST != 0) { + app->configuration.performDST = inputLaunchConfiguration.performDST; + } if (inputLaunchConfiguration.disableMergeSequencesR2C != 0) { app->configuration.disableMergeSequencesR2C = 
inputLaunchConfiguration.disableMergeSequencesR2C; } @@ -1296,10 +1316,10 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.reorderFourStep = 1; if (inputLaunchConfiguration.disableReorderFourStep != 0) { app->configuration.reorderFourStep = 0; - if (app->configuration.swapTo3Stage4Step < 1048576) app->configuration.swapTo3Stage4Step = 1048576; + if ((app->configuration.swapTo3Stage4Step < 1048576) && (!app->configuration.quadDoubleDoublePrecision) && (!app->configuration.quadDoubleDoublePrecisionDoubleMemory)) app->configuration.swapTo3Stage4Step = 1048576; } if (inputLaunchConfiguration.frequencyZeroPadding != 0) app->configuration.frequencyZeroPadding = inputLaunchConfiguration.frequencyZeroPadding; - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (inputLaunchConfiguration.performZeropadding[i] != 0) { app->configuration.performZeropadding[i] = inputLaunchConfiguration.performZeropadding[i]; app->configuration.fft_zeropad_left[i] = inputLaunchConfiguration.fft_zeropad_left[i]; @@ -1393,11 +1413,15 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf if (inputLaunchConfiguration.useRaderUintLUT != 0) app->configuration.useRaderUintLUT = inputLaunchConfiguration.useRaderUintLUT; if (inputLaunchConfiguration.halfThreads != 0) app->configuration.halfThreads = inputLaunchConfiguration.halfThreads; + if (inputLaunchConfiguration.swapTo2Stage4Step != 0) app->configuration.swapTo2Stage4Step = inputLaunchConfiguration.swapTo2Stage4Step; if (inputLaunchConfiguration.swapTo3Stage4Step != 0) app->configuration.swapTo3Stage4Step = inputLaunchConfiguration.swapTo3Stage4Step; - if (app->configuration.performDCT > 0) app->configuration.performBandwidthBoost = -1; + if ((app->configuration.performDCT > 0) || (app->configuration.performDST > 0)) app->configuration.performBandwidthBoost = -1; if (inputLaunchConfiguration.performBandwidthBoost != 0) app->configuration.performBandwidthBoost = inputLaunchConfiguration.performBandwidthBoost; - - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { +#if(VKFFT_BACKEND==0) + if (inputLaunchConfiguration.stagingBuffer != 0) app->configuration.stagingBuffer = inputLaunchConfiguration.stagingBuffer; + if (inputLaunchConfiguration.stagingBufferMemory != 0) app->configuration.stagingBufferMemory = inputLaunchConfiguration.stagingBufferMemory; +#endif + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (inputLaunchConfiguration.groupedBatch[i] != 0) app->configuration.groupedBatch[i] = inputLaunchConfiguration.groupedBatch[i]; } @@ -1422,21 +1446,28 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf deleteVkFFT(app); return VKFFT_ERROR_EMPTY_applicationString; } - memcpy(&app->applicationStringSize, app->configuration.loadApplicationString, sizeof(uint64_t)); - memcpy(&app->applicationStringOffsetRader, (char*)app->configuration.loadApplicationString + 2 * sizeof(uint64_t), sizeof(uint64_t)); - app->currentApplicationStringPos = 5 * sizeof(uint64_t); + memcpy(&app->applicationStringSize, app->configuration.loadApplicationString, sizeof(pfUINT)); + memcpy(&app->applicationStringOffsetRader, (char*)app->configuration.loadApplicationString + 2 * sizeof(pfUINT), sizeof(pfUINT)); + app->currentApplicationStringPos = 5 * sizeof(pfUINT); } //temporary set: app->configuration.registerBoost4Step = 1; #if(VKFFT_BACKEND==0) app->configuration.useUint64 = 0; //No physical 
addressing mode in Vulkan shaders. Use multiple-buffer support to achieve emulation of physical addressing. #endif - //uint64_t initSharedMemory = app->configuration.sharedMemorySize; + //pfUINT initSharedMemory = app->configuration.sharedMemorySize; return resFFT; } static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration) { VkFFTResult resFFT = VKFFT_SUCCESS; + unsigned char *test = (unsigned char*)app; + if (app == 0){ + return VKFFT_ERROR_EMPTY_app; + } + if (memcmp(test, test + 1, sizeof(VkFFTApplication) - 1) != 0){ + return VKFFT_ERROR_NONZERO_APP_INITIALIZATION; + } resFFT = setConfigurationVkFFT(app, inputLaunchConfiguration); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); @@ -1446,15 +1477,15 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat if (!app->configuration.makeForwardPlanOnly) { app->localFFTPlan_inverse = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); if (app->localFFTPlan_inverse) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory; resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, (int)i); if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT memset(app->localFFTPlan_inverse, 0, sizeof(VkFFTPlan)); - uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; + pfUINT temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; - uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; + pfUINT temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, (int)i); app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; @@ -1465,14 +1496,14 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat return resFFT; } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { app->localFFTPlan_inverse->inverseBluesteinAxes[i][j] = app->localFFTPlan_inverse->axes[i][j]; } } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
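/* Hypothetical caller-side sketch of what the new checks above imply: initializeVkFFT must be
   handed a zero-filled VkFFTApplication, and deleteVkFFT (which now memsets the struct back to
   zero) leaves it ready to be reinitialized. */
VkFFTApplication app = {};                /* or memset(&app, 0, sizeof(app)); */
VkFFTConfiguration configuration = {};    /* fill FFTdim, size, buffers, ... before initialization */
VkFFTResult resFFT = initializeVkFFT(&app, configuration);
/* ... append and execute the plan ... */
deleteVkFFT(&app);                        /* struct is zeroed again and may be reused */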
app->configuration.sharedMemorySizePow2 : initSharedMemory; - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); @@ -1480,7 +1511,7 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat } } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 1); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); @@ -1505,15 +1536,15 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat if (!app->configuration.makeInversePlanOnly) { app->localFFTPlan = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); if (app->localFFTPlan) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory; resFFT = VkFFTScheduler(app, app->localFFTPlan, (int)i); if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT memset(app->localFFTPlan, 0, sizeof(VkFFTPlan)); - uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; + pfUINT temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; - uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; + pfUINT temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; resFFT = VkFFTScheduler(app, app->localFFTPlan, (int)i); app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; @@ -1524,14 +1555,14 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat return resFFT; } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { app->localFFTPlan->inverseBluesteinAxes[i][j] = app->localFFTPlan->axes[i][j]; } } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); @@ -1539,7 +1570,7 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat } } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 1); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); @@ -1561,7 +1592,7 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat return VKFFT_ERROR_MALLOC_FAILED; } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { if (!app->configuration.makeInversePlanOnly) resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan, i); @@ -1575,45 +1606,45 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat } if (inputLaunchConfiguration.saveApplicationToString != 0) { - uint64_t totalBinarySize = 5 * sizeof(uint64_t); + pfUINT totalBinarySize = 5 * sizeof(pfUINT); if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan_inverse->axes[i][j].binarySize + sizeof(uint64_t); + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + totalBinarySize += app->localFFTPlan_inverse->axes[i][j].binarySize + sizeof(pfUINT); } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + totalBinarySize += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize + sizeof(pfUINT); } } if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { - totalBinarySize += app->localFFTPlan_inverse->R2Cdecomposition.binarySize + sizeof(uint64_t); + totalBinarySize += app->localFFTPlan_inverse->R2Cdecomposition.binarySize + sizeof(pfUINT); } } } if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan->axes[i][j].binarySize + sizeof(uint64_t); + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { + totalBinarySize += app->localFFTPlan->axes[i][j].binarySize + sizeof(pfUINT); } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { + totalBinarySize += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize + sizeof(pfUINT); } } if 
((app->localFFTPlan->multiUploadR2C) && (i == 0)) { - totalBinarySize += app->localFFTPlan->R2Cdecomposition.binarySize + sizeof(uint64_t); + totalBinarySize += app->localFFTPlan->R2Cdecomposition.binarySize + sizeof(pfUINT); } } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { totalBinarySize += app->applicationBluesteinStringSize[i]; } } if (app->numRaderFFTPrimes > 0) { app->applicationStringOffsetRader = totalBinarySize; - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { + for (pfUINT i = 0; i < app->numRaderFFTPrimes; i++) { totalBinarySize += app->rader_buffer_size[i]; } } @@ -1624,70 +1655,70 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat } app->applicationStringSize = totalBinarySize; char* localApplicationStringCast = (char*)app->saveApplicationString; - memcpy(localApplicationStringCast, &totalBinarySize, sizeof(uint64_t)); - memcpy(localApplicationStringCast + 2, &app->applicationStringOffsetRader, sizeof(uint64_t)); - uint64_t currentPos = 5 * sizeof(uint64_t); + memcpy(localApplicationStringCast, &totalBinarySize, sizeof(pfUINT)); + memcpy(localApplicationStringCast + 2 * sizeof(pfUINT), &app->applicationStringOffsetRader, sizeof(pfUINT)); + pfUINT currentPos = 5 * sizeof(pfUINT); if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->axes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->axes[i][j].binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->axes[i][j].binary, app->localFFTPlan_inverse->axes[i][j].binarySize); currentPos += app->localFFTPlan_inverse->axes[i][j].binarySize; } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binary, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize); currentPos += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize; } } if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->R2Cdecomposition.binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->R2Cdecomposition.binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->R2Cdecomposition.binary, 
app->localFFTPlan_inverse->R2Cdecomposition.binarySize); currentPos += app->localFFTPlan_inverse->R2Cdecomposition.binarySize; } } } if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->axes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->axes[i][j].binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->axes[i][j].binary, app->localFFTPlan->axes[i][j].binarySize); currentPos += app->localFFTPlan->axes[i][j].binarySize; } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->inverseBluesteinAxes[i][j].binary, app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize); currentPos += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize; } } if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->R2Cdecomposition.binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); + memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->R2Cdecomposition.binarySize, sizeof(pfUINT)); + currentPos += sizeof(pfUINT); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->R2Cdecomposition.binary, app->localFFTPlan->R2Cdecomposition.binarySize); currentPos += app->localFFTPlan->R2Cdecomposition.binarySize; } } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { memcpy(localApplicationStringCast + currentPos, app->applicationBluesteinString[i], app->applicationBluesteinStringSize[i]); currentPos += app->applicationBluesteinStringSize[i]; } } if (app->numRaderFFTPrimes > 0) { - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { + for (pfUINT i = 0; i < app->numRaderFFTPrimes; i++) { memcpy(localApplicationStringCast + currentPos, app->raderFFTkernel[i], app->rader_buffer_size[i]); currentPos += app->rader_buffer_size[i]; } } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { if (app->applicationBluesteinString[i] != 0) { free(app->applicationBluesteinString[i]); app->applicationBluesteinString[i] = 0; diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_RunApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_RunApp.h index 1b72e60d..802c5256 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_RunApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_RunApp.h @@ -31,7 +31,7 @@ static inline VkFFTResult VkFFTSync(VkFFTApplication* app) { #elif(VKFFT_BACKEND==1) if (app->configuration.num_streams > 1) { cudaError_t res = 
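In the writes above, every compiled axis kernel is appended after the header as a length prefix followed by the raw binary, walking the inverse plan first and then the forward plan; the Bluestein strings and Rader kernels that follow carry no prefix because their sizes live in the application structure. An illustrative reader loop for one axis (not the library's actual loader):

    pfUINT pos = 5 * sizeof(pfUINT);              /* skip the header */
    for (pfUINT j = 0; j < numAxisUploads; j++) { /* numAxisUploads: uploads for this axis */
        pfUINT binSize;
        memcpy(&binSize, savedString + pos, sizeof(pfUINT));
        pos += sizeof(pfUINT);
        const char* kernelBinary = savedString + pos; /* binSize bytes of backend binary */
        pos += binSize;                               /* hand kernelBinary to the backend loader */
    }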
cudaSuccess; - for (uint64_t s = 0; s < app->configuration.num_streams; s++) { + for (pfUINT s = 0; s < app->configuration.num_streams; s++) { res = cudaEventSynchronize(app->configuration.stream_event[s]); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } @@ -40,7 +40,7 @@ static inline VkFFTResult VkFFTSync(VkFFTApplication* app) { #elif(VKFFT_BACKEND==2) if (app->configuration.num_streams > 1) { hipError_t res = hipSuccess; - for (uint64_t s = 0; s < app->configuration.num_streams; s++) { + for (pfUINT s = 0; s < app->configuration.num_streams; s++) { res = hipEventSynchronize(app->configuration.stream_event[s]); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } @@ -111,34 +111,34 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (inverse != 1) { //FFT axis 0 if (!app->configuration.omitDimension[0]) { - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) { + for (pfINT l = (pfINT)app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; + pfUINT maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); + dispatchBlock[1] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / 
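VkFFTSync above only waits on the per-stream events when more than one stream is configured; with a single stream the backend's own ordering suffices. A minimal, hypothetical CUDA-backend setup that reaches this path (num_streams and stream are existing VkFFTConfiguration fields, assumed unchanged by this patch):

    cudaStream_t streams[2];
    cudaStreamCreate(&streams[0]);
    cudaStreamCreate(&streams[1]);

    VkFFTConfiguration configuration = VKFFT_ZERO_INIT;
    configuration.num_streams = 2;   /* > 1 makes VkFFTSync wait on stream_event[s] */
    configuration.stream = streams;  /* work is split across these streams */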
axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = maxCoordinate * app->configuration.numberBatches; @@ -146,9 +146,9 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL dispatchBlock[2]*= app->localFFTPlan->actualFFTSizePerAxis[0][p]; } - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -156,34 +156,34 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[0] && (app->localFFTPlan->numAxisUploads[0] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[0]; l++) { + for (pfINT l = 1; l < (pfINT)app->localFFTPlan->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures; + pfUINT maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 
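The ceil -> pfceil swap in the dispatch math above changes no arithmetic; dispatchBlock[0] is still the number of workgroups needed to cover the axis at fftDim points per upload and axisBlock rows per group. A small illustrative helper, assuming pfceil behaves like ceil on the backing float type:

    /* groups along X for one axis-0 upload, mirroring the expressions above */
    static pfUINT vkfft_groups_x(pfUINT fftLength, pfUINT fftDim, pfUINT axisBlock)
    {
        /* integer divide by fftDim first, as the original expressions do,
           then round the remaining rows up to whole workgroups            */
        return (pfUINT)pfceil((fftLength / fftDim) / (double)axisBlock);
    }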
1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); + dispatchBlock[1] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } @@ -192,9 +192,9 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL dispatchBlock[2]*= app->localFFTPlan->actualFFTSizePerAxis[0][p]; } - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -206,19 +206,19 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL VkFFTAxis* axis = &app->localFFTPlan->R2Cdecomposition; resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan, axis, 0, 0, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = 
((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures; + pfUINT maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; dispatchBlock[0] = (app->configuration.size[0] / 2 + 1); for (int p = 1; p configuration.FFTdim; p++){ dispatchBlock[0] *= app->configuration.size[p]; } - dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / (double)(2 * axis->axisBlock[0])); + dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; dispatchBlock[2] = maxCoordinate * axis->specializationConstants.numBatches.data.i; @@ -234,26 +234,26 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (!app->configuration.omitDimension[i]) { if ((app->configuration.FFTdim == (i+1)) && (app->configuration.performConvolution)) { - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[i] - 1; l >= 0; l--) { + for (pfINT l = (pfINT)app->localFFTPlan->numAxisUploads[i] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, i, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; + pfUINT maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 
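For the multi-upload R2C decomposition kernel above, the grid is sized so that each workgroup of axisBlock[0] threads covers 2 * axisBlock[0] entries of the packed (N0/2 + 1) x N1 x ... half-spectrum. Worked numbers under assumed sizes (not from the patch), size = {256, 256} and axisBlock[0] = 128:

    /* elements         = (256 / 2 + 1) * 256         = 129 * 256 = 33024
       dispatchBlock[0] = pfceil(33024 / (2.0 * 128)) = pfceil(129.0) = 129
       dispatchBlock    = { 129, 1, maxCoordinate * numBatches }            */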
1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = maxCoordinate * app->configuration.numberBatches; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -263,7 +263,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL } else { - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[i] - 1; l >= 0; l--) { + for (pfINT l = (pfINT)app->localFFTPlan->numAxisUploads[i] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, i, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -272,17 +272,17 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = 
(pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -291,7 +291,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[i]; l++) { + for (pfINT l = 1; l < (pfINT)app->localFFTPlan->numAxisUploads[i]; l++) { VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, i, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -299,16 +299,16 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -327,7 +327,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL //multiple upload ifft leftovers if (app->configuration.FFTdim == (i+1)) { - for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[i]; l++) { + for (pfINT l = (pfINT)1; l < (pfINT)app->localFFTPlan_inverse->numAxisUploads[i]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, i, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -336,15 +336,15 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / 
(double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberKernels; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -362,13 +362,13 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; dispatchBlock[0] = (app->configuration.size[0] / 2 + 1); for (int p = 1; p configuration.FFTdim; p++){ dispatchBlock[0] *= app->configuration.size[p]; } - dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / (double)(2 * axis->axisBlock[0])); + dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; @@ -381,7 +381,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (resFFT != VKFFT_SUCCESS) return resFFT; } - for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[i-1]; l++) { + for (pfINT l = 0; l < (pfINT)app->localFFTPlan_inverse->numAxisUploads[i-1]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[i-1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, i-1, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -390,35 +390,35 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; if (i==1){ if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / 
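When an axis needs more than two uploads, the group count in the hunks above is additionally rounded up to a whole multiple of axisSplit[0][1], presumably so the later stages of the multi-upload decomposition divide the grid evenly. With assumed numbers (illustrative only):

    /* say pfceil(size0 / fftDim / axisBlock[1]) = 100 and axisSplit[0][1] = 16:
       dispatchBlock[0] = pfceil(100 / 16.0) * 16 = 7 * 16 = 112               */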
(double)axis->axisBlock[1])); + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); + dispatchBlock[1] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberKernels; for (int p = 2; p configuration.FFTdim; p++){ dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][p]; } - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); + if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); }else{ - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][i-1] / (double)axis->specializationConstants.fftDim.data.i); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][i-1] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberKernels; for (int p = 1; p configuration.FFTdim; p++){ @@ -426,8 +426,8 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[i-1][p]; } } - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -438,7 +438,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL } if (app->configuration.FFTdim == 1) { - for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { + for (pfINT l = (pfINT)1; l < (pfINT)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -447,13 +447,13 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL 
vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberKernels; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -468,7 +468,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL for (int i = (int)app->configuration.FFTdim-1; i > 0; i--){ if (!app->configuration.omitDimension[i]) { - for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[i] - 1; l >= 0; l--) { + for (pfINT l = (pfINT)app->localFFTPlan_inverse->numAxisUploads[i] - 1; l >= 0; l--) { //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, i, l, 1); @@ -478,18 +478,18 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropaddingInverse[0]) 
dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); + //if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); + //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -498,7 +498,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[i]; l++) { + for (pfINT l = 1; l < (pfINT)app->localFFTPlan_inverse->numAxisUploads[i]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[i][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, i, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -506,16 +506,16 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); + pfUINT dispatchBlock[3]; + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[i][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[i][i] / (double)axis->specializationConstants.fftDim.data.i); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 1; p configuration.FFTdim; p++){ if (p != i) dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[i][p]; } - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -536,13 +536,13 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; dispatchBlock[0] = (app->configuration.size[0] / 2 + 1); for (int p = 1; p configuration.FFTdim; p++){ dispatchBlock[0] *= app->configuration.size[p]; } - dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 
(double)(2 * axis->axisBlock[0])); + dispatchBlock[0] = (pfUINT)pfceil(dispatchBlock[0] / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; @@ -556,7 +556,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (resFFT != VKFFT_SUCCESS) return resFFT; } //FFT axis 0 - for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) { + for (pfINT l = (pfINT)app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) { //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l; VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); @@ -565,34 +565,34 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); + dispatchBlock[1] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 2; p configuration.FFTdim; p++){ dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[0][p]; } - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if 
(app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); @@ -601,7 +601,7 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[0] && (app->localFFTPlan_inverse->numAxisUploads[0] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { + for (pfINT l = 1; l < (pfINT)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; @@ -610,34 +610,34 @@ static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTL vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif - uint64_t dispatchBlock[3]; + pfUINT dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); + dispatchBlock[0] = (pfUINT)pfceil((pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); + dispatchBlock[1] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); + dispatchBlock[0] = (pfUINT)pfceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / 
axis->specializationConstants.fftDim.data.i / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; for (int p = 2; p configuration.FFTdim; p++){ dispatchBlock[2]*= app->localFFTPlan_inverse->actualFFTSizePerAxis[0][p]; } - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); + if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (pfUINT)pfceil(dispatchBlock[1] / 2.0); + //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (pfUINT)pfceil(dispatchBlock[2] / 2.0); resFFT = VkFFT_DispatchPlan(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelStartEnd.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelStartEnd.h index 822b3a34..ab231d2b 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelStartEnd.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelStartEnd.h @@ -24,9 +24,9 @@ #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_StringManagement/vkFFT_StringManager.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h" -static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int64_t type) { +static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, pfINT type) { if (sc->res != VKFFT_SUCCESS) return; - uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 1 : type; + pfUINT locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 
1 : type; PfContainer* floatType; PfGetTypeFromCode(sc, sc->floatTypeCode, &floatType); PfContainer* floatTypeInputMemory; @@ -63,45 +63,45 @@ static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int switch (type) { case 5: { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory->name, vecTypeOutputMemory->name); break; } case 6: { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->name, floatTypeOutputMemory->name); break; } case 110:case 111:case 120:case 121:case 130:case 131:case 140:case 141:case 142:case 143:case 144:case 145: { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory->name, floatTypeOutputMemory->name); break; } default: { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); break; } } PfAppendLine(sc); if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType->name); PfAppendLine(sc); } if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType->name); PfAppendLine(sc); } if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType32->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType32->name); PfAppendLine(sc); } if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType->name); PfAppendLine(sc); } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType->name); PfAppendLine(sc); } sc->tempLen = sprintf(sc->tempStr, ") {\n"); @@ -147,44 +147,44 @@ static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int switch (type) { case 5: { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory->name, vecTypeOutputMemory->name); break; } case 6: { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->name, floatTypeOutputMemory->name); break; } case 110:case 111:case 120:case 121:case 130:case 131:case 140:case 141:case 142:case 143:case 144:case 145: { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = 
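The data.s -> name renames running through these code-generator hunks reflect a container change: the printable type name no longer lives inside the data union. The struct itself is not shown in this patch; an assumed sketch consistent with how the fields are used here:

    /* hypothetical reconstruction of the relevant PfContainer fields */
    typedef struct PfContainer {
        int   type;                                  /* payload kind                    */
        union { pfINT i; double d; /* ... */ } data; /* numeric payloads (data.i below) */
        char* name;                                  /* type / identifier string        */
        int   size;                                  /* allocated length of name        */
    } PfContainer;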
sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory->name, floatTypeOutputMemory->name); break; } default: { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); break; } } PfAppendLine(sc); if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> kernel_obj", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> kernel_obj", vecType->name); PfAppendLine(sc); } if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType->name); PfAppendLine(sc); } if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> g_pow", uintType32->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> g_pow", uintType32->name); PfAppendLine(sc); } if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinConvolutionKernel", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinConvolutionKernel", vecType->name); PfAppendLine(sc); } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinMultiplication", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinMultiplication", vecType->name); PfAppendLine(sc); } sc->tempLen = sprintf(sc->tempStr, ") {\n"); @@ -197,49 +197,49 @@ static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int switch (type) { case 5: { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory->name, vecTypeOutputMemory->name); break; } case 6: { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->name, floatTypeOutputMemory->name); break; } case 110:case 111:case 120:case 121:case 130:case 131:case 140:case 141:case 142:case 143:case 144:case 145: { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory->name, floatTypeOutputMemory->name); break; } default: { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); break; } } PfAppendLine(sc); int args_id = 2; if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType->name); PfAppendLine(sc); args_id++; } if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* 
twiddleLUT", vecType->name); PfAppendLine(sc); args_id++; } if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* g_pow", uintType32->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* g_pow", uintType32->name); PfAppendLine(sc); args_id++; } if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinConvolutionKernel", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinConvolutionKernel", vecType->name); PfAppendLine(sc); args_id++; } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinMultiplication", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinMultiplication", vecType->name); PfAppendLine(sc); args_id++; } @@ -255,63 +255,67 @@ static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int sc->tempLen = sprintf(sc->tempStr, "kernel void VkFFT_main "); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType->name); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType->name); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType->data.s); - PfAppendLine(sc); - - sc->tempLen = sprintf(sc->tempStr, "threadgroup %s* sdata [[threadgroup(0)]], ", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType->name); PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "threadgroup %s* sdata [[threadgroup(0)]], ", floatType->name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "threadgroup %s* sdata [[threadgroup(0)]], ", vecType->name); + PfAppendLine(sc); + } switch (type) { case 5: { - sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory->name, vecTypeOutputMemory->name); break; } case 6: { - sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->name, floatTypeOutputMemory->name); break; } case 110:case 111:case 120:case 121:case 130:case 131:case 140:case 141:case 142:case 143:case 144:case 145: { - sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory->data.s, floatTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory->name, floatTypeOutputMemory->name); break; } default: { - sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->data.s, 
vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->name, vecTypeOutputMemory->name); break; } } PfAppendLine(sc); int args_id = 2; if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* kernel_obj[[buffer(%d)]]", vecType->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* kernel_obj[[buffer(%d)]]", vecType->name, args_id); PfAppendLine(sc); args_id++; } if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType->name, args_id); PfAppendLine(sc); args_id++; } if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* g_pow[[buffer(%d)]]", uintType32->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* g_pow[[buffer(%d)]]", uintType32->name, args_id); PfAppendLine(sc); args_id++; } if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinConvolutionKernel[[buffer(%d)]]", vecType->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinConvolutionKernel[[buffer(%d)]]", vecType->name, args_id); PfAppendLine(sc); args_id++; } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinMultiplication[[buffer(%d)]]", vecType->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinMultiplication[[buffer(%d)]]", vecType->name, args_id); PfAppendLine(sc); args_id++; } @@ -330,9 +334,9 @@ static inline void appendKernelStart(VkFFTSpecializationConstantsLayout* sc, int return; } -static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, int64_t type) { +static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, pfINT type) { if (sc->res != VKFFT_SUCCESS) return; - uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 1 : type; + pfUINT locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 
1 : type; PfContainer* floatType; PfGetTypeFromCode(sc, sc->floatTypeCode, &floatType); PfContainer* floatTypeInputMemory; @@ -364,11 +368,11 @@ static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ void __launch_bounds__(%" PRIi64 ") VkFFT_main_R2C ", sc->localSize[0].data.i * sc->localSize[1].data.i * sc->localSize[2].data.i); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); PfAppendLine(sc); if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType->name); PfAppendLine(sc); } sc->tempLen = sprintf(sc->tempStr, ") {\n"); @@ -408,11 +412,11 @@ static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIi64 ") __global__ void VkFFT_main_R2C ", sc->localSize[0].data.i * sc->localSize[1].data.i * sc->localSize[2].data.i); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); PfAppendLine(sc); if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType->name); PfAppendLine(sc); } sc->tempLen = sprintf(sc->tempStr, ") {\n"); @@ -421,11 +425,11 @@ static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIi64 ", %" PRIi64 ", %" PRIi64 "))) void VkFFT_main_R2C ", sc->localSize[0].data.i, sc->localSize[1].data.i, sc->localSize[2].data.i); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInputMemory->name, vecTypeOutputMemory->name); PfAppendLine(sc); int args_id = 2; if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType->data.s); + sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType->name); PfAppendLine(sc); args_id++; } @@ -440,21 +444,21 @@ static inline void appendKernelStart_R2C(VkFFTSpecializationConstantsLayout* sc, sc->tempLen = sprintf(sc->tempStr, "kernel void VkFFT_main_R2C "); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType->data.s); + sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType->name); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType->name); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType->data.s); + 
sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType->name); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->data.s, vecTypeOutputMemory->data.s); + sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInputMemory->name, vecTypeOutputMemory->name); PfAppendLine(sc); int args_id = 2; if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType->data.s, args_id); + sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType->name, args_id); PfAppendLine(sc); args_id++; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelUtils.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelUtils.h index 2f5f9b4a..49aaba29 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelUtils.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelUtils.h @@ -68,7 +68,7 @@ static inline void appendExtensions(VkFFTSpecializationConstantsLayout* sc) { //PfAppendLine(sc); // - if ((((sc->floatTypeCode/10)%10) == 2) || (sc->useUint64)) { + if ((((sc->floatTypeCode/10)%10) == 2) || (((sc->floatTypeCode/10)%10) == 3) ||(sc->useUint64)) { sc->tempLen = sprintf(sc->tempStr, "\ #extension GL_ARB_gpu_shader_fp64 : enable\n\ #extension GL_ARB_gpu_shader_int64 : enable\n\n"); @@ -96,7 +96,7 @@ static inline void appendExtensions(VkFFTSpecializationConstantsLayout* sc) { PfAppendLine(sc); } #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if ((((sc->floatTypeCode / 10) % 10) == 2) || (sc->useUint64)) { + if ((((sc->floatTypeCode / 10) % 10) == 2) || (((sc->floatTypeCode/10)%10) == 3) || (sc->useUint64)) { sc->tempLen = sprintf(sc->tempStr, "\ #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n"); PfAppendLine(sc); @@ -114,7 +114,35 @@ using namespace metal;\n"); #endif return; } - +static inline void appendQuadDoubleDoubleStruct(VkFFTSpecializationConstantsLayout* sc) { +#if(VKFFT_BACKEND==0) + /*sc->tempLen = sprintf(sc->tempStr, "\ +struct pf_quad {\n\ +%s x;\n\ +%s y;\n\ +};\n", sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc);*/ + sc->tempLen = sprintf(sc->tempStr, "\ +struct pf_quad2 {\n\ +%s x;\n\ +%s y;\n\ +};\n", sc->quadDef.name, sc->quadDef.name); + PfAppendLine(sc); +#else + /*sc->tempLen = sprintf(sc->tempStr, "\ +typedef struct pf_quad {\n\ +%s x;\n\ +%s y;\n\ +};\n", sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc);*/ + sc->tempLen = sprintf(sc->tempStr, "\ +typedef struct pf_quad2 {\n\ +%s x;\n\ +%s y;\n\ +};\n", sc->quadDef.name, sc->quadDef.name); + PfAppendLine(sc); +#endif +} static inline void appendSinCos20(VkFFTSpecializationConstantsLayout* sc) { if (sc->res != VKFFT_SUCCESS) return; PfContainer* vecType; @@ -122,46 +150,46 @@ static inline void appendSinCos20(VkFFTSpecializationConstantsLayout* sc) { PfContainer* floatType; PfGetTypeFromCode(sc, sc->floatTypeCode, &floatType); PfContainer temp_double; - temp_double.type = 32; - PfContainer temp_name; - PfAllocateContainerFlexible(sc, &temp_name, 50); + temp_double.type = 22; + PfContainer temp_name = VKFFT_ZERO_INIT; temp_name.type = 100 + sc->floatTypeCode; + PfAllocateContainerFlexible(sc, &temp_name, 50); #if(VKFFT_BACKEND==0) - temp_double.data.d = 0.63661977236758134307553505349006l; - sprintf(temp_name.data.s, "loc_2_PI"); + temp_double.data.d = pfFPinit("0.63661977236758134307553505349006"); + 
sprintf(temp_name.name, "loc_2_PI"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 1.5707963267948966192313216916398l; - sprintf(temp_name.data.s, "loc_PI_2"); + temp_double.data.d = pfFPinit("1.5707963267948966192313216916398"); + sprintf(temp_name.name, "loc_PI_2"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 0.99999999999999999999962122687403772l; - sprintf(temp_name.data.s, "a1"); + temp_double.data.d = pfFPinit("0.99999999999999999999962122687403772"); + sprintf(temp_name.name, "a1"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = -0.166666666666666666637194166219637268l; - sprintf(temp_name.data.s, "a3"); + temp_double.data.d = pfFPinit("-0.166666666666666666637194166219637268"); + sprintf(temp_name.name, "a3"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 0.00833333333333333295212653322266277182l; - sprintf(temp_name.data.s, "a5"); + temp_double.data.d = pfFPinit("0.00833333333333333295212653322266277182"); + sprintf(temp_name.name, "a5"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = -0.000198412698412696489459896530659927773l; - sprintf(temp_name.data.s, "a7"); + temp_double.data.d = pfFPinit("-0.000198412698412696489459896530659927773"); + sprintf(temp_name.name, "a7"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 2.75573192239364018847578909205399262e-6l; - sprintf(temp_name.data.s, "a9"); + temp_double.data.d = pfFPinit("2.75573192239364018847578909205399262e-6"); + sprintf(temp_name.name, "a9"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = -2.50521083781017605729370231280411712e-8l; - sprintf(temp_name.data.s, "a11"); + temp_double.data.d = pfFPinit("-2.50521083781017605729370231280411712e-8"); + sprintf(temp_name.name, "a11"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 1.60590431721336942356660057796782021e-10l; - sprintf(temp_name.data.s, "a13"); + temp_double.data.d = pfFPinit("1.60590431721336942356660057796782021e-10"); + sprintf(temp_name.name, "a13"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = -7.64712637907716970380859898835680587e-13l; - sprintf(temp_name.data.s, "a15"); + temp_double.data.d = pfFPinit("-7.64712637907716970380859898835680587e-13"); + sprintf(temp_name.name, "a15"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = 2.81018528153898622636194976499656274e-15l; - sprintf(temp_name.data.s, "a17"); + temp_double.data.d = pfFPinit("2.81018528153898622636194976499656274e-15"); + sprintf(temp_name.name, "a17"); PfDefineConstant(sc, &temp_name, &temp_double); - temp_double.data.d = -7.97989713648499642889739108679114937e-18l; - sprintf(temp_name.data.s, "ab"); + temp_double.data.d = pfFPinit("-7.97989713648499642889739108679114937e-18"); + sprintf(temp_name.name, "ab"); PfDefineConstant(sc, &temp_name, &temp_double); sc->tempLen = sprintf(sc->tempStr, "\ @@ -180,7 +208,7 @@ static inline void appendSinCos20(VkFFTSpecializationConstantsLayout* sc) { r = x < 0 ? -r : r;\n\ cos_sin.y = (quadrant & 2) != 0 ? 
-r : r;\n\ return cos_sin;\n\ -}\n\n", sc->functionDef.data.s, vecType->data.s, vecType->data.s); +}\n\n", sc->functionDef.name, vecType->name, vecType->name); PfAppendLine(sc); #endif PfDeallocateContainer(sc, &temp_name); @@ -205,29 +233,69 @@ static inline void appendConversion(VkFFTSpecializationConstantsLayout* sc) { PfGetTypeFromCode(sc, sc->vecTypeOutputMemoryCode, &vecTypeDifferent); PfGetTypeFromCode(sc, sc->floatTypeOutputMemoryCode, &floatTypeDifferent); } + if (((sc->vecTypeCode % 100) / 10) == 3) { + sc->tempLen = sprintf(sc->tempStr, "\ +%s%s conv_%s_to_pf_quad(%s input)\n\ +{\n\ + %s ret_val;\n\ + ret_val.x = (%s) input;\n\ + ret_val.y = (%s) 0;\n\ + return ret_val;\n\ +}\n\n", sc->functionDef.name, sc->quadDef.name, sc->doubleDef.name, sc->doubleDef.name, sc->quadDef.name, sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s%s conv_pf_quad_to_%s(%s input)\n\ +{\n\ + %s ret_val;\n\ + ret_val = (%s) input.x;\n\ + return ret_val;\n\ +}\n\n", sc->functionDef.name, sc->doubleDef.name, sc->doubleDef.name, sc->quadDef.name, sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s%s conv_%s_to_%s(%s input)\n\ +{\n\ + %s ret_val;\n\ + ret_val.x.x = (%s) input.x;\n\ + ret_val.y.x = (%s) input.y;\n\ + ret_val.x.y = (%s) 0;\n\ + ret_val.y.y = (%s) 0;\n\ + return ret_val;\n\ +}\n\n", sc->functionDef.name, sc->quad2Def.name, sc->double2Def.name, sc->quad2Def.name, sc->double2Def.name, sc->quad2Def.name, sc->doubleDef.name, sc->doubleDef.name, sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s%s conv_%s_to_%s(%s input)\n\ +{\n\ + %s ret_val;\n\ + ret_val.x = (%s) input.x.x;\n\ + ret_val.y = (%s) input.y.x;\n\ + return ret_val;\n\ +}\n\n", sc->functionDef.name, sc->double2Def.name, sc->quad2Def.name, sc->double2Def.name, sc->quad2Def.name, sc->double2Def.name, sc->doubleDef.name, sc->doubleDef.name); + PfAppendLine(sc); + } + else { #if(VKFFT_BACKEND==0) #else - sc->tempLen = sprintf(sc->tempStr, "\ + sc->tempLen = sprintf(sc->tempStr, "\ %s%s conv_%s(%s input)\n\ {\n\ %s ret_val;\n\ ret_val.x = (%s) input.x;\n\ ret_val.y = (%s) input.y;\n\ return ret_val;\n\ -}\n\n", sc->functionDef.data.s, vecType->data.s, vecType->data.s, vecTypeDifferent->data.s, vecType->data.s, floatType->data.s, floatType->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ +}\n\n", sc->functionDef.name, vecType->name, vecType->name, vecTypeDifferent->name, vecType->name, floatType->name, floatType->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ %s%s conv_%s(%s input)\n\ {\n\ %s ret_val;\n\ ret_val.x = (%s) input.x;\n\ ret_val.y = (%s) input.y;\n\ return ret_val;\n\ -}\n\n", sc->functionDef.data.s, vecTypeDifferent->data.s, vecTypeDifferent->data.s, vecType->data.s, vecTypeDifferent->data.s, floatTypeDifferent->data.s, floatTypeDifferent->data.s); - PfAppendLine(sc); +}\n\n", sc->functionDef.name, vecTypeDifferent->name, vecTypeDifferent->name, vecType->name, vecTypeDifferent->name, floatTypeDifferent->name, floatTypeDifferent->name); + PfAppendLine(sc); #endif - + } return; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Constants.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Constants.h index 37c5e862..e2fac0b1 100644 --- 
a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Constants.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Constants.h @@ -40,17 +40,17 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) for (int i = 0; i < sc->numRaderPrimes; i++) { if (sc->raderContainer[i].prime > 0) { if (sc->inline_rader_g_pow == 1) { - int64_t g_pow = 1; + pfINT g_pow = 1; #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {1", uintType->data.s, sc->raderContainer[i].g_powConstantStruct.data.s, sc->raderContainer[i].prime); + sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {1", uintType->name, sc->raderContainer[i].g_powConstantStruct.name, sc->raderContainer[i].prime); PfAppendLine(sc); #elif(VKFFT_BACKEND==5) - sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {1", uintType->data.s, sc->raderContainer[i].g_powConstantStruct.data.s, sc->raderContainer[i].prime); + sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {1", uintType->name, sc->raderContainer[i].g_powConstantStruct.name, sc->raderContainer[i].prime); PfAppendLine(sc); #else - sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {1", uintType->data.s, sc->raderContainer[i].g_powConstantStruct.data.s, sc->raderContainer[i].prime); + sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {1", uintType->name, sc->raderContainer[i].g_powConstantStruct.name, sc->raderContainer[i].prime); PfAppendLine(sc); #endif @@ -66,15 +66,15 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) } if (sc->inline_rader_kernel) { #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].r_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {", floatType->name, sc->raderContainer[i].r_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #elif(VKFFT_BACKEND==5) - sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].r_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {", floatType->name, sc->raderContainer[i].r_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #else - sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].r_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {", floatType->name, sc->raderContainer[i].r_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #endif @@ -84,14 +84,14 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1)); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.name); PfAppendLine(sc); } if (((sc->floatTypeCode % 100) / 10) == 1) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.8e", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1)); 
PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.name); PfAppendLine(sc); } if (j < (sc->raderContainer[i].prime - 2)) { @@ -106,22 +106,22 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) } else { for (int j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; + pfUINT g_pow = 1; for (int t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; } if (((sc->floatTypeCode % 100) / 10) == 2) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e", (double)cos(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime)); + sc->tempLen = sprintf(sc->tempStr, "%.17e", (double)pfcos(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime)); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.name); PfAppendLine(sc); } if (((sc->floatTypeCode % 100) / 10) == 1) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e", (float)cos(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime)); + sc->tempLen = sprintf(sc->tempStr, "%.8e", (float)pfcos(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime)); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.name); PfAppendLine(sc); } if (j < (sc->raderContainer[i].prime - 2)) { @@ -137,15 +137,15 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) } } #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].i_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "__constant %s %s[%d]= {", floatType->name, sc->raderContainer[i].i_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #elif(VKFFT_BACKEND==5) - sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].i_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "constant %s %s[%d]= {", floatType->name, sc->raderContainer[i].i_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #else - sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {", floatType->data.s, sc->raderContainer[i].i_rader_kernelConstantStruct.data.s, sc->raderContainer[i].prime - 1); + sc->tempLen = sprintf(sc->tempStr, "const %s %s[%d]= {", floatType->name, sc->raderContainer[i].i_rader_kernelConstantStruct.name, sc->raderContainer[i].prime - 1); PfAppendLine(sc); #endif @@ -155,14 +155,14 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1)); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.name); PfAppendLine(sc); } if (((sc->floatTypeCode % 100) / 10) == 1) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; 
sc->tempLen = sprintf(sc->tempStr, "%.8e", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1)); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.name); PfAppendLine(sc); } @@ -180,22 +180,22 @@ static inline void appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc) } else { for (int j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; + pfUINT g_pow = 1; for (int t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; } if (((sc->floatTypeCode % 100) / 10) == 2) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e", (double)(-sin(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime))); + sc->tempLen = sprintf(sc->tempStr, "%.17e", (double)(-pfsin(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime))); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->doubleLiteral.name); PfAppendLine(sc); } if (((sc->floatTypeCode % 100) / 10) == 1) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e", (float)(-sin(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime))); + sc->tempLen = sprintf(sc->tempStr, "%.8e", (float)(-pfsin(2.0 * g_pow * sc->double_PI / sc->raderContainer[i].prime))); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s ", sc->floatLiteral.name); PfAppendLine(sc); } if (j < (sc->raderContainer[i].prime - 2)) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutput.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutput.h index 6e745f49..e477b53a 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutput.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutput.h @@ -38,10 +38,10 @@ static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc if (sc->performPostCompilationInputOffset) { locOffset.type = 1001; if (sc->inputType < 1000) { - locOffset.data.s = sc->inputOffset.data.s; + locOffset.name = sc->inputOffset.name; } else { - locOffset.data.s = sc->kernelOffset.data.s; + locOffset.name = sc->kernelOffset.name; } } } @@ -88,7 +88,7 @@ static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc } } char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfUINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); } @@ -148,7 +148,7 @@ static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc } } char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfUINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s 
/ %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); } @@ -194,7 +194,7 @@ static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* s else sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]); char shiftY[500] = ""; - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; + pfUINT mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->size[1] > 1) { if (sc->numAxisUploads == 1) { if (sc->axisSwapped) { @@ -233,7 +233,7 @@ static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* s } } char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfUINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); } @@ -291,7 +291,7 @@ static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* s } } char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfUINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h index cc1cfc11..a5003376 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_InputOutputLayout.h @@ -46,14 +46,14 @@ static inline void appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ -};\n\n", id, vecTypeInputMemory->data.s, sc->inputBufferBlockSize); +};\n\n", id, vecTypeInputMemory->name, sc->inputBufferBlockSize / sc->complexSize); PfAppendLine(sc); } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ -} inputBlocks[%" PRIu64 "];\n\n", id, vecTypeInputMemory->data.s, sc->inputBufferBlockSize, sc->inputBufferBlockNum); +} inputBlocks[%" PRIu64 "];\n\n", id, vecTypeInputMemory->name, sc->inputBufferBlockSize / sc->complexSize, sc->inputBufferBlockNum); PfAppendLine(sc); } #elif(VKFFT_BACKEND==1) @@ -72,14 +72,14 @@ layout(std430, binding = %d) buffer DataIn{\n\ sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ -};\n\n", id, floatTypeInputMemory->data.s, 2 * sc->inputBufferBlockSize); +};\n\n", id, floatTypeInputMemory->name, sc->inputBufferBlockSize / (sc->complexSize / 2)); PfAppendLine(sc); } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ -} inputBlocks[%" PRIu64 "];\n\n", id, floatTypeInputMemory->data.s, 2 * sc->inputBufferBlockSize, sc->inputBufferBlockNum); +} inputBlocks[%" PRIu64 "];\n\n", id, 
floatTypeInputMemory->name, sc->inputBufferBlockSize / (sc->complexSize / 2), sc->inputBufferBlockNum); PfAppendLine(sc); } #elif(VKFFT_BACKEND==1) @@ -103,14 +103,14 @@ static inline void appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* s sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataOut{\n\ %s outputs[%" PRIu64 "];\n\ -};\n\n", id, vecTypeOutputMemory->data.s, sc->outputBufferBlockSize); +};\n\n", id, vecTypeOutputMemory->name, sc->outputBufferBlockSize / sc->complexSize); PfAppendLine(sc); } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataOut{\n\ %s outputs[%" PRIu64 "];\n\ -} outputBlocks[%" PRIu64 "];\n\n", id, vecTypeOutputMemory->data.s, sc->outputBufferBlockSize, sc->outputBufferBlockNum); +} outputBlocks[%" PRIu64 "];\n\n", id, vecTypeOutputMemory->name, sc->outputBufferBlockSize / sc->complexSize, sc->outputBufferBlockNum); PfAppendLine(sc); } #elif(VKFFT_BACKEND==1) @@ -129,14 +129,14 @@ layout(std430, binding = %d) buffer DataOut{\n\ sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataOut{\n\ %s outputs[%" PRIu64 "];\n\ -};\n\n", id, floatTypeOutputMemory->data.s, 2 * sc->outputBufferBlockSize); +};\n\n", id, floatTypeOutputMemory->name, sc->outputBufferBlockSize / (sc->complexSize / 2)); PfAppendLine(sc); } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer DataOut{\n\ %s outputs[%" PRIu64 "];\n\ -} outputBlocks[%" PRIu64 "];\n\n", id, floatTypeOutputMemory->data.s, 2 * sc->outputBufferBlockSize, sc->outputBufferBlockNum); +} outputBlocks[%" PRIu64 "];\n\n", id, floatTypeOutputMemory->name, sc->outputBufferBlockSize / (sc->complexSize / 2), sc->outputBufferBlockNum); PfAppendLine(sc); } #elif(VKFFT_BACKEND==1) @@ -159,14 +159,14 @@ static inline void appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout* s sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer Kernel_FFT{\n\ %s kernel_obj[%" PRIu64 "];\n\ -};\n\n", id, vecType->data.s, sc->kernelBlockSize); +};\n\n", id, vecType->name, sc->kernelBlockSize / sc->complexSize); PfAppendLine(sc); } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) buffer Kernel_FFT{\n\ %s kernel_obj[%" PRIu64 "];\n\ -} kernelBlocks[%" PRIu64 "];\n\n", id, vecType->data.s, sc->kernelBlockSize, sc->kernelBlockNum); +} kernelBlocks[%" PRIu64 "];\n\n", id, vecType->name, sc->kernelBlockSize / sc->complexSize, sc->kernelBlockNum); PfAppendLine(sc); } @@ -186,7 +186,7 @@ static inline void appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) readonly buffer DataLUT {\n\ %s twiddleLUT[];\n\ -};\n", id, vecType->data.s); +};\n", id, vecType->name); PfAppendLine(sc); #elif(VKFFT_BACKEND==1) @@ -205,7 +205,7 @@ static inline void appendRaderUintLUTLayoutVkFFT(VkFFTSpecializationConstantsLay sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) readonly buffer DataRaderUintLUT {\n\ %s g_pow[];\n\ -};\n", id, uintType32->data.s); +};\n", id, uintType32->name); PfAppendLine(sc); #elif(VKFFT_BACKEND==1) @@ -226,7 +226,7 @@ static inline void appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) readonly buffer DataBluesteinConvolutionKernel {\n\ %s BluesteinConvolutionKernel[];\n\ -};\n", loc_id, vecType->data.s); +};\n", loc_id, vecType->name); PfAppendLine(sc); loc_id++; } @@ -234,7 +234,7 @@ layout(std430, binding = 
%d) readonly buffer DataBluesteinConvolutionKernel {\n\ sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %d) readonly buffer DataBluesteinMultiplication {\n\ %s BluesteinMultiplication[];\n\ -};\n", loc_id, vecType->data.s); +};\n", loc_id, vecType->name); PfAppendLine(sc); loc_id++; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h index ef49fda7..e0e82c29 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h @@ -28,9 +28,9 @@ static inline void appendPushConstant(VkFFTSpecializationConstantsLayout* sc, PfContainer* var) { if (sc->res != VKFFT_SUCCESS) return; if (var->type > 100) { - PfContainer* varType; + PfContainer* varType = VKFFT_ZERO_INIT; PfGetTypeFromCode(sc, var->type, &varType); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", varType->data.s, var->data.s); + sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", varType->name, var->name); PfAppendLine(sc); } else { @@ -62,33 +62,33 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) { char tempCopyStr[60]; if (sc->performWorkGroupShift[0]) { appendPushConstant(sc, &sc->workGroupShiftX); - sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftX.data.s); - sprintf(sc->workGroupShiftX.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftX.name); + sprintf(sc->workGroupShiftX.name, "%s", tempCopyStr); } if (sc->performWorkGroupShift[1]) { appendPushConstant(sc, &sc->workGroupShiftY); - sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftY.data.s); - sprintf(sc->workGroupShiftY.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftY.name); + sprintf(sc->workGroupShiftY.name, "%s", tempCopyStr); } if (sc->performWorkGroupShift[2]) { appendPushConstant(sc, &sc->workGroupShiftZ); - sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftZ.data.s); - sprintf(sc->workGroupShiftZ.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->workGroupShiftZ.name); + sprintf(sc->workGroupShiftZ.name, "%s", tempCopyStr); } if (sc->performPostCompilationInputOffset) { appendPushConstant(sc, &sc->inputOffset); - sprintf(tempCopyStr, "consts.%s", sc->inputOffset.data.s); - sprintf(sc->inputOffset.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->inputOffset.name); + sprintf(sc->inputOffset.name, "%s", tempCopyStr); } if (sc->performPostCompilationOutputOffset) { appendPushConstant(sc, &sc->outputOffset); - sprintf(tempCopyStr, "consts.%s", sc->outputOffset.data.s); - sprintf(sc->outputOffset.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->outputOffset.name); + sprintf(sc->outputOffset.name, "%s", tempCopyStr); } if (sc->performPostCompilationKernelOffset) { appendPushConstant(sc, &sc->kernelOffset); - sprintf(tempCopyStr, "consts.%s", sc->kernelOffset.data.s); - sprintf(sc->kernelOffset.data.s, "%s", tempCopyStr); + sprintf(tempCopyStr, "consts.%s", sc->kernelOffset.name); + sprintf(sc->kernelOffset.name, "%s", tempCopyStr); } #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n"); diff --git 
a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Registers.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Registers.h index 2f824011..00c2d689 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Registers.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_Registers.h @@ -30,13 +30,13 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; - + char name[50]; //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.x;\n"); int additional_registers_c2r = 0; if ((sc->mergeSequencesR2C == 1) && (type == 5)) additional_registers_c2r = 2; - int64_t max_coordinate = 1; + pfINT max_coordinate = 1; if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) { max_coordinate = sc->matrixConvolution; } @@ -48,10 +48,11 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo if (sc->regIDs == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; for (int i = 0; i < logicalStoragePerThread; i++) { - PfAllocateContainerFlexible(sc, &sc->regIDs[i], 50); sc->regIDs[i].type = 100 + sc->vecTypeCode; - sprintf(sc->regIDs[i].data.s, "temp_%d", i); - PfDefine(sc, &sc->regIDs[i]); + PfAllocateContainerFlexible(sc, &sc->regIDs[i], 50); + + sprintf(name, "temp_%d", i); + PfDefine(sc, &sc->regIDs[i], name); PfSetToZero(sc, &sc->regIDs[i]); } if (sc->convolutionStep) { @@ -60,10 +61,10 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo if (sc->regIDs_copy == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; for (int i = 0; i < logicalStoragePerThread; i++) { - PfAllocateContainerFlexible(sc, &sc->regIDs_copy[i], 50); sc->regIDs_copy[i].type = 100 + sc->vecTypeCode; - sprintf(sc->regIDs_copy[i].data.s, "temp_copy_%d", i); - PfDefine(sc, &sc->regIDs_copy[i]); + PfAllocateContainerFlexible(sc, &sc->regIDs_copy[i], 50); + sprintf(name, "temp_copy_%d", i); + PfDefine(sc, &sc->regIDs_copy[i], name); PfSetToZero(sc, &sc->regIDs_copy[i]); } } @@ -71,75 +72,110 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo if (sc->temp_conv == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; for (int j = 0; j < sc->matrixConvolution; j++) { - PfAllocateContainerFlexible(sc, &sc->temp_conv[j], 50); sc->temp_conv[j].type = 100 + sc->vecTypeCode; - sprintf(sc->temp_conv[j].data.s, "temp_conv_%d", j); - PfDefine(sc, &sc->temp_conv[j]); + PfAllocateContainerFlexible(sc, &sc->temp_conv[j], 50); + sprintf(name, "temp_conv_%d", j); + PfDefine(sc, &sc->temp_conv[j], name); PfSetToZero(sc, &sc->temp_conv[j]); } } - PfAllocateContainerFlexible(sc, &sc->w, 50); sc->w.type = 100 + sc->vecTypeCode; - sprintf(sc->w.data.s, "w"); - PfDefine(sc, &sc->w); + PfAllocateContainerFlexible(sc, &sc->w, 50); + sprintf(name, "w"); + PfDefine(sc, &sc->w, name); PfSetToZero(sc, &sc->w); + if (((sc->floatTypeCode % 100) / 10) == 3) { + sc->tempQuad.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad, 50); + sprintf(name, "tempQuad"); + PfDefine(sc, &sc->tempQuad, name); + PfSetToZero(sc, &sc->tempQuad); + + sc->tempQuad2.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad2, 50); + sprintf(name, "tempQuad2"); + PfDefine(sc, &sc->tempQuad2, name); + PfSetToZero(sc, &sc->tempQuad2); + + 
sc->tempQuad3.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad3, 50); + sprintf(name, "tempQuad3"); + PfDefine(sc, &sc->tempQuad3, name); + PfSetToZero(sc, &sc->tempQuad3); + + sc->tempIntQuad.type = 100 + sc->uintTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempIntQuad, 50); + sprintf(name, "tempIntQuad"); + PfDefine(sc, &sc->tempIntQuad, name); + PfSetToZero(sc, &sc->tempIntQuad); + } + int maxNonPow2Radix = sc->maxNonPow2Radix; for (int i = 0; i < sc->usedLocRegs; i++) { - PfAllocateContainerFlexible(sc, &sc->locID[i], 50); sc->locID[i].type = 100 + sc->vecTypeCode; - sprintf(sc->locID[i].data.s, "loc_%d", i); - PfDefine(sc, &sc->locID[i]); + PfAllocateContainerFlexible(sc, &sc->locID[i], 50); + sprintf(name, "loc_%d", i); + PfDefine(sc, &sc->locID[i], name); PfSetToZero(sc, &sc->locID[i]); } - PfAllocateContainerFlexible(sc, &sc->temp, 50); sc->temp.type = 100 + sc->vecTypeCode; - sprintf(sc->temp.data.s, "loc_0"); - //PfDefine(sc, &sc->temp); + PfAllocateContainerFlexible(sc, &sc->temp, 50); + sprintf(name, "loc_0"); + PfSetContainerName(sc, &sc->temp, name); + //PfDefineReference(sc, &sc->temp, name); //PfSetToZero(sc, &sc->temp); - PfAllocateContainerFlexible(sc, &sc->tempFloat, 50); sc->tempFloat.type = 100 + sc->floatTypeCode; - sprintf(sc->tempFloat.data.s, "%s.x", sc->temp.data.s); + PfAllocateContainerFlexible(sc, &sc->tempFloat, 50); + sprintf(name, "loc_0"); + if (((sc->floatTypeCode % 100) / 10) == 3) { + sprintf(sc->tempFloat.data.dd[0].name, "%s.x.x\n", name); + sprintf(sc->tempFloat.data.dd[1].name, "%s.x.y\n", name); + } + else { + sprintf(sc->tempFloat.name, "%s.x", sc->temp.name); + } + //PfDefineReference(sc, &sc->tempFloat, name); - PfAllocateContainerFlexible(sc, &sc->tempInt, 50); sc->tempInt.type = 100 + sc->uintTypeCode; - sprintf(sc->tempInt.data.s, "tempInt"); - PfDefine(sc, &sc->tempInt); + PfAllocateContainerFlexible(sc, &sc->tempInt, 50); + sprintf(name, "tempInt"); + PfDefine(sc, &sc->tempInt, name); PfSetToZero(sc, &sc->tempInt); - PfAllocateContainerFlexible(sc, &sc->tempInt2, 50); sc->tempInt2.type = 100 + sc->uintTypeCode; - sprintf(sc->tempInt2.data.s, "tempInt2"); - PfDefine(sc, &sc->tempInt2); + PfAllocateContainerFlexible(sc, &sc->tempInt2, 50); + sprintf(name, "tempInt2"); + PfDefine(sc, &sc->tempInt2, name); PfSetToZero(sc, &sc->tempInt2); - PfAllocateContainerFlexible(sc, &sc->shiftX, 50); sc->shiftX.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftX.data.s, "shiftX"); - PfDefine(sc, &sc->shiftX); + PfAllocateContainerFlexible(sc, &sc->shiftX, 50); + sprintf(name, "shiftX"); + PfDefine(sc, &sc->shiftX, name); PfSetToZero(sc, &sc->shiftX); - PfAllocateContainerFlexible(sc, &sc->shiftY, 50); sc->shiftY.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftY.data.s, "shiftY"); - PfDefine(sc, &sc->shiftY); + PfAllocateContainerFlexible(sc, &sc->shiftY, 50); + sprintf(name, "shiftY"); + PfDefine(sc, &sc->shiftY, name); PfSetToZero(sc, &sc->shiftY); - PfAllocateContainerFlexible(sc, &sc->shiftZ, 50); sc->shiftZ.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftZ.data.s, "shiftZ"); - PfDefine(sc, &sc->shiftZ); + PfAllocateContainerFlexible(sc, &sc->shiftZ, 50); + sprintf(name, "shiftZ"); + PfDefine(sc, &sc->shiftZ, name); PfSetToZero(sc, &sc->shiftZ); if (sc->useRaderFFT) { for (int i = 0; i < 2; i++) { - PfAllocateContainerFlexible(sc, &sc->x0[i], 50); sc->x0[i].type = 100 + sc->vecTypeCode; - sprintf(sc->x0[i].data.s, "x0_%d", i); - PfDefine(sc, &sc->x0[i]); + PfAllocateContainerFlexible(sc, &sc->x0[i], 50); + 
sprintf(name, "x0_%d", i); + PfDefine(sc, &sc->x0[i], name); PfSetToZero(sc, &sc->x0[i]); } } @@ -148,22 +184,22 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo int rader_mult_regs = sc->raderRegisters / 2 - rader_fft_regs; if (rader_mult_regs <= sc->usedLocRegs - 1) { for (int i = 0; i < rader_mult_regs; i++) { - PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); sc->x0[i + rader_fft_regs].type = 100 + sc->vecTypeCode; - sprintf(sc->x0[i + rader_fft_regs].data.s, "%s", sc->locID[i + 1].data.s); + PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); + PfCopyContainer(sc, &sc->x0[i + rader_fft_regs], &sc->locID[i + 1]); } } else { for (int i = 0; i < sc->usedLocRegs - 1; i++) { - PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); sc->x0[i + rader_fft_regs].type = 100 + sc->vecTypeCode; - sprintf(sc->x0[i + rader_fft_regs].data.s, "%s", sc->locID[i + 1].data.s); + PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); + PfCopyContainer(sc, &sc->x0[i + rader_fft_regs], &sc->locID[i + 1]); } for (int i = sc->usedLocRegs - 1; i < rader_mult_regs; i++) { - PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); sc->x0[i + rader_fft_regs].type = 100 + sc->vecTypeCode; - sprintf(sc->x0[i + rader_fft_regs].data.s, "x0_%d", i + rader_fft_regs); - PfDefine(sc, &sc->x0[i + rader_fft_regs]); + PfAllocateContainerFlexible(sc, &sc->x0[i + rader_fft_regs], 50); + sprintf(name, "x0_%d", i + rader_fft_regs); + PfDefine(sc, &sc->x0[i + rader_fft_regs], name); PfSetToZero(sc, &sc->x0[i + rader_fft_regs]); } } @@ -176,81 +212,82 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo if ((sc->stageRadix[i] == 8) || (sc->stageRadix[i] == 16) || (sc->stageRadix[i] == 32) || (sc->useRaderFFT)) useRadix8plus = 1; if (useRadix8plus == 1) { if (maxNonPow2Radix > 1) { - PfAllocateContainerFlexible(sc, &sc->iw, 50); sc->iw.type = 100 + sc->vecTypeCode; - sprintf(sc->iw.data.s, "%s", sc->locID[1].data.s); + PfAllocateContainerFlexible(sc, &sc->iw, 50); + sprintf(name, "%s", sc->locID[1].name); + PfSetContainerName(sc, &sc->iw, name); } else { - PfAllocateContainerFlexible(sc, &sc->iw, 50); sc->iw.type = 100 + sc->vecTypeCode; - sprintf(sc->iw.data.s, "iw"); - PfDefine(sc, &sc->iw); + PfAllocateContainerFlexible(sc, &sc->iw, 50); + sprintf(name, "iw"); + PfDefine(sc, &sc->iw, name); PfSetToZero(sc, &sc->iw); } } //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->tempReg); - PfAllocateContainerFlexible(sc, &sc->stageInvocationID, 50); sc->stageInvocationID.type = 100 + sc->uintTypeCode; - sprintf(sc->stageInvocationID.data.s, "stageInvocationID"); - PfDefine(sc, &sc->stageInvocationID); + PfAllocateContainerFlexible(sc, &sc->stageInvocationID, 50); + sprintf(name, "stageInvocationID"); + PfDefine(sc, &sc->stageInvocationID, name); PfSetToZero(sc, &sc->stageInvocationID); - PfAllocateContainerFlexible(sc, &sc->blockInvocationID, 50); sc->blockInvocationID.type = 100 + sc->uintTypeCode; - sprintf(sc->blockInvocationID.data.s, "blockInvocationID"); - PfDefine(sc, &sc->blockInvocationID); + PfAllocateContainerFlexible(sc, &sc->blockInvocationID, 50); + sprintf(name, "blockInvocationID"); + PfDefine(sc, &sc->blockInvocationID, name); PfSetToZero(sc, &sc->blockInvocationID); - PfAllocateContainerFlexible(sc, &sc->sdataID, 50); sc->sdataID.type = 100 + sc->uintTypeCode; - sprintf(sc->sdataID.data.s, "sdataID"); - PfDefine(sc, &sc->sdataID); + PfAllocateContainerFlexible(sc, &sc->sdataID, 
50); + sprintf(name, "sdataID"); + PfDefine(sc, &sc->sdataID, name); PfSetToZero(sc, &sc->sdataID); - PfAllocateContainerFlexible(sc, &sc->combinedID, 50); sc->combinedID.type = 100 + sc->uintTypeCode; - sprintf(sc->combinedID.data.s, "combinedID"); - PfDefine(sc, &sc->combinedID); + PfAllocateContainerFlexible(sc, &sc->combinedID, 50); + sprintf(name, "combinedID"); + PfDefine(sc, &sc->combinedID, name); PfSetToZero(sc, &sc->combinedID); - PfAllocateContainerFlexible(sc, &sc->inoutID, 50); sc->inoutID.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID.data.s, "inoutID"); - PfDefine(sc, &sc->inoutID); + PfAllocateContainerFlexible(sc, &sc->inoutID, 50); + sprintf(name, "inoutID"); + PfDefine(sc, &sc->inoutID, name); PfSetToZero(sc, &sc->inoutID); - PfAllocateContainerFlexible(sc, &sc->inoutID_x, 50); sc->inoutID_x.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID_x.data.s, "inoutID_x"); - PfDefine(sc, &sc->inoutID_x); + PfAllocateContainerFlexible(sc, &sc->inoutID_x, 50); + sprintf(name, "inoutID_x"); + PfDefine(sc, &sc->inoutID_x, name); PfSetToZero(sc, &sc->inoutID_x); - PfAllocateContainerFlexible(sc, &sc->inoutID_y, 50); sc->inoutID_y.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID_y.data.s, "inoutID_y"); - PfDefine(sc, &sc->inoutID_y); + PfAllocateContainerFlexible(sc, &sc->inoutID_y, 50); + sprintf(name, "inoutID_y"); + PfDefine(sc, &sc->inoutID_y, name); PfSetToZero(sc, &sc->inoutID_y); if ((sc->fftDim.data.i < sc->fft_dim_full.data.i) || (type == 1) || (type == 111) || (type == 121) || (type == 131) || (type == 143) || (type == 145) || (type == 2) || (sc->performZeropaddingFull[0]) || (sc->performZeropaddingFull[1]) || (sc->performZeropaddingFull[2])) { - PfAllocateContainerFlexible(sc, &sc->disableThreads, 50); sc->disableThreads.type = 101; - sprintf(sc->disableThreads.data.s, "disableThreads"); - PfDefine(sc, &sc->disableThreads); + PfAllocateContainerFlexible(sc, &sc->disableThreads, 50); + sprintf(name, "disableThreads"); + PfDefine(sc, &sc->disableThreads, name); temp_int.data.i = 1; PfMov(sc, &sc->disableThreads, &temp_int); } //initialize subgroups ids if (sc->useRader) { - PfAllocateContainerFlexible(sc, &sc->raderIDx, 50); sc->raderIDx.type = 100 + sc->uintTypeCode; - sprintf(sc->raderIDx.data.s, "raderIDx"); - PfDefine(sc, &sc->raderIDx); + PfAllocateContainerFlexible(sc, &sc->raderIDx, 50); + sprintf(name, "raderIDx"); + PfDefine(sc, &sc->raderIDx, name); PfSetToZero(sc, &sc->raderIDx); - PfAllocateContainerFlexible(sc, &sc->raderIDx2, 50); sc->raderIDx2.type = 100 + sc->uintTypeCode; - sprintf(sc->raderIDx2.data.s, "raderIDx2"); - PfDefine(sc, &sc->raderIDx2); + PfAllocateContainerFlexible(sc, &sc->raderIDx2, 50); + sprintf(name, "raderIDx2"); + PfDefine(sc, &sc->raderIDx2, name); PfSetToZero(sc, &sc->raderIDx2); /*#if((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)) @@ -275,32 +312,32 @@ static inline void appendRegisterInitialization(VkFFTSpecializationConstantsLayo #endif*/ } if (sc->LUT) { - PfAllocateContainerFlexible(sc, &sc->LUTId, 50); sc->LUTId.type = 100 + sc->uintTypeCode; - sprintf(sc->LUTId.data.s, "LUTId"); - PfDefine(sc, &sc->LUTId); + PfAllocateContainerFlexible(sc, &sc->LUTId, 50); + sprintf(name, "LUTId"); + PfDefine(sc, &sc->LUTId, name); PfSetToZero(sc, &sc->LUTId); if ((!sc->LUT_4step)&&(sc->numAxisUploads>1)) { - PfAllocateContainerFlexible(sc, &sc->angle, 50); sc->angle.type = 100 + sc->floatTypeCode; - sprintf(sc->angle.data.s, "angle"); - PfDefine(sc, &sc->angle); + PfAllocateContainerFlexible(sc, &sc->angle, 50); + sprintf(name, "angle"); + 
PfDefine(sc, &sc->angle, name); PfSetToZero(sc, &sc->angle); } } else { - PfAllocateContainerFlexible(sc, &sc->angle, 50); sc->angle.type = 100 + sc->floatTypeCode; - sprintf(sc->angle.data.s, "angle"); - PfDefine(sc, &sc->angle); + PfAllocateContainerFlexible(sc, &sc->angle, 50); + sprintf(name, "angle"); + PfDefine(sc, &sc->angle, name); PfSetToZero(sc, &sc->angle); } - if (((sc->stageStartSize.data.i > 1) && (!((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT)) { - PfAllocateContainerFlexible(sc, &sc->mult, 50); + if (((sc->stageStartSize.data.i > 1) && (!((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT) || (sc->performDST)) { sc->mult.type = 100 + sc->vecTypeCode; - sprintf(sc->mult.data.s, "mult"); - PfDefine(sc, &sc->mult); + PfAllocateContainerFlexible(sc, &sc->mult, 50); + sprintf(name, "mult"); + PfDefine(sc, &sc->mult, name); PfSetToZero(sc, &sc->mult); } return; @@ -311,84 +348,111 @@ static inline void appendRegisterInitialization_R2C(VkFFTSpecializationConstants if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; + char name[50]; sc->regIDs = (PfContainer*)calloc(sc->registers_per_thread, sizeof(PfContainer)); for (int i = 0; i < sc->registers_per_thread; i++) { - PfAllocateContainerFlexible(sc, &sc->regIDs[i], 50); sc->regIDs[i].type = 100 + sc->vecTypeCode; - sprintf(sc->regIDs[i].data.s, "temp_%d", i); - PfDefine(sc, &sc->regIDs[i]); + PfAllocateContainerFlexible(sc, &sc->regIDs[i], 50); + sprintf(name, "temp_%d", i); + PfDefine(sc, &sc->regIDs[i], name); PfSetToZero(sc, &sc->regIDs[i]); } - PfAllocateContainerFlexible(sc, &sc->w, 50); sc->w.type = 100 + sc->vecTypeCode; - sprintf(sc->w.data.s, "w"); - PfDefine(sc, &sc->w); + PfAllocateContainerFlexible(sc, &sc->w, 50); + sprintf(name, "w"); + PfDefine(sc, &sc->w, name); PfSetToZero(sc, &sc->w); - PfAllocateContainerFlexible(sc, &sc->tempInt, 50); + if (((sc->floatTypeCode % 100) / 10) == 3) { + sc->tempQuad.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad, 50); + sprintf(name, "tempQuad"); + PfDefine(sc, &sc->tempQuad, name); + PfSetToZero(sc, &sc->tempQuad); + + sc->tempQuad2.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad2, 50); + sprintf(name, "tempQuad2"); + PfDefine(sc, &sc->tempQuad2, name); + PfSetToZero(sc, &sc->tempQuad2); + + sc->tempQuad3.type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempQuad3, 50); + sprintf(name, "tempQuad3"); + PfDefine(sc, &sc->tempQuad3, name); + PfSetToZero(sc, &sc->tempQuad3); + + sc->tempIntQuad.type = 100 + sc->uintTypeCode; + PfAllocateContainerFlexible(sc, &sc->tempIntQuad, 50); + sprintf(name, "tempIntQuad"); + PfDefine(sc, &sc->tempIntQuad, name); + PfSetToZero(sc, &sc->tempIntQuad); + } + sc->tempInt.type = 100 + sc->uintTypeCode; - sprintf(sc->tempInt.data.s, "tempInt"); - PfDefine(sc, &sc->tempInt); + PfAllocateContainerFlexible(sc, &sc->tempInt, 50); + sprintf(name, "tempInt"); + PfDefine(sc, &sc->tempInt, name); PfSetToZero(sc, &sc->tempInt); - PfAllocateContainerFlexible(sc, &sc->tempInt2, 50); sc->tempInt2.type = 100 + sc->uintTypeCode; - sprintf(sc->tempInt2.data.s, "tempInt2"); - PfDefine(sc, &sc->tempInt2); + PfAllocateContainerFlexible(sc, &sc->tempInt2, 50); + sprintf(name, "tempInt2"); + 
PfDefine(sc, &sc->tempInt2, name); PfSetToZero(sc, &sc->tempInt2); - PfAllocateContainerFlexible(sc, &sc->shiftX, 50); sc->shiftX.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftX.data.s, "shiftX"); - PfDefine(sc, &sc->shiftX); + PfAllocateContainerFlexible(sc, &sc->shiftX, 50); + sprintf(name, "shiftX"); + PfDefine(sc, &sc->shiftX, name); PfSetToZero(sc, &sc->shiftX); - PfAllocateContainerFlexible(sc, &sc->shiftY, 50); sc->shiftY.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftY.data.s, "shiftY"); - PfDefine(sc, &sc->shiftY); + PfAllocateContainerFlexible(sc, &sc->shiftY, 50); + sprintf(name, "shiftY"); + PfDefine(sc, &sc->shiftY, name); PfSetToZero(sc, &sc->shiftY); - PfAllocateContainerFlexible(sc, &sc->shiftZ, 50); sc->shiftZ.type = 100 + sc->uintTypeCode; - sprintf(sc->shiftZ.data.s, "shiftZ"); - PfDefine(sc, &sc->shiftZ); + PfAllocateContainerFlexible(sc, &sc->shiftZ, 50); + sprintf(name, "shiftZ"); + PfDefine(sc, &sc->shiftZ, name); PfSetToZero(sc, &sc->shiftZ); - PfAllocateContainerFlexible(sc, &sc->inoutID, 50); sc->inoutID.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID.data.s, "inoutID"); - PfDefine(sc, &sc->inoutID); + PfAllocateContainerFlexible(sc, &sc->inoutID, 50); + sprintf(name, "inoutID"); + PfDefine(sc, &sc->inoutID, name); PfSetToZero(sc, &sc->inoutID); - PfAllocateContainerFlexible(sc, &sc->inoutID_x, 50); sc->inoutID_x.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID_x.data.s, "inoutID_x"); - PfDefine(sc, &sc->inoutID_x); + PfAllocateContainerFlexible(sc, &sc->inoutID_x, 50); + sprintf(name, "inoutID_x"); + PfDefine(sc, &sc->inoutID_x, name); PfSetToZero(sc, &sc->inoutID_x); - PfAllocateContainerFlexible(sc, &sc->inoutID_y, 50); sc->inoutID_y.type = 100 + sc->uintTypeCode; - sprintf(sc->inoutID_y.data.s, "inoutID_y"); - PfDefine(sc, &sc->inoutID_y); + PfAllocateContainerFlexible(sc, &sc->inoutID_y, 50); + sprintf(name, "inoutID_y"); + PfDefine(sc, &sc->inoutID_y, name); PfSetToZero(sc, &sc->inoutID_y); if (sc->LUT) { - PfAllocateContainerFlexible(sc, &sc->LUTId, 50); sc->LUTId.type = 100 + sc->uintTypeCode; - sprintf(sc->LUTId.data.s, "LUTId"); - PfDefine(sc, &sc->LUTId); + PfAllocateContainerFlexible(sc, &sc->LUTId, 50); + sprintf(name, "LUTId"); + PfDefine(sc, &sc->LUTId, name); PfSetToZero(sc, &sc->LUTId); } else { - PfAllocateContainerFlexible(sc, &sc->angle, 50); sc->angle.type = 100 + sc->floatTypeCode; - sprintf(sc->angle.data.s, "angle"); - PfDefine(sc, &sc->angle); + PfAllocateContainerFlexible(sc, &sc->angle, 50); + sprintf(name, "angle"); + PfDefine(sc, &sc->angle, name); PfSetToZero(sc, &sc->angle); } @@ -401,11 +465,19 @@ static inline void freeRegisterInitialization(VkFFTSpecializationConstantsLayout PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; - //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.x;\n"); - int logicalStoragePerThread = sc->registers_per_thread * sc->registerBoost; + int additional_registers_c2r = 0; + if ((sc->mergeSequencesR2C == 1) && (type == 5)) + additional_registers_c2r = 2; + + pfINT max_coordinate = 1; + if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) { + max_coordinate = sc->matrixConvolution; + } + + int logicalStoragePerThread = (sc->registers_per_thread + additional_registers_c2r) * sc->registerBoost * (int)max_coordinate; int logicalRegistersPerThread = sc->registers_per_thread; - for (uint64_t i = 0; i < logicalStoragePerThread; i++) { + for (pfUINT i = 0; i < logicalStoragePerThread; i++) { PfDeallocateContainer(sc, &sc->regIDs[i]); } @@ -428,9 +500,9 @@ static 
inline void freeRegisterInitialization(VkFFTSpecializationConstantsLayout //sc->tempLen = sprintf(sc->tempStr, " dum=dum/gl_LocalInvocationID.x-1;\n"); //sc->tempLen = sprintf(sc->tempStr, " dummy=dummy/gl_LocalInvocationID.x-1;\n"); if (sc->registerBoost > 1) { - /*for (uint64_t i = 1; i < sc->registerBoost; i++) { + /*for (pfUINT i = 1; i < sc->registerBoost; i++) { //sc->tempLen = sprintf(sc->tempStr, " %s temp%" PRIu64 "[%" PRIu64 "];\n", vecType, i, logicalRegistersPerThread); - for (uint64_t j = 0; j < sc->registers_per_thread; j++) { + for (pfUINT j = 0; j < sc->registers_per_thread; j++) { sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, j + i * sc->registers_per_thread); PfAppendLine(sc); @@ -445,13 +517,20 @@ static inline void freeRegisterInitialization(VkFFTSpecializationConstantsLayout } PfDeallocateContainer(sc, &sc->w); - uint64_t maxNonPow2Radix = sc->maxNonPow2Radix; - for (uint64_t i = 0; i < sc->usedLocRegs; i++) { + pfUINT maxNonPow2Radix = sc->maxNonPow2Radix; + for (pfUINT i = 0; i < sc->usedLocRegs; i++) { PfDeallocateContainer(sc, &sc->locID[i]); } + PfDeallocateContainer(sc, &sc->temp); PfDeallocateContainer(sc, &sc->tempFloat); + if (((sc->floatTypeCode % 100) / 10) == 3) { + PfDeallocateContainer(sc, &sc->tempQuad); + PfDeallocateContainer(sc, &sc->tempQuad2); + PfDeallocateContainer(sc, &sc->tempQuad3); + PfDeallocateContainer(sc, &sc->tempIntQuad); + } PfDeallocateContainer(sc, &sc->tempInt); PfDeallocateContainer(sc, &sc->tempInt2); @@ -543,7 +622,7 @@ static inline void freeRegisterInitialization(VkFFTSpecializationConstantsLayout else { PfDeallocateContainer(sc, &sc->angle); } - if (((sc->stageStartSize.data.i > 1) && (!((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT)) { + if (((sc->stageStartSize.data.i > 1) && (!((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize.data.i > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT) || (sc->performDST)) { PfDeallocateContainer(sc, &sc->mult); } return; @@ -563,6 +642,12 @@ static inline void freeRegisterInitialization_R2C(VkFFTSpecializationConstantsLa PfDeallocateContainer(sc, &sc->w); + if (((sc->floatTypeCode % 100) / 10) == 3) { + PfDeallocateContainer(sc, &sc->tempQuad); + PfDeallocateContainer(sc, &sc->tempQuad2); + PfDeallocateContainer(sc, &sc->tempQuad3); + PfDeallocateContainer(sc, &sc->tempIntQuad); + } PfDeallocateContainer(sc, &sc->tempInt); PfDeallocateContainer(sc, &sc->tempInt2); diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h index 5a8299ab..dbc306ae 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h @@ -32,14 +32,14 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s sc->sharedMemSize -= (int)(sc->additionalRaderSharedSize.data.i * sc->complexSize); sc->sharedMemSizePow2 -= (int)(sc->additionalRaderSharedSize.data.i * sc->complexSize); } - PfContainer maxSequenceSharedMemory; + PfContainer maxSequenceSharedMemory = VKFFT_ZERO_INIT; 
maxSequenceSharedMemory.type = 31; maxSequenceSharedMemory.data.i = sc->sharedMemSize / sc->complexSize; //maxSequenceSharedMemoryPow2 = sc->sharedMemSizePow2 / sc->complexSize; - uint64_t additionalR2Cshared = 0; - if ((sc->performR2C || ((sc->performDCT == 2) || ((sc->performDCT == 4) && ((sc->fftDim.data.i % 2) != 0)))) && (sc->mergeSequencesR2C) && (sc->axis_id == 0) && (!sc->performR2CmultiUpload)) { + pfUINT additionalR2Cshared = 0; + if ((sc->performR2C || ((sc->performDCT == 2) || (sc->performDST == 2) || (sc->performDCT == 3) || (sc->performDST == 3) || (((sc->performDCT == 4) || (sc->performDST == 4)) && ((sc->fftDim.data.i % 2) != 0)))) && (sc->mergeSequencesR2C) && (sc->axis_id == 0) && (!sc->performR2CmultiUpload)) { additionalR2Cshared = (sc->fftDim.data.i % 2 == 0) ? 2 : 1; - if ((sc->performDCT == 2) || ((sc->performDCT == 4) && ((sc->fftDim.data.i % 2) != 0))) additionalR2Cshared = 1; + if ((sc->performDCT == 2) || (sc->performDST == 2) || (sc->performDCT == 3) || (sc->performDST == 3) || (((sc->performDCT == 4) || (sc->performDST == 4)) && ((sc->fftDim.data.i % 2) != 0))) additionalR2Cshared = 1; } switch (type) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_r2c @@ -49,18 +49,18 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s sc->sharedStrideBankConflictFirstStages.data.i = ((sc->fftDim.data.i > sc->numSharedBanks / 2) && ((sc->fftDim.data.i & (sc->fftDim.data.i - 1)) == 0)) ? (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared) * (sc->numSharedBanks / 2 + 1) / (sc->numSharedBanks / 2) : sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared; sc->sharedStrideReadWriteConflict.type = 31; sc->sharedStrideReadWriteConflict.data.i = ((sc->numSharedBanks / 2 <= sc->localSize[1].data.i)) ? sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared + 1 : sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared + (sc->numSharedBanks / 2) / sc->localSize[1].data.i; - if ((uint64_t)sc->sharedStrideReadWriteConflict.data.i < (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared)) sc->sharedStrideReadWriteConflict.data.i = sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared; + if ((pfUINT)sc->sharedStrideReadWriteConflict.data.i < (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared)) sc->sharedStrideReadWriteConflict.data.i = sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared; if (sc->useRaderFFT) { - uint64_t max_stride = sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared; - uint64_t max_shift = 0; - for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { + pfUINT max_stride = sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared; + pfUINT max_shift = 0; + for (pfUINT i = 0; i < sc->numRaderPrimes; i++) { - for (uint64_t j = 0; j < sc->raderContainer[i].numStages; j++) { + for (pfUINT j = 0; j < sc->raderContainer[i].numStages; j++) { if (sc->raderContainer[i].containerFFTNum < 8) { - uint64_t subLogicalGroupSize = (uint64_t)ceil(sc->raderContainer[i].containerFFTDim / (double)sc->raderContainer[i].registers_per_thread_per_radix[sc->raderContainer[i].stageRadix[j]]); // hopefully it is not <1, will fix - uint64_t shift = (subLogicalGroupSize > (sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2))) ? 
subLogicalGroupSize - sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2) : 0; + pfUINT subLogicalGroupSize = (pfUINT)pfceil(sc->raderContainer[i].containerFFTDim / (double)sc->raderContainer[i].registers_per_thread_per_radix[sc->raderContainer[i].stageRadix[j]]); // hopefully it is not <1, will fix + pfUINT shift = (subLogicalGroupSize > (sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2) : 0; if (j == 0) shift = (sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2)) ? 0 : 1; - uint64_t loc_stride = sc->raderContainer[i].containerFFTDim + shift; + pfUINT loc_stride = sc->raderContainer[i].containerFFTDim + shift; if (sc->raderContainer[i].containerFFTNum * (loc_stride + 1) > max_stride) { max_stride = sc->raderContainer[i].containerFFTNum * (loc_stride + 1); if (shift > max_shift) max_shift = shift; @@ -98,44 +98,70 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s sc->usedSharedMemory.data.i = sc->complexSize * sc->localSize[1].data.i * sc->maxSharedStride.data.i; if (sc->useRaderMult) { - for (uint64_t i = 0; i < 20; i++) { + for (pfUINT i = 0; i < 20; i++) { sc->RaderKernelOffsetShared[i].type = 31; sc->RaderKernelOffsetShared[i].data.i += sc->usedSharedMemory.data.i / sc->complexSize; } sc->usedSharedMemory.data.i += sc->additionalRaderSharedSize.data.i * sc->complexSize; } - PfContainer* vecType; - PfGetTypeFromCode(sc, sc->vecTypeCode, &vecType); + if (sc->storeSharedComplexComponentsSeparately){ + sc->offsetImaginaryShared.type = 31; + sc->offsetImaginaryShared.data.i = sc->usedSharedMemory.data.i / sc->complexSize; + PfContainer* floatType = VKFFT_ZERO_INIT; + PfGetTypeFromCode(sc, sc->floatTypeCode, &floatType); #if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", vecType->data.s, sc->usedSharedMemory.data.i / sc->complexSize); - PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIi64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", floatType->name, (2 * sc->usedSharedMemory.data.i) / sc->complexSize); + PfAppendLine(sc); #elif(VKFFT_BACKEND==1) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->data.s, vecType->data.s); - PfAppendLine(sc); - - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", floatType->name, floatType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); #elif(VKFFT_BACKEND==2) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * 
sc->maxSharedStride); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->data.s, vecType->data.s); - PfAppendLine(sc); - - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", floatType->name, floatType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", vecType->data.s, sc->usedSharedMemory.data.i / sc->complexSize); - PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIi64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", floatType->name, (2 * sc->usedSharedMemory.data.i) / sc->complexSize); + PfAppendLine(sc); +#endif + }else{ + PfContainer* vecType = VKFFT_ZERO_INIT; + PfGetTypeFromCode(sc, sc->vecTypeCode, &vecType); +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIi64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", vecType->name, sc->usedSharedMemory.data.i / sc->complexSize); + PfAppendLine(sc); +#elif(VKFFT_BACKEND==1) + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->name, vecType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); +#elif(VKFFT_BACKEND==2) + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->name, vecType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); +#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) + sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIi64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", vecType->name, sc->usedSharedMemory.data.i / sc->complexSize); + PfAppendLine(sc); #endif + } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c + single_c2c_strided { - uint64_t shift = (sc->fftDim.data.i < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim.data.i : 1; + pfUINT shift = (sc->fftDim.data.i < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim.data.i : 1; sc->sharedStrideReadWriteConflict.type = 31; sc->sharedStrideReadWriteConflict.data.i = ((sc->axisSwapped) && ((sc->localSize[0].data.i % 4) == 0)) ? 
sc->localSize[0].data.i + shift : sc->localSize[0].data.i; sc->maxSharedStride.type = 31; - sc->maxSharedStride.data.i = ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i * (sc->fftDim.data.i / sc->registerBoost + (int64_t)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; + sc->maxSharedStride.data.i = ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i * (sc->fftDim.data.i / sc->registerBoost + (pfINT)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; sc->sharedStrideReadWriteConflict.data.i = (sc->maxSharedStride.data.i == sc->localSize[0].data.i) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; sc->sharedStride.type = 31; @@ -144,35 +170,59 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s sc->usedSharedMemory.type = 31; sc->usedSharedMemory.data.i = sc->complexSize * sc->maxSharedStride.data.i * (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared); if (sc->useRaderMult) { - for (uint64_t i = 0; i < 20; i++) { + for (pfUINT i = 0; i < 20; i++) { sc->RaderKernelOffsetShared[i].type = 31; sc->RaderKernelOffsetShared[i].data.i += sc->usedSharedMemory.data.i / sc->complexSize; } sc->usedSharedMemory.data.i += sc->additionalRaderSharedSize.data.i * sc->complexSize; } - PfContainer* vecType; - PfGetTypeFromCode(sc, sc->vecTypeCode, &vecType); + if (sc->storeSharedComplexComponentsSeparately){ + sc->offsetImaginaryShared.type = 31; + sc->offsetImaginaryShared.data.i = sc->usedSharedMemory.data.i / sc->complexSize; + PfContainer* floatType = VKFFT_ZERO_INIT; + PfGetTypeFromCode(sc, sc->floatTypeCode, &floatType); #if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIu64 "];\n\n", vecType->data.s, sc->usedSharedMemory.data.i / sc->complexSize); - PfAppendLine(sc); - + sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIi64 "];\n\n", floatType->name, (2 * sc->usedSharedMemory.data.i) / sc->complexSize); + PfAppendLine(sc); #elif(VKFFT_BACKEND==1) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->data.s, vecType->data.s); - PfAppendLine(sc); - - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", floatType->name, floatType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); #elif(VKFFT_BACKEND==2) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->data.s, vecType->data.s); - PfAppendLine(sc); - - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", floatType->name, floatType->name); + 
PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIu64 "];\n\n", vecType->data.s, sc->usedSharedMemory.data.i / sc->complexSize); - PfAppendLine(sc); - + sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIi64 "];\n\n", floatType->name, (2 * sc->usedSharedMemory.data.i) / sc->complexSize); + PfAppendLine(sc); #endif + }else{ + PfContainer* vecType = VKFFT_ZERO_INIT; + PfGetTypeFromCode(sc, sc->vecTypeCode, &vecType); +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "shared %s sdata[%" PRIi64 "];\n\n", vecType->name, sc->usedSharedMemory.data.i / sc->complexSize); + PfAppendLine(sc); +#elif(VKFFT_BACKEND==1) + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->name, vecType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); +#elif(VKFFT_BACKEND==2) + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); + sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType->name, vecType->name); + PfAppendLine(sc); + + //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); +#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) + sc->tempLen = sprintf(sc->tempStr, "__local %s sdata[%" PRIi64 "];\n\n", vecType->name, sc->usedSharedMemory.data.i / sc->complexSize); + PfAppendLine(sc); +#endif + } break; } } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h index 95d65b0e..9cb99668 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h @@ -27,82 +27,148 @@ static inline void appendSharedToRegisters(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -%s = sdata[%s];\n", out->data.s, sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s];\n", out->data.c[0].name, sdataID->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s + %" PRIi64 "];\n", out->data.c[1].name, sdataID->name, sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s];\n", out->name, sdataID->name); + PfAppendLine(sc); + } return; } static inline void appendSharedToRegisters_x_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = sdata[%s].x;\n", out->data.s, sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s];\n", out->data.c[0].name, sdataID->name); + PfAppendLine(sc); + }else{ + sc->tempLen = 
sprintf(sc->tempStr, "\ +%s = sdata[%s].x;\n", out->data.c[0].name, sdataID->name); + PfAppendLine(sc); + } return; } static inline void appendSharedToRegisters_x_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = sdata[%s].y;\n", out->data.s, sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s + %" PRIi64 "];\n", out->data.c[0].name, sdataID->name, sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s].y;\n", out->data.c[0].name, sdataID->name); + PfAppendLine(sc); + } return; } static inline void appendSharedToRegisters_y_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = sdata[%s].x;\n", out->data.s, sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s];\n", out->data.c[1].name, sdataID->name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s].x;\n", out->data.c[1].name, sdataID->name); + PfAppendLine(sc); + } return; } static inline void appendSharedToRegisters_y_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = sdata[%s].y;\n", out->data.s, sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s + %" PRIi64 "];\n", out->data.c[1].name, sdataID->name, sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sdata[%s].y;\n", out->data.c[1].name, sdataID->name); + PfAppendLine(sc); + } return; } static inline void appendRegistersToShared(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -sdata[%s] = %s;\n", sdataID->data.s, out->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s] = %s;\n", sdataID->name, out->data.c[0].name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s + %" PRIi64 "] = %s;\n", sdataID->name, sc->offsetImaginaryShared.data.i, out->data.c[1].name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s] = %s;\n", sdataID->name, out->name); + PfAppendLine(sc); + } return; } static inline void appendRegistersToShared_x_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -sdata[%s].x = %s.x;\n", sdataID->data.s, out->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s] = %s;\n", sdataID->name, out->data.c[0].name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].x = %s;\n", sdataID->name, out->data.c[0].name); + PfAppendLine(sc); + } return; } static inline void appendRegistersToShared_x_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -sdata[%s].x 
= %s.y;\n", sdataID->data.s, out->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s] = %s;\n", sdataID->name, out->data.c[1].name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].x = %s;\n", sdataID->name, out->data.c[1].name); + PfAppendLine(sc); + } return; } static inline void appendRegistersToShared_y_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -sdata[%s].y = %s.y;\n", sdataID->data.s, out->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s + %" PRIi64 "] = %s;\n", sdataID->name, sc->offsetImaginaryShared.data.i, out->data.c[1].name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].y = %s;\n", sdataID->name, out->data.c[1].name); + PfAppendLine(sc); + } return; } static inline void appendRegistersToShared_y_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "\ -sdata[%s].y = %s.x;\n", sdataID->data.s, out->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s + %" PRIi64 "] = %s;\n", sdataID->name, sc->offsetImaginaryShared.data.i, out->data.c[0].name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].y = %s;\n", sdataID->name, out->data.c[0].name); + PfAppendLine(sc); + } return; } @@ -110,7 +176,7 @@ static inline void appendConstantToRegisters(VkFFTSpecializationConstantsLayout* { if (sc->res != VKFFT_SUCCESS) return; sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s[%s];\n", out->data.s, constantBufferName->data.s, inoutID->data.s); +%s = %s[%s];\n", out->name, constantBufferName->name, inoutID->name); PfAppendLine(sc); return; } @@ -118,7 +184,7 @@ static inline void appendConstantToRegisters_x(VkFFTSpecializationConstantsLayou { if (sc->res != VKFFT_SUCCESS) return; sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = %s[%s];\n", out->data.s, constantBufferName->data.s, inoutID->data.s); +%s = %s[%s];\n", out->data.c[0].name, constantBufferName->name, inoutID->name); PfAppendLine(sc); return; } @@ -126,7 +192,7 @@ static inline void appendConstantToRegisters_y(VkFFTSpecializationConstantsLayou { if (sc->res != VKFFT_SUCCESS) return; sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = %s[%s];\n", out->data.s, constantBufferName->data.s, inoutID->data.s); +%s = %s[%s];\n", out->data.c[1].name, constantBufferName->name, inoutID->name); PfAppendLine(sc); return; } @@ -134,22 +200,23 @@ static inline void appendConstantToRegisters_y(VkFFTSpecializationConstantsLayou static inline void appendGlobalToRegisters(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* bufferName, PfContainer* inoutID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, out, bufferName); - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, 
sc->inputBufferBlockSize); + int dataSize = ((out->type % 10) == 3) ? sc->complexSize : sc->complexSize / 2; + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / dataSize, bufferName->name, inoutID->name, sc->inputBufferBlockSize / dataSize); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / dataSize, bufferName->name, inoutID->name, sc->outputBufferBlockSize / dataSize); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / dataSize, bufferName->name, inoutID->name, sc->kernelBlockSize / dataSize); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); PfAppendConversionEnd(sc, out, bufferName); @@ -160,28 +227,26 @@ static inline void appendGlobalToRegisters(VkFFTSpecializationConstantsLayout* s static inline void appendGlobalToRegisters_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* bufferName, PfContainer* inoutID) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer* floatType; - PfGetTypeFromCode(sc, out->type - 1, &floatType); - - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); + + sc->tempLen = sprintf(sc->tempStr, "%s", out->data.c[0].name); PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - PfAppendConversionStart(sc, floatType, bufferName); - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + PfAppendConversionStart(sc, &out->data.c[0], bufferName); + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if 
((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2)); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); - PfAppendConversionEnd(sc, floatType, bufferName); + PfAppendConversionEnd(sc, &out->data.c[0], bufferName); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); return; @@ -189,53 +254,53 @@ static inline void appendGlobalToRegisters_x(VkFFTSpecializationConstantsLayout* static inline void appendGlobalToRegisters_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* bufferName, PfContainer* inoutID) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer* floatType; - PfGetTypeFromCode(sc, out->type - 1, &floatType); - - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); + + sc->tempLen = sprintf(sc->tempStr, "%s", out->data.c[1].name); PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - PfAppendConversionStart(sc, floatType, bufferName); - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + PfAppendConversionStart(sc, &out->data.c[1], bufferName); + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, 
bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2)); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); - PfAppendConversionEnd(sc, floatType, bufferName); + PfAppendConversionEnd(sc, &out->data.c[1], bufferName); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); + return; } static inline void appendRegistersToGlobal(VkFFTSpecializationConstantsLayout* sc, PfContainer* bufferName, PfContainer* inoutID, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + int dataSize = ((in->type % 10) == 3) ? sc->complexSize : sc->complexSize / 2; + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / dataSize, bufferName->name, inoutID->name, sc->inputBufferBlockSize / dataSize); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / dataSize, bufferName->name, inoutID->name, sc->outputBufferBlockSize / dataSize); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / dataSize, bufferName->name, inoutID->name, sc->kernelBlockSize / dataSize); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, bufferName, in); - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", in->name); PfAppendLine(sc); PfAppendConversionEnd(sc, bufferName, in); sc->tempLen = sprintf(sc->tempStr, ";\n"); @@ -245,28 +310,25 @@ static inline void appendRegistersToGlobal(VkFFTSpecializationConstantsLayout* s static inline void 
appendRegistersToGlobal_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* bufferName, PfContainer* inoutID, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer* floatType; - PfGetTypeFromCode(sc, in->type - 1, &floatType); - - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2)); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - PfAppendConversionStart(sc, bufferName, floatType); - sc->tempLen = sprintf(sc->tempStr, "%s.x", in->data.s); + PfAppendConversionStart(sc, bufferName, &in->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%s", in->data.c[0].name); PfAppendLine(sc); - PfAppendConversionEnd(sc, bufferName, floatType); + PfAppendConversionEnd(sc, bufferName, &in->data.c[0]); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); return; @@ -274,28 +336,25 @@ static inline void appendRegistersToGlobal_x(VkFFTSpecializationConstantsLayout* static inline void appendRegistersToGlobal_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* bufferName, PfContainer* inoutID, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer* floatType; - PfGetTypeFromCode(sc, in->type - 1, &floatType); - - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + if ((!(strcmp(bufferName->name, 
sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->inputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->outputBufferBlockSize / (sc->complexSize / 2)); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2), bufferName->name, inoutID->name, sc->kernelBlockSize / (sc->complexSize / 2)); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - PfAppendConversionStart(sc, bufferName, floatType); - sc->tempLen = sprintf(sc->tempStr, "%s.y", in->data.s); + PfAppendConversionStart(sc, bufferName, &in->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%s", in->data.c[1].name); PfAppendLine(sc); - PfAppendConversionEnd(sc, bufferName, floatType); + PfAppendConversionEnd(sc, bufferName, &in->data.c[1]); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); return; @@ -304,50 +363,67 @@ static inline void appendRegistersToGlobal_y(VkFFTSpecializationConstantsLayout* static inline void appendGlobalToShared(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID, PfContainer* bufferName, PfContainer* inoutID) { if (sc->res != VKFFT_SUCCESS) return; - sc->tempLen = sprintf(sc->tempStr, "sdata[%s]", sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "%s", sc->temp.name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "sdata[%s]", sdataID->name); + PfAppendLine(sc); + } sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, &sc->sdataStruct, bufferName); - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / sc->complexSize, 
bufferName->name, inoutID->name, sc->inputBufferBlockSize / sc->complexSize); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / sc->complexSize, bufferName->name, inoutID->name, sc->outputBufferBlockSize / sc->complexSize); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / sc->complexSize, bufferName->name, inoutID->name, sc->kernelBlockSize / sc->complexSize); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); PfAppendConversionEnd(sc, &sc->sdataStruct, bufferName); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); + + if (sc->storeSharedComplexComponentsSeparately){ + appendRegistersToShared(sc, sdataID, &sc->temp); + } return; } static inline void appendSharedToGlobal(VkFFTSpecializationConstantsLayout* sc, PfContainer* bufferName, PfContainer* inoutID, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - if ((!(strcmp(bufferName->data.s, sc->inputsStruct.data.s))) && (sc->inputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->inputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->inputBufferBlockSize); + if (sc->storeSharedComplexComponentsSeparately){ + appendSharedToRegisters(sc, &sc->temp, sdataID); + } + if ((!(strcmp(bufferName->name, sc->inputsStruct.name))) && (sc->inputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "inputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->inputBufferBlockSize / sc->complexSize, bufferName->name, inoutID->name, sc->inputBufferBlockSize / sc->complexSize); } - else if ((!(strcmp(bufferName->data.s, sc->outputsStruct.data.s))) && (sc->outputBufferBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, sc->outputBufferBlockSize, bufferName->data.s, inoutID->data.s, sc->outputBufferBlockSize); + else if ((!(strcmp(bufferName->name, sc->outputsStruct.name))) && (sc->outputBufferBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "outputBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->outputBufferBlockSize / sc->complexSize, bufferName->name, inoutID->name, sc->outputBufferBlockSize / sc->complexSize); } - else if ((!(strcmp(bufferName->data.s, sc->kernelStruct.data.s))) && (sc->kernelBlockNum != 1)) { - sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->data.s, 
sc->kernelBlockSize, bufferName->data.s, inoutID->data.s, sc->kernelBlockSize); + else if ((!(strcmp(bufferName->name, sc->kernelStruct.name))) && (sc->kernelBlockNum != 1)) { + sc->tempLen = sprintf(sc->tempStr, "kernelBlocks[%s / %" PRIu64 "].%s[%s %% %" PRIu64 "]", inoutID->name, sc->kernelBlockSize / sc->complexSize, bufferName->name, inoutID->name, sc->kernelBlockSize / sc->complexSize); } else { - sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->data.s, inoutID->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s[%s]", bufferName->name, inoutID->name); } PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, bufferName, &sc->sdataStruct); - sc->tempLen = sprintf(sc->tempStr, "sdata[%s]", sdataID->data.s); - PfAppendLine(sc); + if (sc->storeSharedComplexComponentsSeparately){ + sc->tempLen = sprintf(sc->tempStr, "%s", sc->temp.name); + PfAppendLine(sc); + }else{ + sc->tempLen = sprintf(sc->tempStr, "sdata[%s]", sdataID->name); + PfAppendLine(sc); + } PfAppendConversionEnd(sc, bufferName, &sc->sdataStruct); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); @@ -367,7 +443,7 @@ static inline void appendSetSMToZero(VkFFTSpecializationConstantsLayout* sc) { temp_int.data.i = sc->localSize[0].data.i * sc->localSize[1].data.i; temp_int1.data.i = sc->usedSharedMemory.data.i / sc->complexSize; PfDivCeil(sc, &used_registers, &temp_int1, &temp_int); - for (int64_t i = 0; i < used_registers.data.i; i++) { + for (pfINT i = 0; i < used_registers.data.i; i++) { if (sc->localSize[1].data.i == 1) { temp_int.data.i = (i)*sc->localSize[0].data.i; diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_4step.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_4step.h index 72923163..d2c07c66 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_4step.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_4step.h @@ -35,8 +35,8 @@ static inline void appendReorder4Step(VkFFTSpecializationConstantsLayout* sc, in PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; - uint64_t logicalRegistersPerThread; + temp_double.type = 22; + pfUINT logicalRegistersPerThread; if (readWrite==0) logicalRegistersPerThread = (sc->rader_generator[0] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[0]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; else @@ -59,12 +59,12 @@ static inline void appendReorder4Step(VkFFTSpecializationConstantsLayout* sc, in else { PfMod(sc, &sc->inoutID, &sc->shiftX, &sc->stageStartSize); } - for (uint64_t i = 0; i < (uint64_t)temp_int1.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)temp_int1.data.i; i++) { PfMod(sc, &temp_int, &sc->fftDim, &sc->localSize[1]); if ((temp_int.data.i != 0) && (i == (temp_int1.data.i - 1))) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &temp_int); } - uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; + pfUINT id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; if ((sc->LUT) && (sc->LUT_4step)) { temp_int.data.i = i * sc->localSize[1].data.i; @@ -83,7 +83,7 @@ static inline void appendReorder4Step(VkFFTSpecializationConstantsLayout* sc, in temp_int.data.i = i * sc->localSize[1].data.i; PfAdd(sc, &sc->tempInt, &sc->gl_LocalInvocationID_y, &temp_int); PfMul(sc, &sc->tempInt, &sc->inoutID, &sc->tempInt, 0); - temp_double.data.d = 2 * sc->double_PI/ (long double)(sc->stageStartSize.data.i * sc->fftDim.data.i); + temp_double.data.d = pfFPinit("2.0") * sc->double_PI/ (pfLD)(sc->stageStartSize.data.i * sc->fftDim.data.i); PfMul(sc, &sc->angle, &sc->tempInt, &temp_double, 0); PfSinCos(sc, &sc->mult, &sc->angle); if ((!sc->inverse) && (readWrite == 1)) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Bluestein.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Bluestein.h index 0e94f9c4..39e981e8 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Bluestein.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Bluestein.h @@ -29,14 +29,14 @@ #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_KernelUtils.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h" -static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, uint64_t strideType, uint64_t pre_or_post_multiplication) { +static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, pfUINT strideType, pfUINT pre_or_post_multiplication) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; //char index_y[2000] = ""; @@ -79,8 +79,8 @@ static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLay PfDivCeil(sc, &used_registers, &sc->fftDim, &localSize); - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { PfContainer current_group_cut; current_group_cut.type = 31; current_group_cut .data.i = sc->fftDim.data.i - i * localSize.data.i; @@ -151,7 +151,7 @@ static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLay appendGlobalToRegisters(sc, &sc->w, &sc->BluesteinStruct, &sc->inoutID); - //uint64_t k = 0; + //pfUINT k = 0; if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { if (sc->stridedSharedLayout) { temp_int.data.i = i 
* sc->localSize[1].data.i; @@ -185,7 +185,7 @@ static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLay if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) { PfIf_end(sc); } - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -195,14 +195,14 @@ static inline void appendBluesteinMultiplication(VkFFTSpecializationConstantsLay return; } -static inline void appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, uint64_t strideType) { +static inline void appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, pfUINT strideType) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; if (sc->useDisableThreads) { temp_int.data.i = 0; @@ -227,8 +227,8 @@ static inline void appendBluesteinConvolution(VkFFTSpecializationConstantsLayout PfDivCeil(sc, &used_registers, &sc->fftDim, &sc->localSize[0]); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { temp_int.data.i = sc->fftDim.data.i - i * localSize.data.i; PfIf_lt_start(sc, localInvocationID, &temp_int); } @@ -298,7 +298,7 @@ static inline void appendBluesteinConvolution(VkFFTSpecializationConstantsLayout PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->w, &sc->temp); PfIf_end(sc); - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { PfIf_end(sc); } } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Convolution.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Convolution.h index 8160f30b..fe231faa 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Convolution.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_Convolution.h @@ -37,7 +37,7 @@ static inline void appendRegisterStorage(VkFFTSpecializationConstantsLayout* sc, PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -66,8 +66,8 @@ static inline void appendRegisterStorage(VkFFTSpecializationConstantsLayout* sc, PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { temp_int.data.i = sc->fftDim.data.i - i * localSize.data.i; PfIf_lt_start(sc, localInvocationID, &temp_int); } @@ -91,7 +91,7 @@ static inline void appendRegisterStorage(VkFFTSpecializationConstantsLayout* sc, else appendSharedToRegisters(sc, &sc->regIDs[sc->coordinate.data.i * sc->registers_per_thread + i], &sc->sdataID); - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -110,11 +110,11 @@ static inline void 
appendPreparationBatchedKernelConvolution(VkFFTSpecialization PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread; i++) { //sc->tempLen = sprintf(sc->tempStr, " temp%s[i]=temp[i];\n", separateRegisterStore); - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { + for (pfUINT j = 0; j < sc->matrixConvolution; j++) { PfMov(sc, &sc->regIDs_copy[i + j * sc->registers_per_thread], &sc->regIDs[i + j * sc->registers_per_thread]); } } @@ -122,14 +122,14 @@ static inline void appendPreparationBatchedKernelConvolution(VkFFTSpecialization } -static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, uint64_t strideType) { +static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, pfUINT strideType) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer localSize = VKFFT_ZERO_INIT; localSize.type = 31; @@ -176,12 +176,12 @@ static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* s PfMul(sc, &temp_int, &batching_localSize, &sc->firstStageStartSize, 0); PfMul(sc, &sc->tempInt, &sc->tempInt, &temp_int, 0); - //sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (uint64_t)ceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); + //sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? 
%" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((pfUINT)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (pfUINT)pfceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((pfUINT)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); temp_int.data.i = sc->firstStageStartSize.data.i / sc->fftDim.data.i; PfDiv(sc, &sc->tempInt, &sc->gl_WorkGroupID_x, &temp_int); - temp_int1.data.i = ((int64_t)floor(sc->fft_dim_full.data.i / ((long double)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i); + temp_int1.data.i = ((pfINT)pffloor(sc->fft_dim_full.data.i / ((pfLD)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i); PfIf_eq_start(sc, &sc->tempInt, &temp_int1); - temp_int.data.i = (int64_t)ceil(((sc->fft_dim_full.data.i - (sc->firstStageStartSize.data.i / sc->fftDim.data.i) * ((((int64_t)floor(sc->fft_dim_full.data.i / ((long double)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)) * batching_localSize.data.i * sc->fftDim.data.i)) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)) / (long double)used_registers.data.i); + temp_int.data.i = (pfINT)pfceil(((sc->fft_dim_full.data.i - (sc->firstStageStartSize.data.i / sc->fftDim.data.i) * ((((pfINT)pffloor(sc->fft_dim_full.data.i / ((pfLD)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)) * batching_localSize.data.i * sc->fftDim.data.i)) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)) / (pfLD)used_registers.data.i); PfMov(sc, &sc->sdataID, &temp_int); PfIf_else(sc); temp_int.data.i = sc->localSize[0].data.i * sc->localSize[1].data.i; @@ -250,11 +250,11 @@ static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* s PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->blockInvocationID); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - for (int64_t j = 0; j < sc->matrixConvolution; j++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + for (pfINT j = 0; j < sc->matrixConvolution; j++) { PfSetToZero(sc, &sc->temp_conv[j]); } - if (localSize.data.i * ((1 + (int64_t)i)) > sc->fftDim.data.i) { + if (localSize.data.i * ((1 + (pfINT)i)) > sc->fftDim.data.i) { temp_int.data.i = sc->fftDim.data.i - i * localSize.data.i; PfIf_lt_start(sc, localInvocationID, &temp_int); } @@ -347,9 +347,9 @@ static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* s } break; } - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - for (uint64_t l = 0; l < sc->matrixConvolution; l++) { - uint64_t k = 0; + for (pfUINT j = 0; j < sc->matrixConvolution; j++) { + for (pfUINT l = 0; l < sc->matrixConvolution; l++) { + pfUINT k = 0; if (sc->symmetricKernel) { k = (l < j) ? 
(l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); } @@ -389,7 +389,7 @@ static inline void appendKernelConvolution(VkFFTSpecializationConstantsLayout* s else { PfMov(sc, &sc->regIDs[i], &sc->temp_conv[0]); } - for (uint64_t l = 1; l < sc->matrixConvolution; l++) { + for (pfUINT l = 1; l < sc->matrixConvolution; l++) { if (sc->crossPowerSpectrumNormalization) { PfNorm(sc, &sc->tempFloat, &sc->temp_conv[l]); PfRsqrt(sc, &sc->tempFloat, &sc->tempFloat); diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2C.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2C.h index c656a735..cfb635f3 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2C.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2C.h @@ -32,7 +32,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -80,8 +80,8 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (i < (uint64_t)((fftDim.data.i / 2 + 1) / localSize.data.i)) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (i < (pfUINT)((fftDim.data.i / 2 + 1) / localSize.data.i)) { temp_int.data.i = i * localSize.data.i; if (sc->stridedSharedLayout) { PfAdd(sc, &sc->sdataID, localInvocationID, &temp_int); @@ -95,7 +95,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty } appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); if (sc->mergeSequencesR2C) { - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); if (sc->stridedSharedLayout) temp_int.data.i *= sc->sharedStride.data.i; @@ -103,22 +103,22 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - PfShuffleComplex(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, 0); + PfShuffleComplex(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, &sc->w); } } else { - if (i >= (uint64_t)ceil((fftDim.data.i / 2 + 1) / (long double)localSize.data.i)) { - if ((1 + (int64_t)i) * localSize.data.i > fftDim.data.i) { + if (i >= (pfUINT)pfceil((fftDim.data.i / 2 + 1) / (pfLD)localSize.data.i)) { + if ((1 + (pfINT)i) * localSize.data.i > fftDim.data.i) { temp_int.data.i = fftDim.data.i - (i)*localSize.data.i; PfIf_lt_start(sc, localInvocationID, &temp_int); } - if ((((int64_t)ceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) > ((int64_t)i - ((int64_t)ceil((fftDim.data.i / 2 + 1) / (long double)localSize.data.i))) * localSize.data.i) && ((int64_t)ceil(fftDim.data.i / 2.0) - 1 > (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1)))) { + if ((((pfINT)pfceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) > ((pfINT)i - ((pfINT)pfceil((fftDim.data.i / 2 + 1) / (pfLD)localSize.data.i))) * localSize.data.i) && ((pfINT)pfceil(fftDim.data.i / 2.0) - 1 > (localSize.data.i - 
((fftDim.data.i / 2) % localSize.data.i + 1)))) { if (sc->zeropadBluestein[0]) { - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((int64_t)ceil((fftDim.data.i / 2 + 1) / (long double)localSize.data.i))) * localSize.data.i; + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((pfINT)pfceil((fftDim.data.i / 2 + 1) / (pfLD)localSize.data.i))) * localSize.data.i; PfIf_gt_start(sc, &temp_int, localInvocationID); } - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((int64_t)ceil((fftDim.data.i / 2 + 1) / (long double)localSize.data.i))) * localSize.data.i; + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((pfINT)pfceil((fftDim.data.i / 2 + 1) / (pfLD)localSize.data.i))) * localSize.data.i; if (sc->stridedSharedLayout) { PfSub(sc, &sc->sdataID, &temp_int, localInvocationID); PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); @@ -131,7 +131,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty } appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); if (sc->mergeSequencesR2C) { - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); if (sc->stridedSharedLayout) temp_int.data.i *= sc->sharedStride.data.i; @@ -139,7 +139,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - PfShuffleComplexInv(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, 0); + PfShuffleComplexInv(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, &sc->w); } PfConjugate(sc, &sc->regIDs[i], &sc->regIDs[i]); @@ -154,7 +154,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty else { PfSetToZero(sc, &sc->regIDs[i]); } - if ((1 + (int64_t)i) * localSize.data.i > fftDim.data.i) { + if ((1 + (pfINT)i) * localSize.data.i > fftDim.data.i) { PfIf_end(sc); } } @@ -179,7 +179,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty } appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); if (sc->mergeSequencesR2C) { - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); if (sc->stridedSharedLayout) temp_int.data.i *= sc->sharedStride.data.i; @@ -187,11 +187,11 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - PfShuffleComplex(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, 0); + PfShuffleComplex(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, &sc->w); } PfIf_else(sc); - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((int64_t)ceil((fftDim.data.i / 2 + 1) / (long double)localSize.data.i))) * localSize.data.i; + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) - 1 - (localSize.data.i - ((fftDim.data.i / 2) % localSize.data.i + 1))) - (i - ((pfINT)pfceil((fftDim.data.i / 2 + 1) / (pfLD)localSize.data.i))) * 
localSize.data.i; if (sc->stridedSharedLayout) { @@ -206,7 +206,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty } appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); if (sc->mergeSequencesR2C) { - temp_int.data.i = ((int64_t)ceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); + temp_int.data.i = ((pfINT)pfceil(fftDim.data.i / 2.0) + (1 - fftDim.data.i % 2)); if (sc->stridedSharedLayout) temp_int.data.i *= sc->sharedStride.data.i; @@ -214,7 +214,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - PfShuffleComplexInv(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, 0); + PfShuffleComplexInv(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->temp, &sc->w); } PfConjugate(sc, &sc->regIDs[i], &sc->regIDs[i]); @@ -263,9 +263,9 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if ((int64_t)(1 + i + k * used_registers.data.i) * localSize.data.i > fftDim.data.i) { + for (pfUINT k = 0; k < sc->registerBoost; k++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if ((pfINT)(1 + i + k * used_registers.data.i) * localSize.data.i > fftDim.data.i) { temp_int.data.i = fftDim.data.i - (i + k * used_registers.data.i) * localSize.data.i; PfIf_lt_start(sc, localInvocationID, &temp_int); } @@ -284,7 +284,7 @@ static inline void appendC2R_read(VkFFTSpecializationConstantsLayout* sc, int ty PfAdd(sc, &sc->sdataID, &sc->sdataID, localInvocationID); } appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i + k * used_registers.data.i]); - if ((int64_t)(1 + i + k * used_registers.data.i) * localSize.data.i > fftDim.data.i) { + if ((pfINT)(1 + i + k * used_registers.data.i) * localSize.data.i > fftDim.data.i) { sc->tempLen = sprintf(sc->tempStr, " }\n"); PfAppendLine(sc); } @@ -304,7 +304,7 @@ static inline void appendR2C_write(VkFFTSpecializationConstantsLayout* sc, int t PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -368,7 +368,7 @@ static inline void appendR2C_write(VkFFTSpecializationConstantsLayout* sc, int t PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } //we actually construct 2x used_registers here, if mult = 2 - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { if (sc->localSize[1].data.i == 1) { //&sc->tempIntLen = sprintf(&sc->tempIntStr, " combinedID = %s + %" PRIu64 ";\n", &sc->gl_LocalInvocationID_x, (i + k * used_registers) * &sc->localSize[0]); @@ -435,15 +435,15 @@ static inline void appendR2C_write(VkFFTSpecializationConstantsLayout* sc, int t temp_int.data.i = 0; PfIf_eq_start(sc, &sc->tempInt, &temp_int); - PfAdd_x(sc, &sc->regIDs[i], &sc->temp, &sc->w); - PfSub_y(sc, &sc->regIDs[i], &sc->temp, &sc->w); + PfAdd(sc, &sc->regIDs[i].data.c[0], &sc->temp.data.c[0], &sc->w.data.c[0]); + PfSub(sc, &sc->regIDs[i].data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); PfIf_else(sc); - PfAdd_y(sc, &sc->temp, &sc->temp, &sc->w); - PfSub_x(sc, &sc->temp, &sc->w, &sc->temp); - PfMov_x_y(sc, &sc->regIDs[i], &sc->temp); - PfMov_y_x(sc, 
&sc->regIDs[i], &sc->temp); + PfAdd(sc, &sc->temp.data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); + PfSub(sc, &sc->temp.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[0]); + PfMov(sc, &sc->regIDs[i].data.c[0], &sc->temp.data.c[1]); + PfMov(sc, &sc->regIDs[i].data.c[1], &sc->temp.data.c[0]); PfIf_end(sc); - temp_double.data.d = 0.5l; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &temp_double, 0); temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2R.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2R.h index 0fe5ad3a..51b3b159 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2R.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/PrePostProcessing/vkFFT_R2R.h @@ -32,7 +32,7 @@ static inline void appendDCTI_read(VkFFTSpecializationConstantsLayout* sc, int t PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -86,7 +86,7 @@ static inline void appendDCTI_read(VkFFTSpecializationConstantsLayout* sc, int t temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; @@ -180,18 +180,18 @@ static inline void appendDCTI_read(VkFFTSpecializationConstantsLayout* sc, int t return; } -static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { +static inline void appendDSTI_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; - + PfContainer fftDim = VKFFT_ZERO_INIT; fftDim.type = 31; @@ -219,15 +219,14 @@ static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout if (sc->zeropadBluestein[readWrite]) { if (readWrite) { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + fftDim.data.i = (sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i - 2) / 2; } else { - appendSetSMToZero(sc); - fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; + fftDim.data.i = (sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i - 2) / 2; } } else { - fftDim.data.i = sc->fftDim.data.i; + fftDim.data.i = (sc->fftDim.data.i - 2) / 2; } if (sc->stridedSharedLayout) { @@ -242,8 +241,8 @@ static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->axis_id > 0) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); @@ -270,7 +269,6 @@ static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout PfAdd(sc, &sc->combinedID, 
&sc->combinedID, &temp_int); PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { @@ -279,55 +277,59 @@ static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } } - if (sc->axis_id > 0){ - temp_int.data.i = 2; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - } - else { + if (!sc->stridedSharedLayout) { PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 2; - PfMod(sc, &sc->sdataID, &sc->tempInt, &temp_int); - } - PfMul(sc, &sc->blockInvocationID, &sc->sdataID, &temp_int, 0); - temp_int.data.i = 2; - if (sc->axis_id > 0) { - PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); - } - else { - PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); } - PfMul(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->tempInt, 0); - - temp_int.data.i = fftDim.data.i - 1; - PfMul(sc, &sc->sdataID, &sc->sdataID, &temp_int, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->blockInvocationID); + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); - if (sc->axis_id > 0) { - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + temp_int.data.i = (2 * fftDim.data.i + 1) * sc->sharedStride.data.i; + PfAdd(sc, &sc->inoutID, &sc->gl_LocalInvocationID_x, &temp_int); + PfSub(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); + + PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); + + if (i == 0) { + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->combinedID, &temp_int); + PfSetToZeroShared(sc, &sc->sdataID); + temp_int.data.i = (fftDim.data.i + 1) * sc->sharedStride.data.i; + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfSetToZeroShared(sc, &sc->tempInt); + PfIf_end(sc); + } + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride); } else { - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - else { - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + temp_int.data.i = (2 * fftDim.data.i + 1); + PfAdd(sc, &sc->inoutID, &sc->sdataID, &temp_int); + PfSub(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + + if (i == 0) { + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + PfSetToZeroShared(sc, &sc->sdataID); + temp_int.data.i = (fftDim.data.i + 1); + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfSetToZeroShared(sc, &sc->tempInt); + PfIf_end(sc); } + PfInc(sc, &sc->sdataID); } - if(readWrite) - appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); - else - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); - - if (sc->axis_id > 0) { + appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); + PfMovNeg(sc, &sc->temp, &sc->temp); + appendRegistersToShared(sc, &sc->inoutID, &sc->temp); + if (sc->stridedSharedLayout) { temp_int.data.i = (i + 1) * 
sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); PfIf_end(sc); } } @@ -342,20 +344,17 @@ static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout if (sc->useDisableThreads) { PfIf_end(sc); } - if (readWrite) - sc->writeFromRegisters = 1; - else - sc->readToRegisters = 0; return; } -static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { + +static inline void appendDCTII_read_III_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -363,9 +362,6 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout PfContainer fftDim = VKFFT_ZERO_INIT; fftDim.type = 31; - PfContainer fftDim_half = VKFFT_ZERO_INIT; - fftDim_half.type = 31; - PfContainer localSize = VKFFT_ZERO_INIT; localSize.type = 31; @@ -393,6 +389,7 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; } else { + appendSetSMToZero(sc); fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; } } @@ -400,15 +397,11 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout fftDim.data.i = sc->fftDim.data.i; } - temp_int.data.i = 2; - PfDiv(sc, &fftDim_half, &fftDim, &temp_int); - PfInc(sc, &fftDim_half); - if (sc->stridedSharedLayout) { - PfDivCeil(sc, &used_registers, &fftDim_half, &sc->localSize[1]); + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); } else { - PfDivCeil(sc, &used_registers, &fftDim_half, &sc->localSize[0]); + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); } appendBarrierVkFFT(sc); @@ -416,14 +409,14 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->stridedSharedLayout) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim_half.data.i; + temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { //check that we only read fftDim * local batch data //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); @@ -444,165 +437,92 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim_half.data.i * batching_localSize.data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { - //check that we 
only read fftDim_half * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } } - - if (sc->LUT) { - if (sc->stridedSharedLayout) { - PfAdd(sc, &sc->tempInt, &sc->combinedID, &sc->startDCT3LUT); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); - PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->startDCT3LUT); - } - appendGlobalToRegisters(sc, &sc->mult, &sc->LUTStruct, &sc->tempInt); - if ((!sc->mergeSequencesR2C) && (readWrite)) { - temp_double.data.d = 2.0l; - PfMul(sc, &sc->mult, &sc->mult, &temp_double, 0); - } - if (readWrite) - PfConjugate(sc, &sc->mult, &sc->mult); + if (sc->axis_id > 0){ + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); } else { - if (readWrite) - temp_double.data.d = -sc->double_PI / 2.0 / fftDim.data.i; - else - temp_double.data.d = sc->double_PI / 2.0 / fftDim.data.i; - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->tempFloat, &sc->combinedID, &temp_double, 0); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); - PfMul(sc, &sc->tempFloat, &sc->tempInt, &temp_double, 0); - } - - PfSinCos(sc, &sc->mult, &sc->tempFloat); - if ((!sc->mergeSequencesR2C) && (readWrite)) { - temp_double.data.d = 2.0l; - PfMul(sc, &sc->mult, &sc->mult, &temp_double, 0); - } + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->tempInt, &temp_int); } - - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - - temp_int.data.i = fftDim.data.i * sc->sharedStride.data.i; - PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - - temp_int.data.i = 0; + if (sc->performDST == 2) { + temp_int.data.i = 1; PfIf_eq_start(sc, &sc->sdataID, &temp_int); - PfMov(sc, &sc->inoutID, &sc->sdataID); + PfMovNeg(sc, &sc->regIDs[i], &sc->regIDs[i]); PfIf_end(sc); + } + temp_int.data.i = 2; + PfMul(sc, &sc->blockInvocationID, &sc->sdataID, &temp_int, 0); + temp_int.data.i = 2; + if (sc->axis_id > 0) { + PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); + } + else { + PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); + } + PfMul(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->tempInt, 0); + + temp_int.data.i = fftDim.data.i - 1; + PfMul(sc, &sc->sdataID, &sc->sdataID, &temp_int, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->blockInvocationID); + if (sc->axis_id > 0) { + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->gl_LocalInvocationID_x); } else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim_half); if (sc->stridedSharedLayout) { PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - - temp_int.data.i = fftDim.data.i * sc->sharedStride.data.i; - PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - - temp_int.data.i = 0; - PfIf_eq_start(sc, &sc->sdataID, &temp_int); - PfMov(sc, &sc->inoutID, &sc->sdataID); - PfIf_end(sc); - - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); } else { - 
temp_int.data.i = fftDim.data.i; - PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - - temp_int.data.i = 0; - PfIf_eq_start(sc, &sc->sdataID, &temp_int); - PfMov(sc, &sc->inoutID, &sc->sdataID); - PfIf_end(sc); - - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); } } + if(readWrite) + appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); + else + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); - if (readWrite) { - appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - if (sc->mergeSequencesR2C) { - appendSharedToRegisters(sc, &sc->w, &sc->inoutID); - - PfAdd_x(sc, &sc->regIDs[0], &sc->temp, &sc->w); - PfSub_y(sc, &sc->regIDs[0], &sc->temp, &sc->w); - PfSub_x(sc, &sc->regIDs[1], &sc->w, &sc->temp); - PfAdd_y(sc, &sc->regIDs[1], &sc->temp, &sc->w); - - PfMul(sc, &sc->temp, &sc->regIDs[0], &sc->mult, 0); - PfConjugate(sc, &sc->mult, &sc->mult); - PfMul(sc, &sc->w, &sc->regIDs[1], &sc->mult, 0); - PfMov_x(sc, &sc->regIDs[0], &sc->temp); - PfMov_y(sc, &sc->regIDs[0], &sc->w); - PfMov_x_Neg_y(sc, &sc->regIDs[1], &sc->temp); - PfMov_y_Neg_x(sc, &sc->regIDs[1], &sc->w); - - appendRegistersToShared(sc, &sc->inoutID, &sc->regIDs[1]); - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); + if (sc->performDST == 3) { + if (sc->axis_id > 0){ + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); } else { - PfMul(sc, &sc->regIDs[0], &sc->temp, &sc->mult, 0); - PfMov_x_Neg_y(sc, &sc->w, &sc->regIDs[0]); - - appendRegistersToShared(sc, &sc->inoutID, &sc->w); - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->tempInt, &temp_int); } - } - else { - appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - appendSharedToRegisters(sc, &sc->w, &sc->inoutID); - - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); - temp_int.data.i = 0; - PfIf_eq_start(sc, &sc->tempInt, &temp_int); - PfSetToZero(sc, &sc->w); + temp_int.data.i = 1; + PfIf_eq_start(sc, &sc->sdataID, &temp_int); + PfMovNeg(sc, &sc->regIDs[i], &sc->regIDs[i]); PfIf_end(sc); - - PfMov_x_y(sc, &sc->regIDs[0], &sc->w); - PfMov_y_x(sc, &sc->regIDs[0], &sc->w); - - PfSub_x(sc, &sc->regIDs[1], &sc->temp, &sc->regIDs[0]); - PfAdd_y(sc, &sc->regIDs[1], &sc->temp, &sc->regIDs[0]); - - PfAdd_x(sc, &sc->w, &sc->temp, &sc->regIDs[0]); - PfSub_y(sc, &sc->w, &sc->temp, &sc->regIDs[0]); - - PfMul(sc, &sc->regIDs[0], &sc->w, &sc->mult, 0); - PfConjugate(sc, &sc->mult, &sc->mult); - - PfMul(sc, &sc->temp, &sc->regIDs[1], &sc->mult, 0); - - appendRegistersToShared(sc, &sc->inoutID, &sc->temp); - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); } - if (sc->stridedSharedLayout) { + if (sc->axis_id > 0) { temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim_half.data.i; + temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { PfIf_end(sc); } } else { temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim_half.data.i * batching_localSize.data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { PfIf_end(sc); } @@ -611,25 +531,30 @@ static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout if 
(sc->useDisableThreads) { PfIf_end(sc); } + if (readWrite) + sc->writeFromRegisters = 1; + else + sc->readToRegisters = 0; return; } - -static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { +static inline void appendDCTII_write_III_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; PfContainer fftDim = VKFFT_ZERO_INIT; fftDim.type = 31; + PfContainer fftDim_half = VKFFT_ZERO_INIT; fftDim_half.type = 31; + PfContainer localSize = VKFFT_ZERO_INIT; localSize.type = 31; @@ -657,31 +582,27 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; } else { - if (sc->readToRegisters == 1) { - appendSetSMToZero(sc); - appendBarrierVkFFT(sc); - } fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; } } - else - fftDim.data.i = sc->fftDim.data.i; - - fftDim.data.i = 2 * fftDim.data.i; - - if (sc->stridedSharedLayout) { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); - } else { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + fftDim.data.i = sc->fftDim.data.i; } - if (sc->readToRegisters == 1) { + + if (sc->performDST == 3) { + appendBarrierVkFFT(sc); + if (sc->stridedSharedLayout) { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + } + else { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + } if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->axis_id > 0) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); @@ -712,69 +633,30 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } } - if (sc->axis_id > 0) { - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->combinedID, &temp_int1); + if (sc->stridedSharedLayout) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->combinedID, &temp_int, &sc->combinedID); + + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); } else { - if (sc->stridedSharedLayout) { - temp_int.data.i = fftDim.data.i; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); - - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->sdataID, &sc->sdataID, 
&sc->tempInt); - } - else { - temp_int.data.i = fftDim.data.i; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->sdataID); - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - } - if (sc->axis_id > 0) { - temp_int.data.i = 2; - PfMod(sc, &sc->tempInt, &sc->combinedID, &temp_int); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 2; - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - } - temp_int.data.i = 0; - PfIf_eq_start(sc, &sc->tempInt, &temp_int); - if (i < (uint64_t)used_registers.data.i / 2) { - appendRegistersToShared_x_x(sc, &sc->sdataID, &sc->regIDs[i]); - } - else { - appendRegistersToShared_x_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); - } -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))) - PfIf_else(sc); - if (i < (uint64_t)used_registers.data.i / 2) { - appendRegistersToShared_y_x(sc, &sc->sdataID, &sc->regIDs[i]); - } - else { - appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } -#endif - PfIf_end(sc); - if (sc->axis_id > 0) { + appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->stridedSharedLayout) { temp_int.data.i = (i + 1) * sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { @@ -792,14 +674,14 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, if (sc->useDisableThreads) { PfIf_end(sc); } -#if(((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))) appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->axis_id > 0) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); @@ -830,61 +712,25 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } - } - if (sc->axis_id > 0) { - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->combinedID, &temp_int1); + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); } else { - if (sc->stridedSharedLayout) { - temp_int.data.i = fftDim.data.i; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->sdataID, 
&temp_int1); - - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - else { - temp_int.data.i = fftDim.data.i; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - - temp_int1.data.i = 2; - PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - } - if (sc->axis_id > 0) { - temp_int.data.i = 2; - PfMod(sc, &sc->tempInt, &sc->combinedID, &temp_int); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 2; - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - } - temp_int.data.i = 1; - PfIf_eq_start(sc, &sc->tempInt, &temp_int); - if ((int64_t)i < used_registers.data.i / 2) { - appendRegistersToShared_y_x(sc, &sc->sdataID, &sc->regIDs[i]); - } - else { - appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } - PfIf_end(sc); - - if (sc->axis_id > 0) { + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + if (sc->stridedSharedLayout) { temp_int.data.i = (i + 1) * sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { @@ -902,29 +748,32 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, if (sc->useDisableThreads) { PfIf_end(sc); } -#endif - } - appendBarrierVkFFT(sc); - if (sc->useDisableThreads) { - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - fftDim.data.i = fftDim.data.i / 2; + + temp_int.data.i = 2; + PfDiv(sc, &fftDim_half, &fftDim, &temp_int); + PfInc(sc, &fftDim_half); if (sc->stridedSharedLayout) { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + PfDivCeil(sc, &used_registers, &fftDim_half, &sc->localSize[1]); } else { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + PfDivCeil(sc, &used_registers, &fftDim_half, &sc->localSize[0]); + } + + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; + temp_int1.data.i = fftDim_half.data.i; if (temp_int.data.i > temp_int1.data.i) { //check that we only read fftDim * local batch data //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); @@ -946,183 +795,164 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + temp_int1.data.i = fftDim_half.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data 
- //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + //check that we only read fftDim_half * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } } - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); - - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->combinedID, &temp_int); + if (sc->LUT) { + if (sc->stridedSharedLayout) { + PfAdd(sc, &sc->tempInt, &sc->combinedID, &sc->startDCT3LUT); + } + else { + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->startDCT3LUT); + } + appendGlobalToRegisters(sc, &sc->mult, &sc->LUTStruct, &sc->tempInt); + if ((!sc->mergeSequencesR2C) && (readWrite)) { + temp_double.data.d = pfFPinit("2.0"); + PfMul(sc, &sc->mult, &sc->mult, &temp_double, 0); + } + if (readWrite) + PfConjugate(sc, &sc->mult, &sc->mult); } else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + if (readWrite) + temp_double.data.d = -sc->double_PI / pfFPinit("2.0") / fftDim.data.i; + else + temp_double.data.d = sc->double_PI / pfFPinit("2.0") / fftDim.data.i; + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->tempFloat, &sc->combinedID, &temp_double, 0); + } + else { + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfMul(sc, &sc->tempFloat, &sc->tempInt, &temp_double, 0); + } - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); + PfSinCos(sc, &sc->mult, &sc->tempFloat); + if ((!sc->mergeSequencesR2C) && (readWrite)) { + temp_double.data.d = pfFPinit("2.0"); + PfMul(sc, &sc->mult, &sc->mult, &temp_double, 0); + } } if (sc->stridedSharedLayout) { - PfSub(sc, &sc->tempInt, &sc->sdataID, &sc->sharedStride); - appendSharedToRegisters_y_y(sc, &sc->w, &sc->tempInt); - } - else { - temp_int.data.i = 1; - PfSub(sc, &sc->tempInt, &sc->sdataID, &temp_int); - appendSharedToRegisters_y_y(sc, &sc->w, &sc->tempInt); - } + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - appendSharedToRegisters_x_x(sc, &sc->w, &sc->sdataID); - - PfMov_x_y(sc, &sc->regIDs[i], &sc->w); - PfMov_y_Neg_x(sc, &sc->regIDs[i], &sc->w); - PfAdd(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->w); - - PfIf_else(sc); + temp_int.data.i = fftDim.data.i * sc->sharedStride.data.i; + PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - appendSharedToRegisters_x_x(sc, &sc->regIDs[i], &sc->sdataID); - if (sc->stridedSharedLayout) { - temp_int.data.i = (fftDim.data.i - 1) * sc->sharedStride.data.i; - PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); - } - else { - temp_int.data.i = (fftDim.data.i - 1); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); - } - appendSharedToRegisters_y_y(sc, &sc->regIDs[i], &sc->sdataID); - temp_double.data.d = 2.0l; - PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &temp_double, 0); - - PfIf_end(sc); - if (sc->stridedSharedLayout) { - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 
"){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_end(sc); - } + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->sdataID, &temp_int); + PfMov(sc, &sc->inoutID, &sc->sdataID); + PfIf_end(sc); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->gl_LocalInvocationID_x); } else { - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); - } - } - } - if (sc->useDisableThreads) { - PfIf_end(sc); - } - appendBarrierVkFFT(sc); - if (sc->useDisableThreads) { - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->disableThreads, &temp_int); - } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->stridedSharedLayout) { - temp_int.data.i = (i)*sc->localSize[1].data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim_half); + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + temp_int.data.i = fftDim.data.i * sc->sharedStride.data.i; + PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); - } - } - else { - if (sc->localSize[1].data.i == 1) { - temp_int.data.i = (i)*sc->localSize[0].data.i; + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->sdataID, &temp_int); + PfMov(sc, &sc->inoutID, &sc->sdataID); + PfIf_end(sc); - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); } else { - PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + temp_int.data.i = fftDim.data.i; + PfSub(sc, &sc->inoutID, &temp_int, &sc->sdataID); - temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->sdataID, &temp_int); + PfMov(sc, &sc->inoutID, &sc->sdataID); + PfIf_end(sc); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); - } + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->tempInt); } } - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->combinedID, &temp_int); - } - else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfDiv(sc, &sc->tempInt, 
&sc->combinedID, &fftDim); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + if (readWrite) { + appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); + if (sc->mergeSequencesR2C) { + appendSharedToRegisters(sc, &sc->w, &sc->inoutID); - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - } + PfAdd(sc, &sc->regIDs[0].data.c[0], &sc->temp.data.c[0], &sc->w.data.c[0]); + PfSub(sc, &sc->regIDs[0].data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); + PfSub(sc, &sc->regIDs[1].data.c[0], &sc->w.data.c[0], &sc->temp.data.c[0]); + PfAdd(sc, &sc->regIDs[1].data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); - appendRegistersToShared_x_x(sc, &sc->sdataID, &sc->regIDs[i]); + PfMul(sc, &sc->temp, &sc->regIDs[0], &sc->mult, 0); + PfConjugate(sc, &sc->mult, &sc->mult); + PfMul(sc, &sc->w, &sc->regIDs[1], &sc->mult, 0); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->temp.data.c[0]); + PfMov(sc, &sc->regIDs[0].data.c[1], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->regIDs[1].data.c[0], &sc->temp.data.c[1]); + PfMovNeg(sc, &sc->regIDs[1].data.c[1], &sc->w.data.c[0]); -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around - if (sc->stridedSharedLayout) { - PfSub(sc, &sc->sdataID, &fftDim, &sc->combinedID); + appendRegistersToShared(sc, &sc->inoutID, &sc->regIDs[1]); + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); + } + else { + PfMul(sc, &sc->regIDs[0], &sc->temp, &sc->mult, 0); + PfMovNeg(sc, &sc->w.data.c[0], &sc->regIDs[0].data.c[1]); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + appendRegistersToShared(sc, &sc->inoutID, &sc->w); + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); + } } else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfSub(sc, &sc->sdataID, &fftDim, &sc->sdataID); + appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); + appendSharedToRegisters(sc, &sc->w, &sc->inoutID); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim_half); + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + PfSetToZero(sc, &sc->w); + PfIf_end(sc); - appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); -#endif - - PfIf_else(sc); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->regIDs[0].data.c[1], &sc->w.data.c[0]); - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + PfSub(sc, &sc->regIDs[1].data.c[0], &sc->temp.data.c[0], &sc->regIDs[0].data.c[0]); + PfAdd(sc, &sc->regIDs[1].data.c[1], &sc->temp.data.c[1], &sc->regIDs[0].data.c[1]); - PfIf_end(sc); + PfAdd(sc, &sc->w.data.c[0], &sc->temp.data.c[0], &sc->regIDs[0].data.c[0]); + PfSub(sc, &sc->w.data.c[1], &sc->temp.data.c[1], &sc->regIDs[0].data.c[1]); + + PfMul(sc, &sc->regIDs[0], &sc->w, &sc->mult, 0); + PfConjugate(sc, &sc->mult, &sc->mult); + PfMul(sc, &sc->temp, &sc->regIDs[1], &sc->mult, 0); + + appendRegistersToShared(sc, &sc->inoutID, &sc->temp); + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[0]); + } if (sc->stridedSharedLayout) { temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; + 
temp_int1.data.i = fftDim_half.data.i; if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); PfIf_end(sc); } } else { temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + temp_int1.data.i = fftDim_half.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { PfIf_end(sc); } @@ -1131,120 +961,185 @@ static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, if (sc->useDisableThreads) { PfIf_end(sc); } -#if(((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around - - appendBarrierVkFFT(sc); - if (sc->useDisableThreads) { - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->disableThreads, &temp_int); - } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + if (sc->performDST == 2) { + appendBarrierVkFFT(sc); if (sc->stridedSharedLayout) { - temp_int.data.i = (i)*sc->localSize[1].data.i; - - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); - } + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); } else { - if (sc->localSize[1].data.i == 1) { - temp_int.data.i = (i)*sc->localSize[0].data.i; + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + } + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { + temp_int.data.i = (i)*sc->localSize[1].data.i; - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } else { - PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; - temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); - } + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = 
sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } - } + if (sc->stridedSharedLayout) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->combinedID, &temp_int, &sc->combinedID); - if (sc->stridedSharedLayout) { - PfSub(sc, &sc->sdataID, &fftDim, &sc->combinedID); + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->sdataID); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->combinedID, &temp_int); - } - else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfSub(sc, &sc->sdataID, &fftDim, &sc->sdataID); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->stridedSharedLayout) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + appendBarrierVkFFT(sc); + + if (sc->useDisableThreads) { temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { + temp_int.data.i = (i)*sc->localSize[1].data.i; - appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); - - PfIf_end(sc); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - if (sc->stridedSharedLayout) { - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); - } - } - else { - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + 
//&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } - } - } - if (sc->useDisableThreads) { - PfIf_end(sc); - } -#endif + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; - appendDCTII_write_III_read(sc, type, 0); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - sc->readToRegisters = 0; + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + if (sc->stridedSharedLayout) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + } return; } -static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { + +static inline void appendDCTIV_even_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; - PfContainer temp_int2 = VKFFT_ZERO_INIT; - temp_int2.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; PfContainer fftDim = VKFFT_ZERO_INIT; fftDim.type = 31; - + PfContainer fftDim_half = VKFFT_ZERO_INIT; + fftDim_half.type = 31; PfContainer localSize = VKFFT_ZERO_INIT; localSize.type = 31; @@ -1268,11 +1163,21 @@ static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc } if (sc->zeropadBluestein[readWrite]) { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + if (readWrite) { + fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + } + else { + if (sc->readToRegisters == 1) { + appendSetSMToZero(sc); + appendBarrierVkFFT(sc); + } + fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; + } } - else { + else fftDim.data.i = sc->fftDim.data.i; - } + + fftDim.data.i = 2 * 
fftDim.data.i; if (sc->stridedSharedLayout) { PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); @@ -1280,147 +1185,264 @@ static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc else { PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); } + if (sc->readToRegisters == 1) { + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; - appendDCTII_read_III_write(sc, type, 1); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - appendBarrierVkFFT(sc); - if (sc->useDisableThreads) { - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->disableThreads, &temp_int); - } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->axis_id > 0) { - temp_int.data.i = (i)*sc->localSize[1].data.i; + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } - } - else { - if (sc->localSize[1].data.i == 1) { - temp_int.data.i = (i)*sc->localSize[0].data.i; + if (sc->axis_id > 0) { + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, &sc->combinedID, &temp_int1); - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); } else { - PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + if (sc->stridedSharedLayout) { + temp_int.data.i = fftDim.data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); - } + 
PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + temp_int.data.i = fftDim.data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); - } - } - if (sc->axis_id > 0) { - temp_int.data.i = 2; - PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 2; - PfMod(sc, &sc->sdataID, &sc->tempInt, &temp_int); - } - PfMul(sc, &sc->blockInvocationID, &sc->sdataID, &temp_int, 0); - temp_double.data.d = 1; - PfSub(sc, &sc->tempFloat, &temp_double, &sc->blockInvocationID); - PfMul_y(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->tempFloat, 0); + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); - if (sc->LUT) { + PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + } if (sc->axis_id > 0) { - PfAdd(sc, &sc->tempInt, &sc->combinedID, &sc->startDCT4LUT); + temp_int.data.i = 2; + PfMod(sc, &sc->tempInt, &sc->combinedID, &temp_int); } else { - PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->startDCT4LUT); + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 2; + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); } - appendGlobalToRegisters(sc, &sc->mult, &sc->LUTStruct, &sc->tempInt); - } - else { - temp_int.data.i = 2; - if (sc->axis_id > 0) { - PfMul(sc, &sc->tempInt, &sc->combinedID, &temp_int, 0); + if (sc->performDST) { + if (i < (pfUINT)used_registers.data.i / 2) { + temp_int.data.i = 1; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + PfMovNeg(sc, &sc->regIDs[i].data.c[0], &sc->regIDs[i].data.c[0]); + PfIf_end(sc); + } + else { + temp_int.data.i = 1; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + PfMovNeg(sc, &sc->regIDs[i - used_registers.data.i / 2].data.c[1], &sc->regIDs[i - used_registers.data.i / 2].data.c[1]); + PfIf_end(sc); + } + } + temp_int.data.i = 0; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + if (i < (pfUINT)used_registers.data.i / 2) { + appendRegistersToShared_x_x(sc, &sc->sdataID, &sc->regIDs[i]); } else { - PfMul(sc, &sc->tempInt, &sc->tempInt, &temp_int, 0); + appendRegistersToShared_x_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); } - PfInc(sc, &sc->tempInt); - if (readWrite) - temp_double.data.d = -sc->double_PI / 8.0 / fftDim.data.i; - else - temp_double.data.d = sc->double_PI / 8.0 / fftDim.data.i; - PfMul(sc, &sc->tempFloat, &sc->tempInt, &temp_double, 0); - - PfSinCos(sc, &sc->mult, &sc->tempFloat); - } - - PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->mult, &sc->temp); - PfConjugate(sc, &sc->regIDs[i], &sc->regIDs[i]); - - if (sc->axis_id > 0) { - PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); - - temp_int.data.i = (fftDim.data.i - 1) * sc->sharedStride.data.i; - PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &temp_int); - PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - if 
(sc->stridedSharedLayout) { - temp_int.data.i = (fftDim.data.i-1); - PfSub(sc, &sc->sdataID, &temp_int, &sc->tempInt); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); +#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))) + PfIf_else(sc); + if (i < (pfUINT)used_registers.data.i / 2) { + appendRegistersToShared_y_x(sc, &sc->sdataID, &sc->regIDs[i]); } else { - PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - - temp_int.data.i = (fftDim.data.i-1); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); - PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); } - } - appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); - if (sc->axis_id > 0) { - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); +#endif + PfIf_end(sc); + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } } } - else { - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); + if (sc->useDisableThreads) { + PfIf_end(sc); + } +#if(((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))) + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->axis_id > 0) { + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, 
&sc->combinedID, &temp_int1); + + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + if (sc->stridedSharedLayout) { + temp_int.data.i = fftDim.data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); + + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); + + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + temp_int.data.i = fftDim.data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); + + temp_int1.data.i = 2; + PfDiv(sc, &sc->sdataID, &sc->sdataID, &temp_int1); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + } + if (sc->axis_id > 0) { + temp_int.data.i = 2; + PfMod(sc, &sc->tempInt, &sc->combinedID, &temp_int); + } + else { + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 2; + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + } + temp_int.data.i = 1; + PfIf_eq_start(sc, &sc->tempInt, &temp_int); + if ((pfINT)i < used_registers.data.i / 2) { + appendRegistersToShared_y_x(sc, &sc->sdataID, &sc->regIDs[i]); + } + else { + appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i - used_registers.data.i / 2]); + } + PfIf_end(sc); + + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } } } - } - if (sc->useDisableThreads) { - PfIf_end(sc); + if (sc->useDisableThreads) { + PfIf_end(sc); + } +#endif } appendBarrierVkFFT(sc); if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->axis_id > 0) { + fftDim.data.i = fftDim.data.i / 2; + + if (sc->stridedSharedLayout) { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + } + else { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); @@ -1447,7 +1469,6 @@ static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; if (temp_int.data.i > temp_int1.data.i) { @@ -1457,31 +1478,62 @@ static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc } } - if (sc->axis_id > 0) { - PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); - PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); + temp_int.data.i = 0; + PfIf_gt_start(sc, 
&sc->combinedID, &temp_int); } else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->sdataID, &sc->tempInt, &sc->sharedStride, 0); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + } - PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } - else { - PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + if (sc->stridedSharedLayout) { + PfSub(sc, &sc->tempInt, &sc->sdataID, &sc->sharedStride); + appendSharedToRegisters_y_y(sc, &sc->w, &sc->tempInt); + } + else { + temp_int.data.i = 1; + PfSub(sc, &sc->tempInt, &sc->sdataID, &temp_int); + appendSharedToRegisters_y_y(sc, &sc->w, &sc->tempInt); + } - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } + appendSharedToRegisters_x_x(sc, &sc->w, &sc->sdataID); + + PfMov(sc, &sc->regIDs[i].data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->regIDs[i].data.c[1], &sc->w.data.c[0]); + PfAdd(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->w); + + PfIf_else(sc); + + appendSharedToRegisters_x_x(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->stridedSharedLayout) { + temp_int.data.i = (fftDim.data.i - 1) * sc->sharedStride.data.i; + PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); + } + else { + temp_int.data.i = (fftDim.data.i - 1); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); } appendSharedToRegisters_y_y(sc, &sc->regIDs[i], &sc->sdataID); - if (sc->axis_id > 0) { + temp_double.data.d = pfFPinit("2.0"); + PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &temp_double, 0); + + PfIf_end(sc); + if (sc->stridedSharedLayout) { temp_int.data.i = (i + 1) * sc->localSize[1].data.i; temp_int1.data.i = fftDim.data.i; if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); PfIf_end(sc); } } @@ -1496,71 +1548,121 @@ static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc if (sc->useDisableThreads) { PfIf_end(sc); } - return; -} - -static inline void appendDCTIV_odd_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { - if (sc->res != VKFFT_SUCCESS) return; - PfContainer temp_int = VKFFT_ZERO_INIT; - temp_int.type = 31; - PfContainer temp_int1 = VKFFT_ZERO_INIT; - temp_int1.type = 31; - PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { + temp_int.data.i = (i)*sc->localSize[1].data.i; - PfContainer used_registers = VKFFT_ZERO_INIT; - used_registers.type = 31; + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - PfContainer fftDim = VKFFT_ZERO_INIT; - fftDim.type = 31; + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, 
&sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; - PfContainer localSize = VKFFT_ZERO_INIT; - localSize.type = 31; + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - PfContainer batching_localSize = VKFFT_ZERO_INIT; - batching_localSize.type = 31; + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; - PfContainer* localInvocationID = VKFFT_ZERO_INIT; - PfContainer* batchingInvocationID = VKFFT_ZERO_INIT; + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } - if (sc->stridedSharedLayout) { - batching_localSize.data.i = sc->localSize[0].data.i; - localSize.data.i = sc->localSize[1].data.i; - localInvocationID = &sc->gl_LocalInvocationID_y; - batchingInvocationID = &sc->gl_LocalInvocationID_x; - } - else { - batching_localSize.data.i = sc->localSize[1].data.i; - localSize.data.i = sc->localSize[0].data.i; - localInvocationID = &sc->gl_LocalInvocationID_x; - batchingInvocationID = &sc->gl_LocalInvocationID_y; - } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); - if (sc->zeropadBluestein[readWrite]) { - if (readWrite) { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->combinedID, &temp_int); } else { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); } - } - else { - fftDim.data.i = sc->fftDim.data.i; - } - if (sc->stridedSharedLayout) { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + appendRegistersToShared_x_x(sc, &sc->sdataID, &sc->regIDs[i]); + +#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around + if (sc->stridedSharedLayout) { + PfSub(sc, &sc->sdataID, &fftDim, &sc->combinedID); + + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfSub(sc, &sc->sdataID, &fftDim, &sc->sdataID); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + + appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); +#endif + + PfIf_else(sc); + + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + + 
PfIf_end(sc); + + if (sc->stridedSharedLayout) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } } - else { - PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + if (sc->useDisableThreads) { + PfIf_end(sc); } +#if(((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around appendBarrierVkFFT(sc); if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { if (sc->stridedSharedLayout) { temp_int.data.i = (i)*sc->localSize[1].data.i; @@ -1597,80 +1699,31 @@ static inline void appendDCTIV_odd_read(VkFFTSpecializationConstantsLayout* sc, PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } } - if (sc->stridedSharedLayout) { - temp_int.data.i = 4; - PfMul(sc, &sc->inoutID, &sc->combinedID, &temp_int,0); - } - else { - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - temp_int.data.i = 4; - PfMul(sc, &sc->inoutID, &sc->tempInt, &temp_int,0); - } - temp_int.data.i = fftDim.data.i / 2; - PfAdd(sc, &sc->inoutID, &sc->inoutID, &temp_int); - - PfIf_lt_start(sc, &sc->inoutID, &fftDim); - PfMov(sc, &sc->sdataID, &sc->inoutID); - PfIf_end(sc); - - temp_int.data.i = fftDim.data.i * 2; - PfIf_lt_start(sc, &sc->inoutID, &temp_int); - PfIf_ge_start(sc, &sc->inoutID, &fftDim); - temp_int.data.i = fftDim.data.i * 2 - 1; - PfSub(sc, &sc->sdataID, &temp_int, &sc->inoutID); - PfIf_end(sc); - PfIf_end(sc); - - temp_int.data.i = fftDim.data.i * 3; - PfIf_lt_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 2; - PfIf_ge_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 2; - PfSub(sc, &sc->sdataID, &sc->inoutID, &temp_int); - PfIf_end(sc); - PfIf_end(sc); - - temp_int.data.i = fftDim.data.i * 4; - PfIf_lt_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 3; - PfIf_ge_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 4 - 1; - PfSub(sc, &sc->sdataID, &temp_int, &sc->inoutID); - PfIf_end(sc); - PfIf_end(sc); - temp_int.data.i = fftDim.data.i * 4; - PfIf_ge_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 4; - PfSub(sc, &sc->sdataID, &sc->inoutID, &temp_int); - PfIf_end(sc); - if (sc->stridedSharedLayout) { + PfSub(sc, &sc->sdataID, &fftDim, &sc->combinedID); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->combinedID, &temp_int); } else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfSub(sc, &sc->sdataID, &fftDim, &sc->sdataID); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 
0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); } - appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); - temp_int.data.i = fftDim.data.i * 2; - PfIf_lt_start(sc, &sc->inoutID, &temp_int); - PfIf_ge_start(sc, &sc->inoutID, &fftDim); - PfMov_x_Neg_x(sc, &sc->regIDs[i], &sc->regIDs[i]); - PfMov_y_Neg_y(sc, &sc->regIDs[i], &sc->regIDs[i]); - PfIf_end(sc); - PfIf_end(sc); + appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); - temp_int.data.i = fftDim.data.i * 3; - PfIf_lt_start(sc, &sc->inoutID, &temp_int); - temp_int.data.i = fftDim.data.i * 2; - PfIf_ge_start(sc, &sc->inoutID, &temp_int); - PfMov_x_Neg_x(sc, &sc->regIDs[i], &sc->regIDs[i]); - PfMov_y_Neg_y(sc, &sc->regIDs[i], &sc->regIDs[i]); - PfIf_end(sc); PfIf_end(sc); if (sc->stridedSharedLayout) { @@ -1691,101 +1744,24 @@ static inline void appendDCTIV_odd_read(VkFFTSpecializationConstantsLayout* sc, if (sc->useDisableThreads) { PfIf_end(sc); } - int64_t registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; - if ((sc->rader_generator[0] > 0) || ((sc->fftDim.data.i / registers_first_stage) != localSize.data.i)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 0; // can be switched to 1 if the indexing in previous step is aligned to 1 stage of fft (here it is combined) - - if (!sc->readToRegisters) { - - appendBarrierVkFFT(sc); - if (sc->useDisableThreads) { - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->disableThreads, &temp_int); - } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { - if (sc->stridedSharedLayout) { - temp_int.data.i = (i)*sc->localSize[1].data.i; - - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); - } - } - else { - if (sc->localSize[1].data.i == 1) { - temp_int.data.i = (i)*sc->localSize[0].data.i; - - PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); - } - else { - PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - - temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; - - PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); - PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); - } - - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - //check that we only read fftDim * local batch data - //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); - PfIf_lt_start(sc, &sc->combinedID, &temp_int1); - } - } - - if (sc->stridedSharedLayout) { - PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); - PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); - } - else { - PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); - PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); +#endif - PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); - PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - } + appendDCTII_write_III_read(sc, type, 0); - appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + sc->readToRegisters = 0; 
- if (sc->stridedSharedLayout) { - temp_int.data.i = (i + 1) * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); - } - } - else { - temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; - if (temp_int.data.i > temp_int1.data.i) { - PfIf_end(sc); - } - } - } - if (sc->useDisableThreads) { - PfIf_end(sc); - } - } return; } - -static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { +static inline void appendDCTIV_even_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { if (sc->res != VKFFT_SUCCESS) return; PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; + PfContainer temp_int2 = VKFFT_ZERO_INIT; + temp_int2.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer used_registers = VKFFT_ZERO_INIT; used_registers.type = 31; @@ -1816,12 +1792,7 @@ static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, } if (sc->zeropadBluestein[readWrite]) { - if (readWrite) { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; - } - else { - fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; - } + fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; } else { fftDim.data.i = sc->fftDim.data.i; @@ -1834,12 +1805,14 @@ static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); } + appendDCTII_read_III_write(sc, type, 1); + appendBarrierVkFFT(sc); if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t i = 0; i < (uint64_t)used_registers.data.i; i++) { + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { if (sc->axis_id > 0) { temp_int.data.i = (i)*sc->localSize[1].data.i; @@ -1877,300 +1850,907 @@ static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, } } if (sc->axis_id > 0) { - PfMov(sc, &sc->sdataID, &sc->combinedID); + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->combinedID, &temp_int); } else { - PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 2; + PfMod(sc, &sc->sdataID, &sc->tempInt, &temp_int); } + PfMul(sc, &sc->blockInvocationID, &sc->sdataID, &temp_int, 0); + temp_double.data.d = pfFPinit("1.0"); + if (sc->precision == 3) { + PfSetToZero(sc, &sc->tempFloat); + PfMov(sc, &sc->tempFloat.data.dd[0], &sc->blockInvocationID); + PfSub(sc, &sc->tempFloat, &temp_double, &sc->tempFloat); + } else { + PfSub(sc, &sc->tempFloat, &temp_double, &sc->blockInvocationID); + } + PfMul(sc, &sc->regIDs[i].data.c[1], &sc->regIDs[i].data.c[1], &sc->tempFloat, 0); - temp_int.data.i = fftDim.data.i / 4; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); - + if (sc->LUT) { + if (sc->axis_id > 0) { + PfAdd(sc, &sc->tempInt, &sc->combinedID, &sc->startDCT4LUT); + } + else { + PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->startDCT4LUT); + } + appendGlobalToRegisters(sc, &sc->mult, &sc->LUTStruct, &sc->tempInt); + } + else { temp_int.data.i = 2; - PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); - PfInc(sc, &sc->inoutID); - if (sc->mergeSequencesR2C) { - PfSub(sc, &sc->tempInt, &fftDim, &sc->inoutID); - PfIf_eq_start(sc, 
&sc->tempInt, &fftDim); - PfSetToZero(sc, &sc->tempInt); - PfIf_end(sc); + if (sc->axis_id > 0) { + PfMul(sc, &sc->tempInt, &sc->combinedID, &temp_int, 0); + } + else { + PfMul(sc, &sc->tempInt, &sc->tempInt, &temp_int, 0); } + PfInc(sc, &sc->tempInt); + if (readWrite) + temp_double.data.d = -sc->double_PI / pfFPinit("8.0") / fftDim.data.i; + else + temp_double.data.d = sc->double_PI / pfFPinit("8.0") / fftDim.data.i; + PfMul(sc, &sc->tempFloat, &sc->tempInt, &temp_double, 0); - PfIf_eq_start(sc, &sc->inoutID, &fftDim); - PfSetToZero(sc, &sc->inoutID); - PfIf_end(sc); + PfSinCos(sc, &sc->mult, &sc->tempFloat); + } - PfIf_end(sc); + PfMul(sc, &sc->regIDs[i], &sc->regIDs[i], &sc->mult, &sc->temp); + PfConjugate(sc, &sc->regIDs[i], &sc->regIDs[i]); - temp_int.data.i = fftDim.data.i / 2; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); - temp_int.data.i = fftDim.data.i / 4; - PfIf_ge_start(sc, &sc->sdataID, &temp_int); + if (sc->axis_id > 0) { + PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); - temp_int.data.i = 2; - PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); - if (sc->mergeSequencesR2C) { - temp_int.data.i = fftDim.data.i - 2 * (fftDim.data.i / 2); - PfAdd(sc, &sc->tempInt, &temp_int, &sc->inoutID); + temp_int.data.i = (fftDim.data.i - 1) * sc->sharedStride.data.i; + PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &temp_int); + PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + if (sc->stridedSharedLayout) { + temp_int.data.i = (fftDim.data.i-1); + PfSub(sc, &sc->sdataID, &temp_int, &sc->tempInt); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfIf_eq_start(sc, &sc->tempInt, &fftDim); - PfSetToZero(sc, &sc->tempInt); + temp_int.data.i = (fftDim.data.i-1); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); + PfSub(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + } + if (sc->performDST) + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + else + appendRegistersToShared_y_y(sc, &sc->sdataID, &sc->regIDs[i]); + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { PfIf_end(sc); } - temp_int.data.i = 2 * (fftDim.data.i / 2); - PfSub(sc, &sc->inoutID, &temp_int, &sc->inoutID); - - PfIf_eq_start(sc, &sc->inoutID, &fftDim); - PfSetToZero(sc, &sc->inoutID); - PfIf_end(sc); - - PfIf_end(sc); + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { PfIf_end(sc); + } + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; - temp_int.data.i = 3 * fftDim.data.i / 4; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); - temp_int.data.i = fftDim.data.i / 2; - PfIf_ge_start(sc, &sc->sdataID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); - temp_int.data.i = 2; - PfMul(sc, &sc->inoutID, &sc->sdataID, 
&temp_int, 0); - if (sc->mergeSequencesR2C) { - temp_int.data.i = fftDim.data.i + 2 * (fftDim.data.i / 2); - PfSub(sc, &sc->tempInt, &temp_int, &sc->inoutID); - PfIf_eq_start(sc, &sc->tempInt, &fftDim); - PfSetToZero(sc, &sc->tempInt); - PfIf_end(sc); + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } - temp_int.data.i = 2 * (fftDim.data.i / 2); - PfSub(sc, &sc->inoutID, &sc->inoutID, &temp_int); - - PfIf_eq_start(sc, &sc->inoutID, &fftDim); - PfSetToZero(sc, &sc->inoutID); - PfIf_end(sc); + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; - PfIf_end(sc); - PfIf_end(sc); + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); - temp_int.data.i = 3 * fftDim.data.i / 4; - PfIf_ge_start(sc, &sc->sdataID, &temp_int); + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; - temp_int.data.i = 2; - PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); - if (sc->mergeSequencesR2C) { - temp_int.data.i = fftDim.data.i - 1; - PfSub(sc, &sc->tempInt, &sc->inoutID, &temp_int); - PfIf_eq_start(sc, &sc->tempInt, &fftDim); - PfSetToZero(sc, &sc->tempInt); - PfIf_end(sc); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } - temp_int.data.i = 2 * fftDim.data.i - 1; - PfSub(sc, &sc->inoutID, &temp_int, &sc->inoutID); - PfIf_eq_start(sc, &sc->inoutID, &fftDim); - PfSetToZero(sc, &sc->inoutID); - PfIf_end(sc); - - PfIf_end(sc); + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } if (sc->axis_id > 0) { - PfMul(sc, &sc->inoutID, &sc->inoutID, &sc->sharedStride, 0); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->gl_LocalInvocationID_x); - if (sc->mergeSequencesR2C) { + if (sc->performDST) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->tempInt, &temp_int, &sc->combinedID); PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->gl_LocalInvocationID_x); } + else + PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); } else { - PfDiv(sc, &sc->blockInvocationID, &sc->combinedID, &fftDim); - + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + if (sc->performDST) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->tempInt, &temp_int, &sc->tempInt); + } if (sc->stridedSharedLayout) { - PfMul(sc, &sc->inoutID, &sc->inoutID, &sc->sharedStride, 0); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->blockInvocationID); - if (sc->mergeSequencesR2C) { - PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); - PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->blockInvocationID); - } + PfMul(sc, &sc->sdataID, &sc->tempInt, &sc->sharedStride, 0); + + PfDiv(sc, 
&sc->tempInt, &sc->combinedID, &fftDim); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } else { - PfMul(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->sharedStride, 0); - PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->blockInvocationID); - if (sc->mergeSequencesR2C) { - PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->blockInvocationID); - } - } - } - appendSharedToRegisters(sc, &sc->temp, &sc->inoutID); - if (sc->mergeSequencesR2C) { - appendSharedToRegisters(sc, &sc->w, &sc->tempInt); - } - - if (sc->mergeSequencesR2C) { - PfAdd_x(sc, &sc->regIDs[i], &sc->temp, &sc->w); - PfSub_y(sc, &sc->regIDs[i], &sc->temp, &sc->w); + PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); - PfAdd_y(sc, &sc->w, &sc->temp, &sc->w); - PfSub_x(sc, &sc->w, &sc->w, &sc->temp); - PfMov_x_y(sc, &sc->temp, &sc->w); - PfMov_y_x(sc, &sc->temp, &sc->w); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } } - - temp_int.data.i = fftDim.data.i / 4; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); + if (sc->performDST) { + appendSharedToRegisters_x_y(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->axis_id > 0) { + PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); - temp_int.data.i = 1; - PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfMov_x_Neg_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_Neg_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x_Neg_x(sc, &sc->w, &sc->temp); - } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfMov_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_x(sc, &sc->w, &sc->temp); + PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); } else { - PfMov_x(sc, &sc->w, &sc->temp); - } - PfIf_end(sc); + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->tempInt, &sc->sharedStride, 0); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfAdd_y(sc, &sc->w, &sc->w, &sc->temp); - } - else { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->temp); + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfSub_y(sc, &sc->w, &sc->w, &sc->temp); + appendSharedToRegisters_y_x(sc, &sc->regIDs[i], &sc->sdataID); + } + else + appendSharedToRegisters_y_y(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); } - else { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->temp); + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); } - PfIf_end(sc); - + } + } + if (sc->useDisableThreads) { 
PfIf_end(sc); + } + return; +} +static inline void appendDCTIV_odd_read(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { + if (sc->res != VKFFT_SUCCESS) return; + PfContainer temp_int = VKFFT_ZERO_INIT; + temp_int.type = 31; + PfContainer temp_int1 = VKFFT_ZERO_INIT; + temp_int1.type = 31; + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; - temp_int.data.i = fftDim.data.i / 2; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); - temp_int.data.i = fftDim.data.i / 4; - PfIf_ge_start(sc, &sc->sdataID, &temp_int); + PfContainer used_registers = VKFFT_ZERO_INIT; + used_registers.type = 31; - temp_int.data.i = 1; - PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfMov_x_Neg_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_Neg_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x_Neg_x(sc, &sc->w, &sc->temp); + PfContainer fftDim = VKFFT_ZERO_INIT; + fftDim.type = 31; + + PfContainer localSize = VKFFT_ZERO_INIT; + localSize.type = 31; + + PfContainer batching_localSize = VKFFT_ZERO_INIT; + batching_localSize.type = 31; + + PfContainer* localInvocationID = VKFFT_ZERO_INIT; + PfContainer* batchingInvocationID = VKFFT_ZERO_INIT; + + if (sc->stridedSharedLayout) { + batching_localSize.data.i = sc->localSize[0].data.i; + localSize.data.i = sc->localSize[1].data.i; + localInvocationID = &sc->gl_LocalInvocationID_y; + batchingInvocationID = &sc->gl_LocalInvocationID_x; + } + else { + batching_localSize.data.i = sc->localSize[1].data.i; + localSize.data.i = sc->localSize[0].data.i; + localInvocationID = &sc->gl_LocalInvocationID_x; + batchingInvocationID = &sc->gl_LocalInvocationID_y; + } + + if (sc->zeropadBluestein[readWrite]) { + if (readWrite) { + fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + } + else { + fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; + } + } + else { + fftDim.data.i = sc->fftDim.data.i; + } + + if (sc->stridedSharedLayout) { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + } + else { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + } + + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfMov_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_x(sc, &sc->w, &sc->temp); + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); } else { - PfMov_x(sc, &sc->w, &sc->temp); + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, 
&sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); } - PfIf_end(sc); + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->stridedSharedLayout) { + temp_int.data.i = 4; + PfMul(sc, &sc->inoutID, &sc->combinedID, &temp_int,0); + } + else { + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + temp_int.data.i = 4; + PfMul(sc, &sc->inoutID, &sc->tempInt, &temp_int,0); + } + temp_int.data.i = fftDim.data.i / 2; + PfAdd(sc, &sc->inoutID, &sc->inoutID, &temp_int); + + PfIf_lt_start(sc, &sc->inoutID, &fftDim); + PfMov(sc, &sc->sdataID, &sc->inoutID); + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i * 2; + PfIf_lt_start(sc, &sc->inoutID, &temp_int); + PfIf_ge_start(sc, &sc->inoutID, &fftDim); + temp_int.data.i = fftDim.data.i * 2 - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->inoutID); + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i * 3; + PfIf_lt_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 2; + PfIf_ge_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 2; + PfSub(sc, &sc->sdataID, &sc->inoutID, &temp_int); + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i * 4; + PfIf_lt_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 3; + PfIf_ge_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 4 - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->inoutID); + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i * 4; + PfIf_ge_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 4; + PfSub(sc, &sc->sdataID, &sc->inoutID, &temp_int); + PfIf_end(sc); + + if (sc->performDST) { temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + PfMod(sc, &sc->blockInvocationID, &sc->sdataID, &temp_int); + } + + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); + + if (sc->performDST) { + temp_int.data.i = 1; + PfIf_eq_start(sc, &sc->blockInvocationID, &temp_int); + PfMovNeg(sc, &sc->regIDs[i], &sc->regIDs[i]); + PfIf_end(sc); + } + + temp_int.data.i = fftDim.data.i * 2; + PfIf_lt_start(sc, &sc->inoutID, &temp_int); + PfIf_ge_start(sc, &sc->inoutID, &fftDim); + PfMovNeg(sc, &sc->regIDs[i].data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->regIDs[i].data.c[1], &sc->regIDs[i].data.c[1]); + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i * 3; + PfIf_lt_start(sc, &sc->inoutID, &temp_int); + temp_int.data.i = fftDim.data.i * 2; + PfIf_ge_start(sc, &sc->inoutID, &temp_int); + PfMovNeg(sc, &sc->regIDs[i].data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->regIDs[i].data.c[1], &sc->regIDs[i].data.c[1]); + PfIf_end(sc); + PfIf_end(sc); + + if (sc->stridedSharedLayout) { + temp_int.data.i = (i + 1) * 
sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + pfINT registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; + if ((sc->rader_generator[0] > 0) || ((sc->fftDim.data.i / registers_first_stage) != localSize.data.i)) + sc->readToRegisters = 0; + else + sc->readToRegisters = 0; // can be switched to 1 if the indexing in previous step is aligned to 1 stage of fft (here it is combined) + + if (!sc->readToRegisters) { + + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfSub_y(sc, &sc->w, &sc->w, &sc->temp); + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->stridedSharedLayout) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } else { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->temp); + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfAdd_y(sc, &sc->w, &sc->w, &sc->temp); + + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->tempInt, &sc->combinedID, &sc->sharedStride, 0); + PfAdd(sc, &sc->sdataID, &sc->gl_LocalInvocationID_x, &sc->tempInt); } else { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->temp); + PfDiv(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + PfMod(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } - PfIf_end(sc); + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + + if (sc->stridedSharedLayout) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + 
temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + } + return; +} + +static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, int type, int readWrite) { + if (sc->res != VKFFT_SUCCESS) return; + PfContainer temp_int = VKFFT_ZERO_INIT; + temp_int.type = 31; + PfContainer temp_int1 = VKFFT_ZERO_INIT; + temp_int1.type = 31; + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + + PfContainer used_registers = VKFFT_ZERO_INIT; + used_registers.type = 31; + + PfContainer fftDim = VKFFT_ZERO_INIT; + fftDim.type = 31; + + PfContainer localSize = VKFFT_ZERO_INIT; + localSize.type = 31; + + PfContainer batching_localSize = VKFFT_ZERO_INIT; + batching_localSize.type = 31; + + PfContainer* localInvocationID = VKFFT_ZERO_INIT; + PfContainer* batchingInvocationID = VKFFT_ZERO_INIT; + + if (sc->stridedSharedLayout) { + batching_localSize.data.i = sc->localSize[0].data.i; + localSize.data.i = sc->localSize[1].data.i; + localInvocationID = &sc->gl_LocalInvocationID_y; + batchingInvocationID = &sc->gl_LocalInvocationID_x; + } + else { + batching_localSize.data.i = sc->localSize[1].data.i; + localSize.data.i = sc->localSize[0].data.i; + localInvocationID = &sc->gl_LocalInvocationID_x; + batchingInvocationID = &sc->gl_LocalInvocationID_y; + } + + if (sc->zeropadBluestein[readWrite]) { + if (readWrite) { + fftDim.data.i = sc->fft_zeropad_Bluestein_left_write[sc->axis_id].data.i; + } + else { + fftDim.data.i = sc->fft_zeropad_Bluestein_left_read[sc->axis_id].data.i; + } + } + else { + fftDim.data.i = sc->fftDim.data.i; + } + + if (sc->stridedSharedLayout) { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[1]); + } + else { + PfDivCeil(sc, &used_registers, &fftDim, &sc->localSize[0]); + } + + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, 
&temp_int1); + } + } + if (sc->axis_id > 0) { + PfMov(sc, &sc->sdataID, &sc->combinedID); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + } + + temp_int.data.i = fftDim.data.i / 4; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 2; + PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); + PfInc(sc, &sc->inoutID); + if (sc->mergeSequencesR2C) { + PfSub(sc, &sc->tempInt, &fftDim, &sc->inoutID); + PfIf_eq_start(sc, &sc->tempInt, &fftDim); + PfSetToZero(sc, &sc->tempInt); + PfIf_end(sc); + } + + PfIf_eq_start(sc, &sc->inoutID, &fftDim); + PfSetToZero(sc, &sc->inoutID); + PfIf_end(sc); + + PfIf_end(sc); + + temp_int.data.i = fftDim.data.i / 2; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + temp_int.data.i = fftDim.data.i / 4; + PfIf_ge_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 2; + PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); + if (sc->mergeSequencesR2C) { + temp_int.data.i = fftDim.data.i - 2 * (fftDim.data.i / 2); + PfAdd(sc, &sc->tempInt, &temp_int, &sc->inoutID); + + PfIf_eq_start(sc, &sc->tempInt, &fftDim); + PfSetToZero(sc, &sc->tempInt); + PfIf_end(sc); + } + temp_int.data.i = 2 * (fftDim.data.i / 2); + PfSub(sc, &sc->inoutID, &temp_int, &sc->inoutID); + + PfIf_eq_start(sc, &sc->inoutID, &fftDim); + PfSetToZero(sc, &sc->inoutID); + PfIf_end(sc); + + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = 3 * fftDim.data.i / 4; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + temp_int.data.i = fftDim.data.i / 2; + PfIf_ge_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 2; + PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); + if (sc->mergeSequencesR2C) { + temp_int.data.i = fftDim.data.i + 2 * (fftDim.data.i / 2); + PfSub(sc, &sc->tempInt, &temp_int, &sc->inoutID); + PfIf_eq_start(sc, &sc->tempInt, &fftDim); + PfSetToZero(sc, &sc->tempInt); + PfIf_end(sc); + } + temp_int.data.i = 2 * (fftDim.data.i / 2); + PfSub(sc, &sc->inoutID, &sc->inoutID, &temp_int); + + PfIf_eq_start(sc, &sc->inoutID, &fftDim); + PfSetToZero(sc, &sc->inoutID); + PfIf_end(sc); + + PfIf_end(sc); + PfIf_end(sc); + + temp_int.data.i = 3 * fftDim.data.i / 4; + PfIf_ge_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 2; + PfMul(sc, &sc->inoutID, &sc->sdataID, &temp_int, 0); + if (sc->mergeSequencesR2C) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->tempInt, &sc->inoutID, &temp_int); + PfIf_eq_start(sc, &sc->tempInt, &fftDim); + PfSetToZero(sc, &sc->tempInt); + PfIf_end(sc); + } + temp_int.data.i = 2 * fftDim.data.i - 1; + PfSub(sc, &sc->inoutID, &temp_int, &sc->inoutID); + + PfIf_eq_start(sc, &sc->inoutID, &fftDim); + PfSetToZero(sc, &sc->inoutID); + PfIf_end(sc); + + PfIf_end(sc); + + if (sc->axis_id > 0) { + PfMul(sc, &sc->inoutID, &sc->inoutID, &sc->sharedStride, 0); + PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->gl_LocalInvocationID_x); + if (sc->mergeSequencesR2C) { + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->gl_LocalInvocationID_x); + } + } + else { + PfDiv(sc, &sc->blockInvocationID, &sc->combinedID, &fftDim); + + if (sc->stridedSharedLayout) { + PfMul(sc, &sc->inoutID, &sc->inoutID, &sc->sharedStride, 0); + PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->blockInvocationID); + if (sc->mergeSequencesR2C) { + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->blockInvocationID); + } + } + else { + PfMul(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->sharedStride, 0); + PfAdd(sc, &sc->inoutID, &sc->inoutID, 
&sc->blockInvocationID); + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->tempInt, &sc->tempInt, &sc->blockInvocationID); + } + } + } + appendSharedToRegisters(sc, &sc->temp, &sc->inoutID); + if (sc->mergeSequencesR2C) { + appendSharedToRegisters(sc, &sc->w, &sc->tempInt); + } + + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->regIDs[i].data.c[0], &sc->temp.data.c[0], &sc->w.data.c[0]); + PfSub(sc, &sc->regIDs[i].data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); + + PfAdd(sc, &sc->w.data.c[1], &sc->temp.data.c[1], &sc->w.data.c[1]); + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[0]); + PfMov(sc, &sc->temp.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->w.data.c[0]); + } + + temp_int.data.i = fftDim.data.i / 4; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 1; + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfMovNeg(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMovNeg(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfMov(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMov(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMov(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_end(sc); + + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfAdd(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfSub(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_end(sc); + + PfIf_end(sc); + + + temp_int.data.i = fftDim.data.i / 2; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + temp_int.data.i = fftDim.data.i / 4; + PfIf_ge_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 1; + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfMovNeg(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMovNeg(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfMov(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMov(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMov(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_end(sc); + + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfSub(sc, &sc->w.data.c[1], 
&sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfAdd(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_end(sc); + + PfIf_end(sc); + PfIf_end(sc); + + + temp_int.data.i = 3 * fftDim.data.i / 4; + PfIf_lt_start(sc, &sc->sdataID, &temp_int); + temp_int.data.i = fftDim.data.i / 2; + PfIf_ge_start(sc, &sc->sdataID, &temp_int); + + temp_int.data.i = 1; + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfMovNeg(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMovNeg(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfMov(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMov(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMov(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } PfIf_end(sc); - PfIf_end(sc); - - - temp_int.data.i = 3 * fftDim.data.i / 4; - PfIf_lt_start(sc, &sc->sdataID, &temp_int); - temp_int.data.i = fftDim.data.i / 2; - PfIf_ge_start(sc, &sc->sdataID, &temp_int); - - temp_int.data.i = 1; - PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfMov_x_Neg_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_Neg_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x_Neg_x(sc, &sc->w, &sc->temp); - } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfMov_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x(sc, &sc->w, &sc->temp); - } - PfIf_end(sc); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfAdd_y(sc, &sc->w, &sc->w, &sc->temp); - } - else { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->temp); - } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfSub_y(sc, &sc->w, &sc->w, &sc->temp); - } - else { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->temp); - } - PfIf_end(sc); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfAdd(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfSub(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_end(sc); 
PfIf_end(sc); PfIf_end(sc); @@ -2179,64 +2759,64 @@ static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, temp_int.data.i = 3 * fftDim.data.i / 4; PfIf_ge_start(sc, &sc->sdataID, &temp_int); - temp_int.data.i = 1; - PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfMov_x_Neg_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_Neg_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x_Neg_x(sc, &sc->w, &sc->temp); - } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfMov_x(sc, &sc->w, &sc->regIDs[i]); - PfMov_y_x(sc, &sc->w, &sc->temp); - } - else { - PfMov_x(sc, &sc->w, &sc->temp); - } - PfIf_end(sc); + temp_int.data.i = 1; + PfAdd(sc, &sc->tempInt, &sc->sdataID, &temp_int); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->tempInt, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfMovNeg(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMovNeg(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfMov(sc, &sc->w.data.c[0], &sc->regIDs[i].data.c[0]); + PfMov(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); + } + else { + PfMov(sc, &sc->w.data.c[0], &sc->temp.data.c[0]); + } + PfIf_end(sc); - temp_int.data.i = 2; - PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); - PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); - temp_int.data.i = 0; - PfIf_gt_start(sc, &sc->tempInt, &temp_int); - if (sc->mergeSequencesR2C) { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfSub_y(sc, &sc->w, &sc->w, &sc->temp); - } - else { - PfSub_x_y(sc, &sc->w, &sc->w, &sc->temp); - } - PfIf_else(sc); - if (sc->mergeSequencesR2C) { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->regIDs[i]); - PfAdd_y(sc, &sc->w, &sc->w, &sc->temp); - } - else { - PfAdd_x_y(sc, &sc->w, &sc->w, &sc->temp); - } - PfIf_end(sc); + temp_int.data.i = 2; + PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); + PfMod(sc, &sc->tempInt, &sc->tempInt, &temp_int); + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->tempInt, &temp_int); + if (sc->mergeSequencesR2C) { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfSub(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfSub(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_else(sc); + if (sc->mergeSequencesR2C) { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->regIDs[i].data.c[1]); + PfAdd(sc, &sc->w.data.c[1], &sc->w.data.c[1], &sc->temp.data.c[1]); + } + else { + PfAdd(sc, &sc->w.data.c[0], &sc->w.data.c[0], &sc->temp.data.c[1]); + } + PfIf_end(sc); PfIf_end(sc); - temp_double.data.d = sqrt(2); + temp_double.data.d = pfFPinit("1.41421356237309504880168872420969807856967"); if (sc->mergeSequencesR2C) { - temp_double.data.d *= 0.5; + temp_double.data.d *= pfFPinit("0.5"); } if (sc->mergeSequencesR2C) { PfMul(sc, &sc->regIDs[i], &sc->w, &temp_double, 0); } else { - PfMul_x(sc, &sc->regIDs[i], &sc->w, &temp_double, 0); + PfMul(sc, &sc->regIDs[i].data.c[0], &sc->w.data.c[0], &temp_double, 0); } if (sc->axis_id > 0) { @@ -2257,6 +2837,183 @@ static inline void appendDCTIV_odd_write(VkFFTSpecializationConstantsLayout* sc, if (sc->useDisableThreads) { PfIf_end(sc); } 
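/* Illustrative aside (not part of the patch): the DST branch added below reorders the DCT-IV odd-write result by reversing it along the FFT dimension through shared memory - a mirrored store of each register, a barrier, then an ordered load. A minimal C sketch of the same reordering, with hypothetical names reg/scratch/reverse_via_scratch: void reverse_via_scratch(double* reg, double* scratch, int fftDim) { for (int i = 0; i < fftDim; i++) scratch[fftDim - 1 - i] = reg[i]; /* mirrored store (appendRegistersToShared) */ /* barrier between the two passes (appendBarrierVkFFT in the generated kernel) */ for (int i = 0; i < fftDim; i++) reg[i] = scratch[i]; /* ordered load (appendSharedToRegisters) */ } */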
+ + if (sc->performDST) { + appendBarrierVkFFT(sc); + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->axis_id > 0) { + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->combinedID, &temp_int, &sc->combinedID); + + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + if (sc->stridedSharedLayout) { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->sdataID); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + temp_int.data.i = fftDim.data.i - 1; + PfSub(sc, &sc->sdataID, &temp_int, &sc->sdataID); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + } + appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[i]); + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + appendBarrierVkFFT(sc); + + if (sc->useDisableThreads) { + temp_int.data.i = 0; + PfIf_gt_start(sc, &sc->disableThreads, &temp_int); + } + for (pfUINT i = 0; i < (pfUINT)used_registers.data.i; i++) { + if (sc->axis_id > 0) { + temp_int.data.i = (i)*sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &temp_int); + + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = 
fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + else { + if (sc->localSize[1].data.i == 1) { + temp_int.data.i = (i)*sc->localSize[0].data.i; + + PfAdd(sc, &sc->combinedID, &sc->gl_LocalInvocationID_x, &temp_int); + } + else { + PfMul(sc, &sc->combinedID, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); + + temp_int.data.i = (i)*sc->localSize[0].data.i * sc->localSize[1].data.i; + + PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); + PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); + } + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + //check that we only read fftDim * local batch data + //&sc->tempIntLen = sprintf(&sc->tempIntStr, " if(combinedID < %" PRIu64 "){\n", &sc->fftDim_half * &sc->localSize[0]); + PfIf_lt_start(sc, &sc->combinedID, &temp_int1); + } + } + if (sc->axis_id > 0) { + PfMul(sc, &sc->sdataID, &sc->combinedID, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + } + else { + if (sc->stridedSharedLayout) { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + PfMul(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride, 0); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + else { + PfMod(sc, &sc->sdataID, &sc->combinedID, &fftDim); + + PfDiv(sc, &sc->tempInt, &sc->combinedID, &fftDim); + PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); + + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); + } + } + appendSharedToRegisters(sc, &sc->regIDs[i], &sc->sdataID); + if (sc->axis_id > 0) { + temp_int.data.i = (i + 1) * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + else { + temp_int.data.i = (i + 1) * sc->localSize[0].data.i * sc->localSize[1].data.i; + temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; + if (temp_int.data.i > temp_int1.data.i) { + PfIf_end(sc); + } + } + } + if (sc->useDisableThreads) { + PfIf_end(sc); + } + } sc->writeFromRegisters = 1; return; diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RaderKernels.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RaderKernels.h index 8591e249..9763f31c 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RaderKernels.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RaderKernels.h @@ -30,22 +30,24 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, PfContainer* stageSize, PfContainer* stageSizeSum, PfContainer* stageAngle, PfContainer* stageRadix, int stageID) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer temp_double; - temp_double.type = 32; - PfContainer temp_int; + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; - PfContainer temp_int1; + PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; - PfContainer stageNormalization; - stageNormalization.type = 32; - PfContainer normalizationValue; + PfContainer stageNormalization = VKFFT_ZERO_INIT; + stageNormalization.type = 22; + PfContainer normalizationValue = VKFFT_ZERO_INIT; 
normalizationValue.type = 31; normalizationValue.data.i = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle->data.d > 0))) && (stageSize->data.i == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle->data.d < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) + if (((sc->performDCT) || (sc->performDST)) && (sc->actualInverse)) { + if (sc->performDST == 1) + normalizationValue.data.i = (sc->sourceFFTSize.data.i + 1) * 2; + else if (sc->performDCT == 1) normalizationValue.data.i = (sc->sourceFFTSize.data.i - 1) * 2; else normalizationValue.data.i = sc->sourceFFTSize.data.i * 2; @@ -57,7 +59,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P normalizationValue.data.i *= sc->fft_dim_full.data.i; } if (normalizationValue.data.i != 1) { - stageNormalization.data.d = 1.0 / (long double)(normalizationValue.data.i); + stageNormalization.data.d = pfFPinit("1.0") / (pfLD)(normalizationValue.data.i); } sc->useCoalescedLUTUploadToSM = 0; @@ -90,16 +92,16 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P } if (stageSize->data.i > 1) { - PfContainer num_logical_subgroups; + PfContainer num_logical_subgroups = VKFFT_ZERO_INIT; num_logical_subgroups.type = 31; num_logical_subgroups.data.i = (sc->stridedSharedLayout) ? sc->localSize[1].data.i : sc->localSize[0].data.i; - PfContainer num_logical_groups; + PfContainer num_logical_groups = VKFFT_ZERO_INIT; num_logical_groups.type = 31; PfDivCeil(sc, &num_logical_groups, &sc->fftDim, &num_logical_subgroups); - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { - if (((1 + (int64_t)t) * num_logical_subgroups.data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { + if (((1 + (pfINT)t) * num_logical_subgroups.data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i - t * num_logical_subgroups.data.i; PfIf_lt_start(sc, localInvocationID, &current_group_cut); @@ -120,7 +122,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P } } else { - temp_double.data.d = stageAngle->data.d * 2.0 / (stageRadix->data.i); + temp_double.data.d = stageAngle->data.d * pfFPinit("2.0") / (stageRadix->data.i); PfMul(sc, &sc->angle, &sc->stageInvocationID, &temp_double, 0); temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i; PfDiv(sc, &sc->tempInt, &sc->sdataID, &temp_int); @@ -158,7 +160,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P appendRegistersToShared(sc, &sc->sdataID, &sc->temp); - if (((1 + (int64_t)t) * num_logical_subgroups.data.i) > sc->fftDim.data.i) { + if (((1 + (pfINT)t) * num_logical_subgroups.data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -182,11 +184,11 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P { int locStageRadix = sc->currentRaderContainer->stageRadix[0]; int logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ?
sc->registers_per_thread : sc->min_registers_per_thread; + //pfUINT logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; int locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - PfContainer subLogicalGroupSize; + //pfUINT locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; + //pfUINT logicalGroupSize = (pfUINT)pfceil(locFFTsCombined / (double)logicalStoragePerThread); + PfContainer subLogicalGroupSize = VKFFT_ZERO_INIT; subLogicalGroupSize.type = 31; temp_int.data.i = locFFTDim; temp_int1.data.i = logicalStoragePerThread; @@ -228,12 +230,12 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P { int locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; int logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; + //pfUINT logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; int locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); + //pfUINT locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; + //pfUINT logicalGroupSize = (pfUINT)pfceil(locFFTsCombined / (double)logicalStoragePerThread); - PfContainer subLogicalGroupSize; + PfContainer subLogicalGroupSize = VKFFT_ZERO_INIT; subLogicalGroupSize.type = 31; temp_int.data.i = locFFTDim; temp_int1.data.i = logicalStoragePerThread; @@ -293,25 +295,25 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } } - int64_t locStageSize = 1; - int64_t locStageSizeSum = 0; - long double locStageAngle = -sc->double_PI; - int64_t shift = 0; + pfINT locStageSize = 1; + pfINT locStageSizeSum = 0; + pfLD locStageAngle = -sc->double_PI; + pfINT shift = 0; for (int rader_stage = 0; rader_stage < sc->currentRaderContainer->numStages; rader_stage++) { - int64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; - int64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - int64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - int64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - int64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - - PfContainer subLogicalGroupSize; + pfINT locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; + pfINT logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; + pfINT logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; + pfINT locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs + pfINT locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; + //pfUINT logicalGroupSize = (pfUINT)pfceil(locFFTsCombined / (double)logicalStoragePerThread); + + PfContainer subLogicalGroupSize = VKFFT_ZERO_INIT; subLogicalGroupSize.type = 31; temp_int.data.i = locFFTDim; temp_int1.data.i = logicalStoragePerThread; PfDivCeil(sc, &subLogicalGroupSize, &temp_int, &temp_int1); - int64_t locFFTDimStride = locFFTDim; + pfINT locFFTDimStride = locFFTDim; if (shift <= sc->sharedShiftRaderFFT.data.i) locFFTDimStride = locFFTDim + shift; //local radix if ((rader_stage == 0) || (!raderTranspose)) { @@ -324,7 +326,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P PfMod(sc, &sc->raderIDx2, localInvocationID, &temp_int); } - for (uint64_t k = 0; k < sc->registerBoost; k++) { + for (pfUINT k = 0; k < sc->registerBoost; k++) { if ((rader_stage == 0) || (!raderTranspose)) { temp_int.data.i = sc->currentRaderContainer->containerFFTNum; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -332,9 +334,9 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P else { PfIf_lt_start(sc, &sc->raderIDx, &subLogicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread / locStageRadix; j++) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { temp_int.data.i = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize.data.i; PfIf_lt_start(sc, &sc->raderIDx, &temp_int); } @@ -400,7 +402,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P } } - int64_t id = j + i * logicalRegistersPerThread / locStageRadix; + pfINT id = j + i * logicalRegistersPerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; if (!sc->stridedSharedLayout) { if (sc->resolveBankConflictFirstStages == 1) { @@ 
-428,19 +430,19 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P PfContainer* regID = (PfContainer*)calloc(locStageRadix, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; - PfAllocateContainerFlexible(sc, &regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } inlineRadixKernelVkFFT(sc, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; @@ -455,7 +457,7 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P sc->res = VKFFT_ERROR_MALLOC_FAILED; return; } - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { PfIf_end(sc); } } @@ -482,12 +484,12 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P PfContainer* tempID; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; + for (pfUINT k = 0; k < sc->registerBoost; ++k) { + pfUINT t = 0; if ((rader_stage == 0) || (!raderTranspose)) { temp_int.data.i = sc->currentRaderContainer->containerFFTNum; @@ -528,9 +530,9 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P if (shift <= sc->sharedShiftRaderFFT.data.i) locFFTDimStride = locFFTDim + shift; } } - for (int64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + for (pfINT j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + temp_int.data.i = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize.data.i; PfIf_lt_start(sc, &sc->raderIDx, &temp_int); } @@
-555,14 +557,14 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); t++; - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { temp_int.data.i = i * locStageSize; PfAdd(sc, &sc->combinedID, &sc->inoutID, &temp_int); //last stage - mult rader kernel @@ -646,25 +648,25 @@ static inline void appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, P /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { PfIf_end(sc); } } } PfIf_end(sc); - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; } if (rader_stage != sc->currentRaderContainer->numStages - 1) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } } @@ -756,19 +758,19 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; locStageSize = 1; locStageAngle = sc->double_PI; locStageSizeSum = 0; - for (int64_t rader_stage = sc->currentRaderContainer->numStages - 1; rader_stage >= 0; rader_stage--) { - int64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; - int64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - int64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - int64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - int64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - PfContainer subLogicalGroupSize; + for (pfINT rader_stage = sc->currentRaderContainer->numStages - 1; rader_stage >= 0; rader_stage--) { + pfINT locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; + pfINT logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; + pfINT logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; + pfINT locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs + pfINT locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; + //pfUINT logicalGroupSize = (pfUINT)pfceil(locFFTsCombined / (double)logicalStoragePerThread); + PfContainer subLogicalGroupSize = VKFFT_ZERO_INIT; subLogicalGroupSize.type = 31; temp_int.data.i = locFFTDim; temp_int1.data.i = logicalStoragePerThread; PfDivCeil(sc, &subLogicalGroupSize, &temp_int, &temp_int1); - int64_t locFFTDimStride = locFFTDim; //different length due to all -1 cutoffs + pfINT locFFTDimStride = locFFTDim; //different length due to all -1 cutoffs if (shift <= sc->sharedShiftRaderFFT.data.i) locFFTDimStride = locFFTDim + shift; //local radix if (!raderTranspose) { @@ -780,7 +782,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; PfDiv(sc, &sc->raderIDx, localInvocationID, &temp_int); PfMod(sc, &sc->raderIDx2, localInvocationID, &temp_int); } - for (uint64_t k = 0; k < sc->registerBoost; k++) { + for (pfUINT k = 0; k < sc->registerBoost; k++) { if (!raderTranspose) { temp_int.data.i = sc->currentRaderContainer->containerFFTNum; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -788,9 +790,9 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; else { PfIf_lt_start(sc, &sc->raderIDx, &subLogicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread / locStageRadix; j++) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { temp_int.data.i = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize.data.i; 
PfIf_lt_start(sc, &sc->raderIDx, &temp_int); } @@ -810,9 +812,9 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; PfMul(sc, &sc->angle, &sc->stageInvocationID, &temp_double, 0); } - if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) { - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - uint64_t id = j + i * logicalRegistersPerThread / locStageRadix; + if (rader_stage != (pfINT)sc->currentRaderContainer->numStages - 1) { + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + pfUINT id = j + i * logicalRegistersPerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; if (!raderTranspose) { temp_int.data.i = j * subLogicalGroupSize.data.i + i * locFFTDim / locStageRadix + sc->fftDim.data.i / stageRadix->data.i; @@ -856,19 +858,19 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } PfContainer* regID = (PfContainer*)calloc(locStageRadix, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; - PfAllocateContainerFlexible(sc, &regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } inlineRadixKernelVkFFT(sc, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; @@ -883,7 +885,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; sc->res = VKFFT_ERROR_MALLOC_FAILED; return; } - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { PfIf_end(sc); } } @@ -915,12 +917,12 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; PfContainer* tempID; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; + for (pfUINT k = 0; k < sc->registerBoost; ++k) { + pfUINT t = 0; if (!raderTranspose) { temp_int.data.i = sc->currentRaderContainer->containerFFTNum; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -940,7 +942,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } if (!sc->stridedSharedLayout) { - if (rader_stage != 
(int64_t)sc->currentRaderContainer->numStages - 1) { + if (rader_stage != (pfINT)sc->currentRaderContainer->numStages - 1) { shift = (subLogicalGroupSize.data.i > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize.data.i - locFFTDim % (sc->numSharedBanks / 2) : 0; if (shift <= sc->sharedShiftRaderFFT.data.i) locFFTDimStride = locFFTDim + shift; } @@ -949,9 +951,9 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; if (shift <= sc->sharedShiftRaderFFT.data.i) locFFTDimStride = locFFTDim + shift; } } - for (int64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + for (pfINT j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { temp_int.data.i = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize.data.i; PfIf_lt_start(sc, &sc->raderIDx, &temp_int); } @@ -975,14 +977,14 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - for (uint64_t i = 0; i < (uint64_t)locStageRadix; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)locStageRadix; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id.data.i = (id.data.i / logicalRegistersPerThread) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread; PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); t++; - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { temp_int.data.i = i * locStageSize; PfAdd(sc, &sc->combinedID, &sc->inoutID, &temp_int); @@ -1087,7 +1089,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); if ((((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) && (rader_stage == 0)) { if (normalizationValue.data.i != 1) { - PfMul(sc, &sc->regIDs[id.data.i], &sc->regIDs[id.data.i], &stageNormalization, 0); + PfMul(sc, &sc->regIDs[id.data.i], &sc->regIDs[id.data.i], &stageNormalization, &sc->temp); } } appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[id.data.i]); @@ -1099,24 +1101,24 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } - if (subLogicalGroupSize.data.i * ((int64_t)(j + k * 
logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { + if (subLogicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { PfIf_end(sc); } } } PfIf_end(sc); - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } free(tempID); @@ -1125,7 +1127,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; else sc->res = VKFFT_ERROR_MALLOC_FAILED; - if (rader_stage < (int64_t)sc->currentRaderContainer->numStages - 1) { + if (rader_stage < (pfINT)sc->currentRaderContainer->numStages - 1) { switch (locStageRadix) { case 2: locStageSizeSum += locStageSize; @@ -1200,13 +1202,13 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - PfContainer subLogicalGroupSize; + pfUINT locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; + pfUINT logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; + //pfUINT logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; + pfUINT locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs + //pfUINT locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; + //pfUINT logicalGroupSize = (pfUINT)pfceil(locFFTsCombined / (double)logicalStoragePerThread); + PfContainer subLogicalGroupSize = VKFFT_ZERO_INIT; subLogicalGroupSize.type = 31; temp_int.data.i = locFFTDim; temp_int1.data.i = logicalStoragePerThread; @@ -1253,7 +1255,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) { if (normalizationValue.data.i != 1) { - PfMul(sc, &sc->x0[1], &sc->x0[1], &stageNormalization, 0); + PfMul(sc, &sc->x0[1], &sc->x0[1], &stageNormalization, &sc->temp); } } @@ -1276,24 +1278,27 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfContainer* stageSize, PfContainer* stageSizeSum, PfContainer* stageAngle, PfContainer* stageRadix, int stageID) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer temp_complex; - temp_complex.type = 33; - PfContainer temp_double; - temp_double.type = 32; - PfContainer temp_int; + PfContainer temp_complex = VKFFT_ZERO_INIT; + temp_complex.type = 23; + PfAllocateContainerFlexible(sc, &temp_complex, 50); + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; - PfContainer temp_int1; + PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; - PfContainer stageNormalization; - stageNormalization.type = 32; - PfContainer normalizationValue; + PfContainer stageNormalization = VKFFT_ZERO_INIT; + stageNormalization.type = 22; + PfContainer normalizationValue = VKFFT_ZERO_INIT; normalizationValue.type = 31; normalizationValue.data.i = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle->data.d > 0))) && (stageSize->data.i == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle->data.d < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) + if (((sc->performDCT) || (sc->performDST)) && (sc->actualInverse)) { + if (sc->performDST == 1) + normalizationValue.data.i = (sc->sourceFFTSize.data.i + 1) * 2; + else if (sc->performDCT == 1) normalizationValue.data.i = (sc->sourceFFTSize.data.i - 1) * 2; else normalizationValue.data.i = sc->sourceFFTSize.data.i * 2; @@ -1305,7 +1310,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, normalizationValue.data.i *= sc->fft_dim_full.data.i; } if (normalizationValue.data.i != 1) { - stageNormalization.data.d = 1.0 / (long double)(normalizationValue.data.i); + stageNormalization.data.d = pfFPinit("1.0") / (pfLD)(normalizationValue.data.i); } /*char convolutionInverse[10] = ""; if (sc->convolutionStep) { @@ -1323,14 +1328,14 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - int64_t num_logical_subgroups = (sc->stridedSharedLayout) ? sc->localSize[1].data.i / ((stageRadix->data.i + 1) / 2) : sc->localSize[0].data.i / ((stageRadix->data.i + 1) / 2); - PfContainer num_logical_groups; + pfINT num_logical_subgroups = (sc->stridedSharedLayout) ? 
sc->localSize[1].data.i / ((stageRadix->data.i + 1) / 2) : sc->localSize[0].data.i / ((stageRadix->data.i + 1) / 2); + PfContainer num_logical_groups = VKFFT_ZERO_INIT; num_logical_groups.type = 31; temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i; temp_int1.data.i = num_logical_subgroups; PfDivCeil(sc, &num_logical_groups, &temp_int, &temp_int1); - int64_t require_cutoff_check = ((sc->fftDim.data.i == (num_logical_subgroups * num_logical_groups.data.i * stageRadix->data.i))) ? 0 : 1; - int64_t require_cutoff_check2; + pfINT require_cutoff_check = ((sc->fftDim.data.i == (num_logical_subgroups * num_logical_groups.data.i * stageRadix->data.i))) ? 0 : 1; + pfINT require_cutoff_check2; PfContainer* localInvocationID = VKFFT_ZERO_INIT; @@ -1351,8 +1356,8 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfMod(sc, &sc->raderIDx, localInvocationID, &temp_int); PfDiv(sc, &sc->raderIDx2, localInvocationID, &temp_int); - for (int64_t k = 0; k < sc->registerBoost; k++) { - for (int64_t j = 0; j < 1; j++) { + for (pfINT k = 0; k < sc->registerBoost; k++) { + for (pfINT j = 0; j < 1; j++) { if (stageSize->data.i > 1) { if (require_cutoff_check2) { if (sc->stridedSharedLayout) { @@ -1364,7 +1369,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &temp_int); } } - for (int64_t t = 0; t < num_logical_groups.data.i; t++) { + for (pfINT t = 0; t < num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -1384,7 +1389,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, } else { PfMul(sc, &sc->tempInt, &sc->stageInvocationID, &sc->raderIDx, 0); - temp_double.data.d = stageAngle->data.d * 2.0 / stageRadix->data.d; + temp_double.data.d = stageAngle->data.d * pfFPinit("2.0") / stageRadix->data.d; PfMul(sc, &sc->angle, &sc->tempInt, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->angle); } @@ -1436,7 +1441,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, temp_int.data.i = (stageRadix->data.i + 1) / 2; PfAdd(sc, &sc->tempInt, &sc->raderIDx, &temp_int); PfMul(sc, &sc->tempInt, &sc->stageInvocationID, &sc->tempInt, 0); - temp_double.data.d = stageAngle->data.d * 2.0 / stageRadix->data.d; + temp_double.data.d = stageAngle->data.d * pfFPinit("2.0") / stageRadix->data.d; PfMul(sc, &sc->angle, &sc->tempInt, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->angle); } @@ -1503,7 +1508,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, } } //save x0 - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -1565,7 +1570,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAppendLine(sc); */ } - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, 
&temp_int); @@ -1617,7 +1622,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, */ } - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -1662,13 +1667,13 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, //load deconv kernel if (!sc->inline_rader_kernel) { - for (uint64_t t = 0; t < (uint64_t)ceil((stageRadix->data.i - 1) / ((long double)(sc->localSize[0].data.i * sc->localSize[1].data.i))); t++) { + for (pfUINT t = 0; t < (pfUINT)pfceil((stageRadix->data.i - 1) / ((pfLD)(sc->localSize[0].data.i * sc->localSize[1].data.i))); t++) { PfMul(sc, &sc->combinedID, &sc->gl_LocalInvocationID_y, &sc->localSize[0], 0); PfAdd(sc, &sc->combinedID, &sc->combinedID, &sc->gl_LocalInvocationID_x); temp_int.data.i = t * sc->localSize[0].data.i * sc->localSize[1].data.i; PfAdd(sc, &sc->combinedID, &sc->combinedID, &temp_int); - if (t == ((uint64_t)ceil((stageRadix->data.i - 1) / ((double)(sc->localSize[0].data.i * sc->localSize[1].data.i))) - 1)) { + if (t == ((pfUINT)pfceil((stageRadix->data.i - 1) / ((double)(sc->localSize[0].data.i * sc->localSize[1].data.i))) - 1)) { temp_int.data.i = stageRadix->data.i - 1; PfIf_lt_start(sc, &sc->combinedID, &temp_int); } @@ -1708,7 +1713,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAppendLine(sc); */ } - temp_double.data.d = 2.0 * sc->double_PI / stageRadix->data.i; + temp_double.data.d = pfFPinit("2.0") * sc->double_PI / stageRadix->data.i; PfMul(sc, &sc->tempFloat, &temp_double, &sc->sdataID, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); if (!sc->inverse) { @@ -1718,7 +1723,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAdd(sc, &sc->tempInt, &sc->combinedID, &sc->RaderKernelOffsetShared[stageID]); appendRegistersToShared(sc, &sc->tempInt, &sc->w); } - if (t == ((uint64_t)ceil((stageRadix->data.i - 1) / ((long double)(sc->localSize[0].data.i * sc->localSize[1].data.i))) - 1)) { + if (t == ((pfUINT)pfceil((stageRadix->data.i - 1) / ((pfLD)(sc->localSize[0].data.i * sc->localSize[1].data.i))) - 1)) { PfIf_end(sc); } } @@ -1802,14 +1807,14 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, }*/ //subgroups /* { - uint64_t numGroupsQuant = ((((sc->localSize[0] * sc->localSize[1] * sc->localSize[2]) % sc->warpSize) == 0) || (sc->numSubgroups == 1)) ? sc->numSubgroups : sc->numSubgroups - 1; + pfUINT numGroupsQuant = ((((sc->localSize[0] * sc->localSize[1] * sc->localSize[2]) % sc->warpSize) == 0) || (sc->numSubgroups == 1)) ? 
sc->numSubgroups : sc->numSubgroups - 1; if (numGroupsQuant != sc->numSubgroups) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, numGroupsQuant); PfAppendLine(sc); } - for (uint64_t t = 0; t < (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant); t++) { + for (pfUINT t = 0; t < (pfUINT)pfceil(sc->localSize[1] / (double)numGroupsQuant); t++) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 0;\n", sc->regIDs[0]); PfAppendLine(sc); @@ -1818,15 +1823,15 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, %s.y = 0;\n", sc->regIDs[0]); PfAppendLine(sc); - uint64_t quant = (sc->warpSize < (sc->localSize[0] * sc->localSize[1] * sc->localSize[2])) ? sc->warpSize : (sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); - for (uint64_t t2 = 0; t2 < (uint64_t)ceil(stageRadix / (double)quant); t2++) { - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { + pfUINT quant = (sc->warpSize < (sc->localSize[0] * sc->localSize[1] * sc->localSize[2])) ? sc->warpSize : (sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); + for (pfUINT t2 = 0; t2 < (pfUINT)pfceil(stageRadix / (double)quant); t2++) { + if ((t == (pfUINT)pfceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); PfAppendLine(sc); } - if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { + if (t2 == (pfUINT)pfceil(stageRadix / (double)quant) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupInvocationID, stageRadix % quant); PfAppendLine(sc); @@ -1849,13 +1854,13 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAddComplex(sc, sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { + if (t2 == (pfUINT)pfceil(stageRadix / (double)quant) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); PfAppendLine(sc); } - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { + if ((t == (pfUINT)pfceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); PfAppendLine(sc); @@ -1866,7 +1871,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfSubgroupAdd(sc, sc->regIDs[0], sc->regIDs[0], 1); - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { + if ((t == (pfUINT)pfceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); PfAppendLine(sc); @@ -1895,7 +1900,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, }\n"); PfAppendLine(sc); - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { + if ((t == (pfUINT)pfceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen 
= sprintf(sc->tempStr, "\ }\n"); PfAppendLine(sc); @@ -1913,7 +1918,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, temp_int.data.i = 0; PfIf_gt_start(sc, &sc->raderIDx, &temp_int); - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -1942,13 +1947,13 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAdd(sc, &sc->combinedID, &sc->sdataID, &temp_int); } - PfSub_x(sc, &sc->temp, &sc->regIDs[2 * t], &sc->regIDs[2 * t + 1]); + PfSub(sc, &sc->temp.data.c[0], &sc->regIDs[2 * t].data.c[0], &sc->regIDs[2 * t + 1].data.c[0]); - PfAdd_x(sc, &sc->regIDs[2 * t], &sc->regIDs[2 * t], &sc->regIDs[2 * t + 1]); + PfAdd(sc, &sc->regIDs[2 * t].data.c[0], &sc->regIDs[2 * t].data.c[0], &sc->regIDs[2 * t + 1].data.c[0]); - PfAdd_y(sc, &sc->temp, &sc->regIDs[2 * t], &sc->regIDs[2 * t + 1]); + PfAdd(sc, &sc->temp.data.c[1], &sc->regIDs[2 * t].data.c[1], &sc->regIDs[2 * t + 1].data.c[1]); - PfSub_y(sc, &sc->regIDs[2 * t], &sc->regIDs[2 * t], &sc->regIDs[2 * t + 1]); + PfSub(sc, &sc->regIDs[2 * t].data.c[1], &sc->regIDs[2 * t].data.c[1], &sc->regIDs[2 * t + 1].data.c[1]); appendRegistersToShared(sc, &sc->sdataID, &sc->regIDs[2 * t]); appendRegistersToShared(sc, &sc->combinedID, &sc->temp); @@ -1994,20 +1999,20 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, temp_int.data.i = (stageRadix->data.i + 1) / 2; PfIf_lt_start(sc, &sc->raderIDx, &temp_int); - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { PfSetToZero(sc, &sc->regIDs[2 * t + 1]); } temp_int.data.i = (stageRadix->data.i - 1) / 2; PfIf_eq_start(sc, &sc->raderIDx, &temp_int); - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = 1; + temp_complex.data.c[1].data.d = 0; PfMov(sc, &sc->w, &temp_complex); PfIf_end(sc); - for (uint64_t i = 0; i < (uint64_t)(stageRadix->data.i - 1) / 2; i++) { + for (pfUINT i = 0; i < (pfUINT)(stageRadix->data.i - 1) / 2; i++) { temp_int.data.i = (stageRadix->data.i - 1) / 2; PfIf_lt_start(sc, &sc->raderIDx, &temp_int); @@ -2028,7 +2033,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfIf_end(sc); - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { #if(VKFFT_BACKEND != 2) //AMD compiler fix if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; @@ -2078,7 +2083,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, } #endif #if(VKFFT_BACKEND == 2) //AMD compiler fix - if ((uint64_t)ceil((sc->localSize[0].data.i * sc->localSize[1].data.i) / ((long double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) > 2048) { + if ((pfUINT)pfceil((sc->localSize[0].data.i * sc->localSize[1].data.i) / ((pfLD)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) > 2048) { PfIf_end(sc); if (require_cutoff_check2) { @@ -2118,7 +2123,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, #endif } 
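/* The normalization multiplies above now pass a scratch container (&sc->temp) instead of 0.
   A plausible reason is the new double-double ("quad") mode introduced by this patch, where a
   value is an unevaluated hi+lo pair and a product needs an intermediate error term before it
   can be renormalized. A minimal sketch of such a product, assuming fma() is available; this is
   illustrative only and is not VkFFT's actual PfMul codegen. */
#include <math.h>

typedef struct { double hi, lo; } dd;    /* double-double: value = hi + lo */

static dd dd_mul(dd a, dd b) {
    dd r;
    double p = a.hi * b.hi;
    double e = fma(a.hi, b.hi, -p);      /* exact rounding error of hi*hi */
    e += a.hi * b.lo + a.lo * b.hi;      /* first-order cross terms */
    r.hi = p + e;                        /* renormalize (quick two-sum) */
    r.lo = e - (r.hi - p);
    return r;
}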
#if(VKFFT_BACKEND == 2) //AMD compiler fix - if ((uint64_t)ceil((sc->localSize[0].data.i * sc->localSize[1].data.i) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) <= 2048) { + if ((pfUINT)pfceil((sc->localSize[0].data.i * sc->localSize[1].data.i) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) <= 2048) { PfIf_end(sc); if (require_cutoff_check2) { @@ -2156,19 +2161,19 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, } #endif } - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); } - PfSub_x(sc, &sc->regIDs[2 * t], &sc->x0[t], &sc->regIDs[2 * t + 1]); + PfSub(sc, &sc->regIDs[2 * t].data.c[0], &sc->x0[t].data.c[0], &sc->regIDs[2 * t + 1].data.c[0]); - PfAdd_y(sc, &sc->regIDs[2 * t], &sc->x0[t], &sc->regIDs[2 * t + 1]); + PfAdd(sc, &sc->regIDs[2 * t].data.c[1], &sc->x0[t].data.c[1], &sc->regIDs[2 * t + 1].data.c[1]); - PfAdd_x(sc, &sc->regIDs[2 * t + 1], &sc->x0[t], &sc->regIDs[2 * t + 1]); + PfAdd(sc, &sc->regIDs[2 * t + 1].data.c[0], &sc->x0[t].data.c[0], &sc->regIDs[2 * t + 1].data.c[0]); - PfSub_y(sc, &sc->regIDs[2 * t + 1], &sc->x0[t], &sc->regIDs[2 * t + 1]); + PfSub(sc, &sc->regIDs[2 * t + 1].data.c[1], &sc->x0[t].data.c[1], &sc->regIDs[2 * t + 1].data.c[1]); if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { @@ -2241,7 +2246,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfIf_end(sc); - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -2274,7 +2279,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) { if (normalizationValue.data.i != 1) { - PfMul(sc, &sc->regIDs[2 * t], &sc->regIDs[2 * t], &stageNormalization, 0); + PfMul(sc, &sc->regIDs[2 * t], &sc->regIDs[2 * t], &stageNormalization, &sc->temp); } } appendRegistersToShared(sc, &sc->combinedID, &sc->regIDs[2 * t]); @@ -2308,7 +2313,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, PfAppendLine(sc); */ } - for (uint64_t t = 0; t < (uint64_t)num_logical_groups.data.i; t++) { + for (pfUINT t = 0; t < (pfUINT)num_logical_groups.data.i; t++) { if ((require_cutoff_check) && (t == num_logical_groups.data.i - 1)) { temp_int.data.i = sc->fftDim.data.i / stageRadix->data.i - t * num_logical_subgroups; PfIf_lt_start(sc, &sc->raderIDx2, &temp_int); @@ -2340,7 +2345,7 @@ static inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) { if (normalizationValue.data.i != 1) { - PfMul(sc, &sc->regIDs[2 * t+1], &sc->regIDs[2 * t+1], &stageNormalization, 0); + PfMul(sc, &sc->regIDs[2 * t+1], &sc->regIDs[2 * t+1], &stageNormalization, &sc->temp); } } @@ -2364,7 +2369,7 @@ static 
inline void appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, } } - + PfDeallocateContainer(sc, &temp_complex); return; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixKernels.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixKernels.h index 29c7c085..12c8eb02 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixKernels.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixKernels.h @@ -27,22 +27,23 @@ #include "vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryTransfers/vkFFT_Transfers.h" -static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, int64_t radix, int64_t stageSize, int64_t stageSizeSum, long double stageAngle, PfContainer* regID) { +static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, pfINT radix, pfINT stageSize, pfINT stageSizeSum, pfLD stageAngle, PfContainer* regID) { if (sc->res != VKFFT_SUCCESS) return; - PfContainer temp_complex; - temp_complex.type = 33; - PfContainer temp_double; - temp_double.type = 32; - PfContainer temp_int; + PfContainer temp_complex = VKFFT_ZERO_INIT; + temp_complex.type = 23; + PfAllocateContainerFlexible(sc, &temp_complex, 50); + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + PfContainer temp_int = VKFFT_ZERO_INIT; temp_int.type = 31; //sprintf(temp, "loc_0"); switch (radix) { case 2: { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } else { @@ -62,26 +63,26 @@ static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc } } PfMul(sc, &sc->temp, ®ID[1], &sc->w, 0); - + PfSub(sc, ®ID[1], ®ID[0], &sc->temp); - + PfAdd(sc, ®ID[0], ®ID[0], &sc->temp); break; } case 3: { - PfContainer tf[2]; - for (int64_t i = 0; i < 2; i++){ - tf[i].type = 32; + PfContainer tf[2] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 2; i++){ + tf[i].type = 22; } - tf[0].data.d = -0.5; - tf[1].data.d = -0.8660254037844386467637231707529; + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714"); if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } else { @@ -97,15 +98,15 @@ static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc } } else { - temp_double.data.d = 4.0 / 3.0; + temp_double.data.d = pfFPinit("4.0") / 3.0; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } PfMul(sc, &sc->locID[2], ®ID[2], &sc->w, 0); if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } else { @@ -116,7 +117,7 @@ static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc appendSharedToRegisters(sc, &sc->w, &sc->sdataID); } else { - temp_double.data.d = 4.0 / 3.0; + temp_double.data.d = pfFPinit("4.0") / 3.0; temp_int.data.i = stageSize; PfAdd(sc, &sc->inoutID, &sc->LUTId, &temp_int); appendGlobalToRegisters(sc, &sc->w, &sc->LUTStruct, &sc->inoutID); @@ -128,7 +129,7 @@ static inline void 
inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc } } else { - temp_double.data.d = 2.0 / 3.0; + temp_double.data.d = pfFPinit("2.0") / 3.0; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -149,15 +150,15 @@ static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[1], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[1], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } @@ -173,8 +174,8 @@ static inline void inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc //&sc->tempLen = sprintf(&sc->tempStr, " %s %s;\n", vecType, &sc->temp); // if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -220,8 +221,8 @@ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n\ //DIF 2nd stage with angle\n", ®ID[2], ®ID[2], ®ID[2], ®ID[2], ®ID[2], ®ID[0], ®ID[0], ®ID[0], ®ID[3], ®ID[3], ®ID[3], ®ID[3], ®ID[3], ®ID[1], ®ID[1], ®ID[1]);*/ if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -236,7 +237,7 @@ temp%s = temp%s + temp;\n\n\ temp_int.data.i = stageSize; PfAdd(sc, &sc->inoutID, &sc->LUTId, &temp_int); appendGlobalToRegisters(sc, &sc->w, &sc->LUTStruct, &sc->inoutID); - + } if (stageAngle < 0) { PfConjugate(sc, &sc->w, &sc->w); @@ -244,7 +245,7 @@ temp%s = temp%s + temp;\n\n\ } } else { - temp_double.data.d = 0.5; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); @@ -262,18 +263,18 @@ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[1], ®ID[1], ®ID[1], ®ID[1], ®ID[1], ®ID[0], ®ID[0], ®ID[0]);*/ if (stageAngle < 0) { - PfMov_x(sc, &sc->temp, &sc->w); + PfMov(sc, &sc->temp.data.c[0], &sc->w.data.c[0]); - PfMov_x_y(sc, &sc->w, &sc->w); - PfMov_y_Neg_x(sc, &sc->w, &sc->temp); + PfMov(sc, &sc->w.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x(sc, &sc->temp, &sc->w); + PfMov(sc, &sc->temp.data.c[0], &sc->w.data.c[0]); - PfMov_x_Neg_y(sc, &sc->w, &sc->w); - PfMov_y_x(sc, &sc->w, &sc->temp); + PfMovNeg(sc, &sc->w.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->w.data.c[1], &sc->temp.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(-w.y, w.x);\n\n", vecType); } @@ -286,7 +287,7 @@ temp%s = temp%s + temp;\n\n", ®ID[1], ®ID[1], ®ID[1], ®ID[1], ®ID //PfMov(sc, &sc->temp, ®ID[1]); // - uint64_t permute2[4] = { 0,2,1,3 }; + pfUINT permute2[4] = { 0,2,1,3 }; PfPermute(sc, permute2, 4, 1, regID, &sc->temp); @@ -313,17 +314,17 @@ temp%s = temp;\n\ else { &sc->tempLen = sprintf(&sc->tempStr, "void radix5(inout %s 
temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); }*/ - PfContainer tf[5]; - for (int64_t i = 0; i < 5; i++){ - tf[i].type = 32; + PfContainer tf[5] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 5; i++){ + tf[i].type = 22; } - tf[0].data.d = -0.5; - tf[1].data.d = 1.538841768587626701285145288018455; - tf[2].data.d = -0.363271264002680442947733378740309; - tf[3].data.d = -0.809016994374947424102293417182819; - tf[4].data.d = -0.587785252292473129168705954639073; + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("1.538841768587626701285145288018455"); + tf[2].data.d = pfFPinit("-0.363271264002680442947733378740309"); + tf[3].data.d = pfFPinit("-0.809016994374947424102293417182819"); + tf[4].data.d = pfFPinit("-0.587785252292473129168705954639073"); - /*for (uint64_t i = 0; i < 5; i++) { + /*for (pfUINT i = 0; i < 5; i++) { &sc->locID[i], (char*)malloc(sizeof(char) * 50); sprintf(&sc->locID[i], loc_%" PRIu64 "", i); &sc->tempLen = sprintf(&sc->tempStr, " %s %s;\n", vecType, &sc->locID[i]); @@ -332,10 +333,10 @@ temp%s = temp;\n\ /*&sc->tempLen = sprintf(&sc->tempStr, " {\n\ %s loc_0;\n %s loc_1;\n %s loc_2;\n %s loc_3;\n %s loc_4;\n", vecType, vecType, vecType, vecType, vecType);*/ - for (uint64_t i = radix - 1; i > 0; i--) { + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -356,7 +357,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -380,7 +381,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -419,13 +420,13 @@ loc_4 = temp%s + temp%s;\n", ®ID[1], ®ID[2], ®ID[3], ®ID[4], ®ID[ PfFMA(sc, &sc->locID[2], ®ID[2], &tf[0], ®ID[0]); - PfMul(sc, ®ID[3], ®ID[3], &tf[1], 0); + PfMul(sc, ®ID[3], ®ID[3], &tf[1], ®ID[0]); - PfMul(sc, ®ID[4], ®ID[4], &tf[2], 0); + PfMul(sc, ®ID[4], ®ID[4], &tf[2], ®ID[0]); - PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], 0); + PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], ®ID[0]); - PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], 0); + PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], ®ID[0]); /*&sc->tempLen = sprintf(&sc->tempStr, "\ loc_0 = temp%s + temp%s + temp%s;\n\ @@ -454,13 +455,13 @@ temp%s = loc_0;\n", ®ID[3], ®ID[4], ®ID[0]);*/ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[1], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplex(sc, ®ID[1], &sc->locID[1], &sc->locID[4], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[2], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplex(sc, ®ID[2], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[3], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplexInv(sc, ®ID[3], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[4], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplexInv(sc, ®ID[4], &sc->locID[1], &sc->locID[4], &sc->locID[0]); /*&sc->tempLen = sprintf(&sc->tempStr, "\ temp%s.x = loc_1.x - loc_4.y; \n\ @@ -473,13 +474,13 @@ temp%s.x = loc_1.x + loc_4.y; \n\ temp%s.y = loc_1.y - loc_4.x; \n", ®ID[1], ®ID[1], 
®ID[2], ®ID[2], ®ID[3], ®ID[3], ®ID[4], ®ID[4]);*/ } else { - PfShuffleComplexInv(sc, ®ID[1], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplexInv(sc, ®ID[1], &sc->locID[1], &sc->locID[4], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[2], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplexInv(sc, ®ID[2], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[3], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplex(sc, ®ID[3], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[4], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplex(sc, ®ID[4], &sc->locID[1], &sc->locID[4], &sc->locID[0]); /*&sc->tempLen = sprintf(&sc->tempStr, "\ temp%s.x = loc_1.x + loc_4.y; \n\ @@ -496,19 +497,19 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & break; } case 6: { - PfContainer tf[2]; - for (int64_t i = 0; i < 2; i++){ - tf[i].type = 32; + PfContainer tf[2] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 2; i++){ + tf[i].type = 22; } //PfAppendLine(sc, " {\n"); - tf[0].data.d = -0.5; - tf[1].data.d = -0.8660254037844386467637231707529; - for (uint64_t i = radix - 1; i > 0; i--) { + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714"); + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -529,7 +530,7 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -553,7 +554,7 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -566,9 +567,9 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & //PfMov(sc, ®ID[1], &sc->locID[1]); // - //uint64_t P = 3; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { + //pfUINT P = 3; + pfUINT Q = 2; + for (pfUINT i = 0; i < Q; i++) { PfMov(sc, &sc->locID[0], ®ID[i]); PfMov(sc, &sc->locID[1], ®ID[i + Q]); @@ -591,15 +592,15 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } } @@ -611,14 +612,14 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & PfAdd(sc, ®ID[0], ®ID[0], &sc->temp); if (stageAngle < 0) { - temp_complex.data.c[0] = -0.5; - temp_complex.data.c[1] = 0.8660254037844386467637231707529; + temp_complex.data.c[0].data.d = pfFPinit("-0.5"); + temp_complex.data.c[1].data.d = 
pfFPinit("0.8660254037844386467637231707529361834714"); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = -0.5; - temp_complex.data.c[1] = -0.8660254037844386467637231707529; + temp_complex.data.c[0].data.d = pfFPinit("-0.5"); + temp_complex.data.c[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714"); PfMov(sc, &sc->w, &temp_complex); } @@ -640,7 +641,7 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & PfAdd(sc, ®ID[4], ®ID[4], &sc->temp); - uint64_t permute2[6] = { 0,3,4,1,2,5 }; + pfUINT permute2[6] = { 0,3,4,1,2,5 }; PfPermute(sc, permute2, 6, 1, regID, &sc->temp); @@ -660,44 +661,40 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & break; } case 7: { - /*if (sc->LUT) { - &sc->tempLen = sprintf(&sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType); - } - else { - &sc->tempLen = sprintf(&sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); - }*/ - PfContainer tf[8]; - for (int64_t i = 0; i < 8; i++){ - tf[i].type = 32; - } - //PfAppendLine(sc, " {\n"); - tf[0].data.d = -1.16666666666666651863693004997913; - tf[1].data.d = 0.79015646852540022404554065360571; - tf[2].data.d = 0.05585426728964774240049351305970; - tf[3].data.d = 0.73430220123575240531721419756650; + PfContainer tf_x[6] = VKFFT_ZERO_INIT; + PfContainer tf_y[6] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 6; i++){ + tf_x[i].type = 22; + tf_y[i].type = 22; + } + + tf_x[0].data.d = pfFPinit("0.6234898018587335305250048840042398106322747308964021053655"); + tf_x[1].data.d = pfFPinit("-0.222520933956314404288902564496794759466355568764544955311"); + tf_x[2].data.d = pfFPinit("-0.900968867902419126236102319507445051165919162131857150053"); + tf_x[3].data.d = tf_x[0].data.d; + tf_x[4].data.d = tf_x[1].data.d; + tf_x[5].data.d = tf_x[2].data.d; if (stageAngle < 0) { - tf[4].data.d = 0.44095855184409837868031445395900; - tf[5].data.d = 0.34087293062393136944265847887436; - tf[6].data.d = -0.53396936033772524066165487965918; - tf[7].data.d = 0.87484229096165666561546458979137; + tf_y[0].data.d = pfFPinit("-0.7818314824680298087084445266740577502323345187086875289806"); + tf_y[1].data.d = pfFPinit("0.9749279121818236070181316829939312172327858006199974376480"); + tf_y[2].data.d = pfFPinit("0.4338837391175581204757683328483587546099907277874598764445"); + tf_y[3].data.d = -tf_y[0].data.d; + tf_y[4].data.d = -tf_y[1].data.d; + tf_y[5].data.d = -tf_y[2].data.d; } else { - tf[4].data.d = -0.44095855184409837868031445395900; - tf[5].data.d = -0.34087293062393136944265847887436; - tf[6].data.d = 0.53396936033772524066165487965918; - tf[7].data.d = -0.87484229096165666561546458979137; - } - /*for (uint64_t i = 0; i < 7; i++) { - &sc->locID[i], (char*)malloc(sizeof(char) * 50); - sprintf(&sc->locID[i], loc_%" PRIu64 "", i); - &sc->tempLen = sprintf(&sc->tempStr, " %s %s;\n", vecType, &sc->locID[i]); - - }*/ - for (uint64_t i = radix - 1; i > 0; i--) { + tf_y[0].data.d = pfFPinit("0.7818314824680298087084445266740577502323345187086875289806"); + tf_y[1].data.d = pfFPinit("-0.9749279121818236070181316829939312172327858006199974376480"); + tf_y[2].data.d = pfFPinit("-0.4338837391175581204757683328483587546099907277874598764445"); + tf_y[3].data.d = -tf_y[0].data.d; + tf_y[4].data.d = -tf_y[1].data.d; + 
tf_y[5].data.d = -tf_y[2].data.d; + } + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; - PfMov(sc, &sc->w, &temp_complex); + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); + PfMov(sc, &sc->w, &temp_complex); } else { @@ -717,7 +714,7 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -737,11 +734,11 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & } if (stageAngle < 0) { PfConjugate(sc, &sc->w, &sc->w); - + } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -749,168 +746,57 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & } PfMul(sc, &sc->locID[i], ®ID[i], &sc->w, 0); - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\ -loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, ®ID[i], ®ID[i], i, ®ID[i], ®ID[i]);*/ } PfMov(sc, &sc->locID[0], ®ID[0]); - PfAdd(sc, ®ID[0], &sc->locID[1], &sc->locID[6]); - - PfSub(sc, ®ID[1], &sc->locID[1], &sc->locID[6]); - - PfAdd(sc, ®ID[2], &sc->locID[2], &sc->locID[5]); - - PfSub(sc, ®ID[3], &sc->locID[2], &sc->locID[5]); - - PfAdd(sc, ®ID[4], &sc->locID[4], &sc->locID[3]); - - PfSub(sc, ®ID[5], &sc->locID[4], &sc->locID[3]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_0 = temp%s;\n\ -temp%s = loc_1 + loc_6;\n\ -temp%s = loc_1 - loc_6;\n\ -temp%s = loc_2 + loc_5;\n\ -temp%s = loc_2 - loc_5;\n\ -temp%s = loc_4 + loc_3;\n\ -temp%s = loc_4 - loc_3;\n", ®ID[0], ®ID[0], ®ID[1], ®ID[2], ®ID[3], ®ID[4], ®ID[5]);*/ - PfAdd(sc, &sc->locID[5], ®ID[1], ®ID[3]); - - PfAdd(sc, &sc->locID[5], &sc->locID[5], ®ID[5]); - - PfAdd(sc, &sc->locID[1], ®ID[0], ®ID[2]); - - PfAdd(sc, &sc->locID[1], &sc->locID[1], ®ID[4]); - - PfAdd(sc, &sc->locID[0], &sc->locID[0], &sc->locID[1]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_5 = temp%s + temp%s + temp%s;\n\ -loc_1 = temp%s + temp%s + temp%s;\n\ -loc_0 += loc_1;\n", ®ID[1], ®ID[3], ®ID[5], ®ID[0], ®ID[2], ®ID[4]);*/ - PfSub(sc, &sc->locID[2], ®ID[0], ®ID[4]); - - PfSub(sc, &sc->locID[3], ®ID[4], ®ID[2]); - - PfSub(sc, &sc->locID[4], ®ID[2], ®ID[0]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_2 = temp%s - temp%s;\n\ -loc_3 = temp%s - temp%s;\n\ -loc_4 = temp%s - temp%s;\n", ®ID[0], ®ID[4], ®ID[4], ®ID[2], ®ID[2], ®ID[0]);*/ - PfSub(sc, ®ID[0], ®ID[1], ®ID[5]); - - PfSub(sc, ®ID[2], ®ID[5], ®ID[3]); - - PfSub(sc, ®ID[4], ®ID[3], ®ID[1]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -temp%s = temp%s - temp%s;\n\ -temp%s = temp%s - temp%s;\n\ -temp%s = temp%s - temp%s;\n", ®ID[0], ®ID[1], ®ID[5], ®ID[2], ®ID[5], ®ID[3], ®ID[4], ®ID[3], ®ID[1]);*/ - - PfMul(sc, &sc->locID[1], &sc->locID[1], &tf[0], 0); - - PfMul(sc, &sc->locID[2], &sc->locID[2], &tf[1], 0); - - PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[2], 0); - - PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[3], 0); - - PfMul(sc, &sc->locID[5], &sc->locID[5], &tf[4], 0); - - PfMul(sc, ®ID[0], ®ID[0], &tf[5], 0); - - PfMul(sc, ®ID[2], ®ID[2], &tf[6], 0); - - PfMul(sc, ®ID[4], ®ID[4], &tf[7], 0); + pfUINT permute[7] = { 0, 1, 3, 2, 6, 4, 5 }; + PfPermute(sc, permute, 7, 0, 0, 
&sc->w); - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_1 *= -1.16666666666666651863693004997913;\n\ -loc_2 *= 0.79015646852540022404554065360571;\n\ -loc_3 *= 0.05585426728964774240049351305970;\n\ -loc_4 *= 0.73430220123575240531721419756650;\n\ -loc_5 *= 0.44095855184409837868031445395900;\n\ -temp%s *= 0.34087293062393136944265847887436;\n\ -temp%s *= -0.53396936033772524066165487965918;\n\ -temp%s *= 0.87484229096165666561546458979137;\n", ®ID[0], ®ID[2], ®ID[4]);*/ - - PfSub(sc, ®ID[5], ®ID[4], ®ID[2]); - - PfAddInv(sc, ®ID[6], ®ID[4], ®ID[0]); - - PfAdd(sc, ®ID[4], ®ID[0], ®ID[2]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -temp%s = temp%s - temp%s;\n\ -temp%s = - temp%s - temp%s;\n\ -temp%s = temp%s + temp%s;\n", ®ID[5], ®ID[4], ®ID[2], ®ID[6], ®ID[4], ®ID[0], ®ID[4], ®ID[0], ®ID[2]);*/ - PfAdd(sc, ®ID[0], &sc->locID[0], &sc->locID[1]); - - PfAdd(sc, ®ID[1], &sc->locID[2], &sc->locID[3]); - - PfSub(sc, ®ID[2], &sc->locID[4], &sc->locID[3]); - - PfAddInv(sc, ®ID[3], &sc->locID[2], &sc->locID[4]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -temp%s = loc_0 + loc_1;\n\ -temp%s = loc_2 + loc_3;\n\ -temp%s = loc_4 - loc_3;\n\ -temp%s = - loc_2 - loc_4;\n", ®ID[0], ®ID[1], ®ID[2], ®ID[3]);*/ - PfAdd(sc, &sc->locID[1], ®ID[0], ®ID[1]); - - PfAdd(sc, &sc->locID[2], ®ID[0], ®ID[2]); - - PfAdd(sc, &sc->locID[3], ®ID[0], ®ID[3]); - - PfAdd(sc, &sc->locID[4], ®ID[4], &sc->locID[5]); - - PfAdd(sc, &sc->locID[6], ®ID[6], &sc->locID[5]); - - PfAdd(sc, &sc->locID[5], &sc->locID[5], ®ID[5]); - - PfMov(sc, ®ID[0], &sc->locID[0]); - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -loc_1 = temp%s + temp%s;\n\ -loc_2 = temp%s + temp%s;\n\ -loc_3 = temp%s + temp%s;\n\ -loc_4 = temp%s + loc_5;\n\ -loc_6 = temp%s + loc_5;\n\ -loc_5 += temp%s;\n\ -temp%s = loc_0;\n", ®ID[0], ®ID[1], ®ID[0], ®ID[2], ®ID[0], ®ID[3], ®ID[4], ®ID[6], ®ID[5], ®ID[0]);*/ - PfShuffleComplexInv(sc, ®ID[1], &sc->locID[1], &sc->locID[4], 0); - - PfShuffleComplexInv(sc, ®ID[2], &sc->locID[3], &sc->locID[6], 0); - - PfShuffleComplex(sc, ®ID[3], &sc->locID[2], &sc->locID[5], 0); - - PfShuffleComplexInv(sc, ®ID[4], &sc->locID[2], &sc->locID[5], 0); - - PfShuffleComplex(sc, ®ID[5], &sc->locID[3], &sc->locID[6], 0); - - PfShuffleComplex(sc, ®ID[6], &sc->locID[1], &sc->locID[4], 0); - - - /*&sc->tempLen = sprintf(&sc->tempStr, "\ -temp%s.x = loc_1.x + loc_4.y; \n\ -temp%s.y = loc_1.y - loc_4.x; \n\ -temp%s.x = loc_3.x + loc_6.y; \n\ -temp%s.y = loc_3.y - loc_6.x; \n\ -temp%s.x = loc_2.x - loc_5.y; \n\ -temp%s.y = loc_2.y + loc_5.x; \n\ -temp%s.x = loc_2.x + loc_5.y; \n\ -temp%s.y = loc_2.y - loc_5.x; \n\ -temp%s.x = loc_3.x - loc_6.y; \n\ -temp%s.y = loc_3.y + loc_6.x; \n\ -temp%s.x = loc_1.x - loc_4.y; \n\ -temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], ®ID[3], ®ID[3], ®ID[4], ®ID[4], ®ID[5], ®ID[5], ®ID[6], ®ID[6]); - PfAppendLine(sc, " }\n");*/ - /*for (uint64_t i = 0; i < 7; i++) { - free(&sc->locID[i]); - }*/ + for (pfUINT i = 0; i < 3; i++) { + PfSub(sc, ®ID[i + 4].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 4].data.c[0]); + + PfAdd(sc, ®ID[i + 1].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 4].data.c[0]); + + PfAdd(sc, ®ID[i + 4].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 4].data.c[1]); + + PfSub(sc, ®ID[i + 1].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 4].data.c[1]); + + } + for (pfUINT i = 0; i < 3; i++) { + PfAdd(sc, ®ID[0].data.c[0], ®ID[0].data.c[0], ®ID[i + 1].data.c[0]); + + PfAdd(sc, ®ID[0].data.c[1], ®ID[0].data.c[1], ®ID[i + 4].data.c[1]); + + } + for 
(pfUINT i = 1; i < 4; i++) { + PfMov(sc, &sc->locID[i], &sc->locID[0]); + + + } + for (pfUINT i = 4; i < 7; i++) { + PfSetToZero(sc, &sc->locID[i]); + } + for (pfUINT i = 0; i < 3; i++) { + for (pfUINT j = 0; j < 3; j++) { + pfUINT id = ((6 - i) + j) % 6; + PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 4], ®ID[i + 1], &tf_x[id], &tf_y[id], ®ID[i + 4], &sc->w, &sc->locID[0]); + + } + } + for (pfUINT i = 1; i < 4; i++) { + PfSub(sc, ®ID[i].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i + 3].data.c[0]); + + PfAdd(sc, ®ID[i].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 3].data.c[1]); + + } + for (pfUINT i = 1; i < 4; i++) { + PfAdd(sc, ®ID[i + 3].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i + 3].data.c[0]); + + PfSub(sc, ®ID[i + 3].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 3].data.c[1]); + + } + pfUINT permute2[7] = { 0, 1, 5, 6, 3, 2, 4 }; + PfPermute(sc, permute2, 7, 1, regID, &sc->w); break; } case 8: { @@ -925,8 +811,8 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & &sc->tempLen = sprintf(&sc->tempStr, " %s %s;\n", vecType, iw); */ if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -949,7 +835,7 @@ temp%s.y = loc_1.y + loc_4.x; \n", ®ID[1], ®ID[1], ®ID[2], ®ID[2], & PfSinCos(sc, &sc->w, &sc->angle); } } - for (uint64_t i = 0; i < 4; i++) { + for (pfUINT i = 0; i < 4; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->w, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -963,8 +849,8 @@ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[i + 4], ®ID[i + 4], ®ID[i + 4], ®ID[i + 4], ®ID[i + 4], ®ID[i + 0], ®ID[i + 0], ®ID[i + 0]);*/ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -987,12 +873,12 @@ temp%s = temp%s + temp;\n\n", ®ID[i + 4], ®ID[i + 4], ®ID[i + 4], ®I } } else { - temp_double.data.d = 0.5; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 2; i++) { + for (pfUINT i = 0; i < 2; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->w, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -1007,20 +893,20 @@ temp%s = temp%s + temp;\n\n", ®ID[i + 2], ®ID[i + 2], ®ID[i + 2], ®I } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 4; i < 6; i++) { + for (pfUINT i = 4; i < 6; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -1034,8 +920,8 @@ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[i + 2], ®ID[i + 2], ®ID[i + 2], ®ID[i + 2], ®ID[i + 2], ®ID[i + 0], ®ID[i + 0], ®ID[i + 0]);*/ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + 
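/* The rewritten radix-7 path above follows the same scheme the radix-11/13 kernels already use:
   split the inputs into symmetric and antisymmetric halves, accumulate X[0], then combine the
   halves with the cos/sin tables (PfFMA3_const_w) and permute the outputs. A plain
   double-precision sketch of that computation, assuming a forward transform (stageAngle < 0);
   it is not the generated code itself. */
#include <math.h>

typedef struct { double re, im; } cpx;

static void dft7_forward(const cpx x[7], cpx X[7]) {
    const double PI = 3.141592653589793238462643383279502884;
    cpx u[3], v[3];
    for (int j = 0; j < 3; j++) {              /* u = x[j]+x[7-j], v = x[j]-x[7-j] */
        u[j].re = x[j + 1].re + x[6 - j].re;  u[j].im = x[j + 1].im + x[6 - j].im;
        v[j].re = x[j + 1].re - x[6 - j].re;  v[j].im = x[j + 1].im - x[6 - j].im;
    }
    X[0] = x[0];
    for (int j = 0; j < 3; j++) { X[0].re += u[j].re; X[0].im += u[j].im; }
    for (int k = 1; k <= 3; k++) {             /* X[k] and X[7-k] share the same table row */
        cpx a = x[0], b = x[0];
        for (int j = 0; j < 3; j++) {
            double c = cos(2.0 * PI * (j + 1) * k / 7.0);
            double s = -sin(2.0 * PI * (j + 1) * k / 7.0);   /* forward sign */
            a.re += u[j].re * c - v[j].im * s;  a.im += u[j].im * c + v[j].re * s;
            b.re += u[j].re * c + v[j].im * s;  b.im += u[j].im * c - v[j].re * s;
        }
        X[k] = a;  X[7 - k] = b;
    }
}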
temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -1058,7 +944,7 @@ temp%s = temp%s + temp;\n\n", ®ID[i + 2], ®ID[i + 2], ®ID[i + 2], ®I } } else { - temp_double.data.d = 0.25; + temp_double.data.d = pfFPinit("0.25"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1076,15 +962,15 @@ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[1], ®ID[1], ®ID[1], ®ID[1], ®ID[1], ®ID[0], ®ID[0], ®ID[0]);*/ if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } @@ -1100,14 +986,14 @@ temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[3], ®ID[3], ®ID[3], ®ID[3], ®ID[3], ®ID[2], ®ID[2], ®ID[2]);*/ if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } PfMul(sc, &sc->temp, ®ID[5], &sc->iw, 0); @@ -1122,14 +1008,14 @@ temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", ®ID[5], ®ID[5], ®ID[5], ®ID[5], ®ID[5], ®ID[4], ®ID[4], ®ID[4]);*/ if (stageAngle < 0) { - PfMov_x_y(sc, &sc->w, &sc->iw); - PfMov_y_Neg_x(sc, &sc->w, &sc->iw); + PfMov(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->w, &sc->iw); - PfMov_y_x(sc, &sc->w, &sc->iw); + PfMovNeg(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } @@ -1140,7 +1026,7 @@ temp%s = temp%s + temp;\n\n", ®ID[5], ®ID[5], ®ID[5], ®ID[5], ®ID PfAdd(sc, ®ID[6], ®ID[6], &sc->temp); - uint64_t permute2[8] = { 0,4,2,6,1,5,3,7 }; + pfUINT permute2[8] = { 0,4,2,6,1,5,3,7 }; PfPermute(sc, permute2, 8, 1, regID, &sc->temp); /* @@ -1174,18 +1060,18 @@ temp%s = temp;\n\ break; } case 9: { - PfContainer tf[2]; + PfContainer tf[2] = VKFFT_ZERO_INIT; //PfAppendLine(sc, " {\n"); - for (int64_t i = 0; i < 2; i++){ - tf[i].type = 32; + for (pfINT i = 0; i < 2; i++){ + tf[i].type = 22; } - tf[0].data.d = -0.5; - tf[1].data.d = -0.8660254037844386467637231707529; - for (uint64_t i = radix - 1; i > 0; i--) { + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714"); + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - 
temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -1206,7 +1092,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1230,7 +1116,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1244,9 +1130,9 @@ temp%s = temp;\n\ // //PfMov(sc, ®ID[2], &sc->locID[2]); // - uint64_t P = 3; - uint64_t Q = 3; - for (uint64_t i = 0; i < Q; i++) { + pfUINT P = 3; + pfUINT Q = 3; + for (pfUINT i = 0; i < Q; i++) { PfMov(sc, &sc->locID[0], ®ID[i]); PfMov(sc, &sc->locID[1], ®ID[i + Q]); @@ -1268,45 +1154,45 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } } - for (uint64_t i = 0; i < P; i++) { + for (pfUINT i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } PfMul(sc, &sc->locID[1], ®ID[Q * i + 1], &sc->w, &sc->temp); if (stageAngle < 0) { - temp_complex.data.c[0] = cos(4 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(4 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(4 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(4 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(4 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(4 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(4 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(4 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } @@ -1335,20 +1221,20 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[Q * i + 1], 
&sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } } - uint64_t permute2[9] = { 0,3,6,1,4,7,2,5,8 }; + pfUINT permute2[9] = { 0,3,6,1,4,7,2,5,8 }; PfPermute(sc, permute2, 9, 1, regID, &sc->temp); @@ -1368,21 +1254,21 @@ temp%s = temp;\n\ break; } case 10: { - PfContainer tf[5]; - for (int64_t i = 0; i < 5; i++){ - tf[i].type = 32; + PfContainer tf[5] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 5; i++){ + tf[i].type = 22; } //PfAppendLine(sc, " {\n"); - tf[0].data.d = -0.5; - tf[1].data.d = 1.538841768587626701285145288018455; - tf[2].data.d = -0.363271264002680442947733378740309; - tf[3].data.d = -0.809016994374947424102293417182819; - tf[4].data.d = -0.587785252292473129168705954639073; - for (uint64_t i = radix - 1; i > 0; i--) { + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("1.538841768587626701285145288018455"); + tf[2].data.d = pfFPinit("-0.363271264002680442947733378740309"); + tf[3].data.d = pfFPinit("-0.809016994374947424102293417182819"); + tf[4].data.d = pfFPinit("-0.587785252292473129168705954639073"); + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -1403,7 +1289,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1427,7 +1313,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1440,9 +1326,9 @@ temp%s = temp;\n\ //PfMov(sc, ®ID[1], &sc->locID[1]); // - uint64_t P = 5; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { + pfUINT P = 5; + pfUINT Q = 2; + for (pfUINT i = 0; i < Q; i++) { PfMov(sc, &sc->locID[0], ®ID[i]); PfMov(sc, &sc->locID[1], ®ID[i + Q]); @@ -1475,13 +1361,13 @@ temp%s = temp;\n\ PfFMA(sc, &sc->locID[2], ®ID[i + 2 * Q], &tf[0], ®ID[i]); - PfMul(sc, ®ID[i + 3 * Q], ®ID[i + 3 * Q], &tf[1], 0); + PfMul(sc, ®ID[i + 3 * Q], ®ID[i + 3 * Q], &tf[1], ®ID[i]); - PfMul(sc, ®ID[i + 4 * Q], ®ID[i + 4 * Q], &tf[2], 0); + PfMul(sc, ®ID[i + 4 * Q], ®ID[i + 4 * Q], &tf[2], ®ID[i]); - PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], 0); + PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], ®ID[i]); - PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], 0); + PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], ®ID[i]); PfSub(sc, &sc->locID[1], &sc->locID[1], &sc->locID[3]); @@ -1497,40 +1383,40 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplexInv(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 4 * Q], &sc->locID[1], &sc->locID[4], 0); + 
PfShuffleComplexInv(sc, ®ID[i + 4 * Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[3], 0); + PfShuffleComplex(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 4 * Q], &sc->locID[1], &sc->locID[4], 0); + PfShuffleComplex(sc, ®ID[i + 4 * Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]); } } - for (uint64_t i = 0; i < P; i++) { + for (pfUINT i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } @@ -1546,56 +1432,56 @@ temp%s = temp;\n\ } - uint64_t permute2[10] = { 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }; + pfUINT permute2[10] = { 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }; PfPermute(sc, permute2, 10, 1, regID, &sc->temp); break; } case 11: { - PfContainer tf_x[20]; - PfContainer tf_y[20]; - for (int64_t i = 0; i < 20; i++){ - tf_x[i].type = 32; - tf_y[i].type = 32; - } - - tf_x[0].data.d = 8.4125353283118116886306336876800e-01; - tf_x[1].data.d = -9.5949297361449738990105129410324e-01; - tf_x[2].data.d = -1.4231483827328514046015907335008e-01; - tf_x[3].data.d = -6.5486073394528506407246543075118e-01; - tf_x[4].data.d = 4.1541501300188642567903264668505e-01; - tf_x[5].data.d = 8.4125353283118116886306336876800e-01; - tf_x[6].data.d = -9.5949297361449738990105129410324e-01; - tf_x[7].data.d = -1.4231483827328514046015907335008e-01; - tf_x[8].data.d = -6.5486073394528506407246543075118e-01; - tf_x[9].data.d = 4.1541501300188642567903264668505e-01; + PfContainer tf_x[20] = VKFFT_ZERO_INIT; + PfContainer tf_y[20] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 20; i++){ + tf_x[i].type = 22; + tf_y[i].type = 22; + } + + tf_x[0].data.d = pfFPinit("0.8412535328311811688618116489193677175132924984205378986426"); + tf_x[1].data.d = pfFPinit("-0.959492973614497389890368057066327699062454848422161955044"); + tf_x[2].data.d = pfFPinit("-0.142314838273285140443792668616369668791051361125984328418"); + tf_x[3].data.d = pfFPinit("-0.654860733945285064056925072466293553183791199336928427606"); + tf_x[4].data.d = pfFPinit("0.4154150130018864255292741492296232035240049104645368124262"); + tf_x[5].data.d = tf_x[0].data.d; + tf_x[6].data.d = tf_x[1].data.d; + tf_x[7].data.d = tf_x[2].data.d; + tf_x[8].data.d = tf_x[3].data.d; + tf_x[9].data.d = tf_x[4].data.d; if (stageAngle < 0) { - tf_y[0].data.d = -5.4064081745559758210122047739077e-01; - tf_y[1].data.d = 2.8173255684142969773359373164556e-01; - tf_y[2].data.d = -9.8982144188093273235937163967435e-01; - tf_y[3].data.d = 7.5574957435425828375808593451168e-01; - tf_y[4].data.d = 9.0963199535451837136413102968824e-01; - tf_y[5].data.d = 
5.4064081745559758210122047739077e-01; - tf_y[6].data.d = -2.8173255684142969773359373164556e-01; - tf_y[7].data.d = 9.8982144188093273235937163967435e-01; - tf_y[8].data.d = -7.5574957435425828375808593451168e-01; - tf_y[9].data.d = -9.0963199535451837136413102968824e-01; + tf_y[0].data.d = pfFPinit("-0.5406408174555975821076359543186916954317706078981138400357"); + tf_y[1].data.d = pfFPinit("0.2817325568414296977114179153466168990357778989732668718310"); + tf_y[2].data.d = pfFPinit("-0.9898214418809327323760920377767187873765193719487166878386"); + tf_y[3].data.d = pfFPinit("0.7557495743542582837740358439723444201797174451692235695799"); + tf_y[4].data.d = pfFPinit("0.9096319953545183714117153830790284600602410511946441707561"); + tf_y[5].data.d = -tf_y[0].data.d; + tf_y[6].data.d = -tf_y[1].data.d; + tf_y[7].data.d = -tf_y[2].data.d; + tf_y[8].data.d = -tf_y[3].data.d; + tf_y[9].data.d = -tf_y[4].data.d; } else { - tf_y[0].data.d = 5.4064081745559758210122047739077e-01; - tf_y[1].data.d = -2.8173255684142969773359373164556e-01; - tf_y[2].data.d = 9.8982144188093273235937163967435e-01; - tf_y[3].data.d = -7.5574957435425828375808593451168e-01; - tf_y[4].data.d = -9.0963199535451837136413102968824e-01; - tf_y[5].data.d = -5.4064081745559758210122047739077e-01; - tf_y[6].data.d = 2.8173255684142969773359373164556e-01; - tf_y[7].data.d = -9.8982144188093273235937163967435e-01; - tf_y[8].data.d = 7.5574957435425828375808593451168e-01; - tf_y[9].data.d = 9.0963199535451837136413102968824e-01; - } - for (uint64_t i = radix - 1; i > 0; i--) { + tf_y[0].data.d = pfFPinit("0.5406408174555975821076359543186916954317706078981138400357"); + tf_y[1].data.d = pfFPinit("-0.2817325568414296977114179153466168990357778989732668718310"); + tf_y[2].data.d = pfFPinit("0.9898214418809327323760920377767187873765193719487166878386"); + tf_y[3].data.d = pfFPinit("-0.7557495743542582837740358439723444201797174451692235695799"); + tf_y[4].data.d = pfFPinit("-0.9096319953545183714117153830790284600602410511946441707561"); + tf_y[5].data.d = -tf_y[0].data.d; + tf_y[6].data.d = -tf_y[1].data.d; + tf_y[7].data.d = -tf_y[2].data.d; + tf_y[8].data.d = -tf_y[3].data.d; + tf_y[9].data.d = -tf_y[4].data.d; + } + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -1616,7 +1502,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1640,7 +1526,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1651,70 +1537,70 @@ temp%s = temp;\n\ } PfMov(sc, &sc->locID[0], ®ID[0]); - uint64_t permute[11] = { 0,1,2,4,8,5,10,9,7,3,6 }; + pfUINT permute[11] = { 0,1,2,4,8,5,10,9,7,3,6 }; PfPermute(sc, permute, 11, 0, 0, &sc->w); - for (uint64_t i = 0; i < 5; i++) { - PfSub_x(sc, ®ID[i + 6], &sc->locID[i + 1], &sc->locID[i + 6]); + for (pfUINT i = 0; i < 5; i++) { + PfSub(sc, ®ID[i + 6].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 6].data.c[0]); - PfAdd_x(sc, ®ID[i + 1], &sc->locID[i + 1], &sc->locID[i + 6]); + PfAdd(sc, ®ID[i + 1].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 
6].data.c[0]);
- PfAdd_y(sc, &regID[i + 6], &sc->locID[i + 1], &sc->locID[i + 6]);
+ PfAdd(sc, &regID[i + 6].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 6].data.c[1]);
- PfSub_y(sc, &regID[i + 1], &sc->locID[i + 1], &sc->locID[i + 6]);
+ PfSub(sc, &regID[i + 1].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 6].data.c[1]);
}
- for (uint64_t i = 0; i < 5; i++) {
- PfAdd_x(sc, &regID[0], &regID[0], &regID[i + 1]);
+ for (pfUINT i = 0; i < 5; i++) {
+ PfAdd(sc, &regID[0].data.c[0], &regID[0].data.c[0], &regID[i + 1].data.c[0]);
- PfAdd_y(sc, &regID[0], &regID[0], &regID[i + 6]);
+ PfAdd(sc, &regID[0].data.c[1], &regID[0].data.c[1], &regID[i + 6].data.c[1]);
}
- for (uint64_t i = 1; i < 6; i++) {
+ for (pfUINT i = 1; i < 6; i++) {
PfMov(sc, &sc->locID[i], &sc->locID[0]);
}
- for (uint64_t i = 6; i < 11; i++) {
+ for (pfUINT i = 6; i < 11; i++) {
PfSetToZero(sc, &sc->locID[i]);
}
- for (uint64_t i = 0; i < 5; i++) {
- for (uint64_t j = 0; j < 5; j++) {
- uint64_t id = ((10 - i) + j) % 10;
- PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 6], &regID[i + 1], &tf_x[id], &tf_y[id], &regID[i + 6], &sc->w);
+ for (pfUINT i = 0; i < 5; i++) {
+ for (pfUINT j = 0; j < 5; j++) {
+ pfUINT id = ((10 - i) + j) % 10;
+ PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 6], &regID[i + 1], &tf_x[id], &tf_y[id], &regID[i + 6], &sc->w, &sc->locID[0]);
}
}
- for (uint64_t i = 1; i < 6; i++) {
- PfSub_x(sc, &regID[i], &sc->locID[i], &sc->locID[i + 5]);
+ for (pfUINT i = 1; i < 6; i++) {
+ PfSub(sc, &regID[i].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i + 5].data.c[0]);
- PfAdd_y(sc, &regID[i], &sc->locID[i], &sc->locID[i + 5]);
+ PfAdd(sc, &regID[i].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 5].data.c[1]);
}
- for (uint64_t i = 1; i < 6; i++) {
- PfAdd_x(sc, &regID[i + 5], &sc->locID[i], &sc->locID[i + 5]);
+ for (pfUINT i = 1; i < 6; i++) {
+ PfAdd(sc, &regID[i + 5].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i + 5].data.c[0]);
- PfSub_y(sc, &regID[i + 5], &sc->locID[i], &sc->locID[i + 5]);
+ PfSub(sc, &regID[i + 5].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 5].data.c[1]);
}
- uint64_t permute2[11] = { 0,1,10,3,9,7,2,4,8,5,6 };
+ pfUINT permute2[11] = { 0,1,10,3,9,7,2,4,8,5,6 };
PfPermute(sc, permute2, 11, 1, regID, &sc->w);
break;
}
case 12: {
- PfContainer tf[2];
- for (int64_t i = 0; i < 2; i++){
- tf[i].type = 32;
+ PfContainer tf[2] = VKFFT_ZERO_INIT;
+ for (pfINT i = 0; i < 2; i++){
+ tf[i].type = 22;
}
//PfAppendLine(sc, " {\n");
- tf[0].data.d = -0.5;
- tf[1].data.d = -0.8660254037844386467637231707529;
- for (uint64_t i = radix - 1; i > 0; i--) {
+ tf[0].data.d = pfFPinit("-0.5");
+ tf[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714");
+ for (pfUINT i = radix - 1; i > 0; i--) {
if (stageSize == 1) {
- temp_complex.data.c[0] = 1;
- temp_complex.data.c[1] = 0;
+ temp_complex.data.c[0].data.d = pfFPinit("1.0");
+ temp_complex.data.c[1].data.d = pfFPinit("0.0");
PfMov(sc, &sc->w, &temp_complex);
}
@@ -1735,7 +1621,7 @@ temp%s = temp;\n\
}
}
else {
- temp_double.data.d = 2.0 * i / radix;
+ temp_double.data.d = pfFPinit("2.0") * i / radix;
PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0);
PfSinCos(sc, &sc->w, &sc->tempFloat);
}
@@ -1759,7 +1645,7 @@ temp%s = temp;\n\
}
}
else {
- temp_double.data.d = 2.0 * i / radix;
+ temp_double.data.d = pfFPinit("2.0") * i / radix;
PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0);
PfSinCos(sc, &sc->w, &sc->tempFloat);
}
@@ -1773,9 +1659,9 @@ temp%s = temp;\n\
//
//PfMov(sc, &regID[2], &sc->locID[2]);
//
- uint64_t P = 3;
- uint64_t Q = 4;
- for (uint64_t i = 0; i < Q; i++) {
+
pfUINT P = 3; + pfUINT Q = 4; + for (pfUINT i = 0; i < Q; i++) { PfMov(sc, &sc->locID[0], ®ID[i]); PfMov(sc, &sc->locID[1], ®ID[i + Q]); @@ -1797,32 +1683,32 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[i + 2 * Q], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } } - for (uint64_t i = 0; i < P; i++) { - for (uint64_t j = 0; j < Q; j++) { + for (pfUINT i = 0; i < P; i++) { + for (pfUINT j = 0; j < Q; j++) { if (i > 0) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos(2 * i * j * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(2 * i * j * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * j * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(2 * i * j * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(2 * i * j * sc->double_PI / radix); - temp_complex.data.c[1] = sin(2 * i * j * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * j * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(2 * i * j * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } @@ -1852,13 +1738,13 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, ®ID[Q * i + 3]); - PfMov_y_Neg_x(sc, &sc->temp, ®ID[Q * i + 3]); + PfMov(sc, &sc->temp.data.c[0], ®ID[Q * i + 3].data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], ®ID[Q * i + 3].data.c[0]); } else { - PfMov_x_Neg_y(sc, &sc->temp, ®ID[Q * i + 3]); - PfMov_y_x(sc, &sc->temp, ®ID[Q * i + 3]); + PfMovNeg(sc, &sc->temp.data.c[0], ®ID[Q * i + 3].data.c[1]); + PfMov(sc, &sc->temp.data.c[1], ®ID[Q * i + 3].data.c[0]); } PfSub(sc, ®ID[Q * i + 3], ®ID[Q * i + 2], &sc->temp); @@ -1867,65 +1753,65 @@ temp%s = temp;\n\ } - uint64_t permute2[12] = { 0,4,8,2,6,10,1,5,9,3,7,11 }; + pfUINT permute2[12] = { 0,4,8,2,6,10,1,5,9,3,7,11 }; PfPermute(sc, permute2, 12, 1, regID, &sc->temp); break; } case 13: { - PfContainer tf_x[20]; - for (int64_t i = 0; i < 20; i++){ - tf_x[i].type = 32; - } - PfContainer tf_y[20]; - for (int64_t i = 0; i < 20; i++){ - tf_y[i].type = 32; - } - - tf_x[0].data.d = 8.8545602565320989587194927539215e-01; - tf_x[1].data.d = -9.7094181742605202719252621701429e-01; - tf_x[2].data.d = 1.2053668025532305345994812592614e-01; - tf_x[3].data.d = -7.4851074817110109868448578063216e-01; - tf_x[4].data.d = -3.5460488704253562600274447824678e-01; - tf_x[5].data.d = 5.6806474673115580237845248512407e-01; - tf_x[6].data.d = 8.8545602565320989608878970988926e-01; - tf_x[7].data.d = -9.7094181742605202719252621701429e-01; - tf_x[8].data.d = 1.2053668025532305324988395500707e-01; - tf_x[9].data.d = -7.4851074817110109863027567200788e-01; - tf_x[10].data.d = -3.5460488704253562600274447824678e-01; - tf_x[11].data.d = 5.6806474673115580248687270237262e-01; + PfContainer tf_x[20] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 20; i++){ + tf_x[i].type = 22; + } + PfContainer tf_y[20] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 20; i++){ + tf_y[i].type = 22; + } + + 
tf_x[0].data.d = pfFPinit("0.8854560256532098959003755220150988786054984163475349018024"); + tf_x[1].data.d = pfFPinit("-0.970941817426052027156982276293789227249865105739003588587"); + tf_x[2].data.d = pfFPinit("0.1205366802553230533490676874525435822736811592275714047969"); + tf_x[3].data.d = pfFPinit("-0.748510748171101098634630599701351383846451590175826134069"); + tf_x[4].data.d = pfFPinit("-0.354604887042535625969637892600018474316355432113794753421"); + tf_x[5].data.d = pfFPinit("0.5680647467311558025118075591275166245334925524535181694796"); + tf_x[6].data.d = tf_x[0].data.d; + tf_x[7].data.d = tf_x[1].data.d; + tf_x[8].data.d = tf_x[2].data.d; + tf_x[9].data.d = tf_x[3].data.d; + tf_x[10].data.d = tf_x[4].data.d; + tf_x[11].data.d = tf_x[5].data.d; if (stageAngle < 0) { - tf_y[0].data.d = -4.6472317204376854566250792943904e-01; - tf_y[1].data.d = 2.3931566428755776706062234626682e-01; - tf_y[2].data.d = 9.9270887409805399278096144088934e-01; - tf_y[3].data.d = -6.6312265824079520232193704631918e-01; - tf_y[4].data.d = 9.3501624268541482344965776185575e-01; - tf_y[5].data.d = 8.2298386589365639468820687318917e-01; - tf_y[6].data.d = 4.6472317204376854531014222338126e-01; - tf_y[7].data.d = -2.3931566428755776695220212901827e-01; - tf_y[8].data.d = -9.9270887409805399283517154951362e-01; - tf_y[9].data.d = 6.6312265824079520243035726356773e-01; - tf_y[10].data.d = -9.3501624268541482344965776185575e-01; - tf_y[11].data.d = -8.2298386589365639457978665594062e-01; + tf_y[0].data.d = pfFPinit("-0.4647231720437685456560153351331047775577358653324689769540"); + tf_y[1].data.d = pfFPinit("0.2393156642875577671487537262602118952031730227383060133551"); + tf_y[2].data.d = pfFPinit("0.9927088740980539928007516494925201793436756329701668557709"); + tf_y[3].data.d = pfFPinit("-0.6631226582407952023767854926667662795247641070441061881807"); + tf_y[4].data.d = pfFPinit("0.9350162426854148234397845998378307290505174695784318706963"); + tf_y[5].data.d = pfFPinit("0.8229838658936563945796174234393819906550676930875738058270"); + tf_y[6].data.d = -tf_y[0].data.d; + tf_y[7].data.d = -tf_y[1].data.d; + tf_y[8].data.d = -tf_y[2].data.d; + tf_y[9].data.d = -tf_y[3].data.d; + tf_y[10].data.d = -tf_y[4].data.d; + tf_y[11].data.d = -tf_y[5].data.d; } else { - tf_y[0].data.d = 4.6472317204376854566250792943904e-01; - tf_y[1].data.d = -2.3931566428755776706062234626682e-01; - tf_y[2].data.d = -9.9270887409805399278096144088934e-01; - tf_y[3].data.d = 6.6312265824079520232193704631918e-01; - tf_y[4].data.d = -9.3501624268541482344965776185575e-01; - tf_y[5].data.d = -8.2298386589365639468820687318917e-01; - tf_y[6].data.d = -4.6472317204376854531014222338126e-01; - tf_y[7].data.d = 2.3931566428755776695220212901827e-01; - tf_y[8].data.d = 9.9270887409805399283517154951362e-01; - tf_y[9].data.d = -6.6312265824079520243035726356773e-01; - tf_y[10].data.d = 9.3501624268541482344965776185575e-01; - tf_y[11].data.d = 8.2298386589365639457978665594062e-01; - } - for (uint64_t i = radix - 1; i > 0; i--) { + tf_y[0].data.d = pfFPinit("0.4647231720437685456560153351331047775577358653324689769540"); + tf_y[1].data.d = pfFPinit("-0.2393156642875577671487537262602118952031730227383060133551"); + tf_y[2].data.d = pfFPinit("-0.9927088740980539928007516494925201793436756329701668557709"); + tf_y[3].data.d = pfFPinit("0.6631226582407952023767854926667662795247641070441061881807"); + tf_y[4].data.d = pfFPinit("-0.9350162426854148234397845998378307290505174695784318706963"); + tf_y[5].data.d = 
pfFPinit("-0.8229838658936563945796174234393819906550676930875738058270"); + tf_y[6].data.d = -tf_y[0].data.d; + tf_y[7].data.d = -tf_y[1].data.d; + tf_y[8].data.d = -tf_y[2].data.d; + tf_y[9].data.d = -tf_y[3].data.d; + tf_y[10].data.d = -tf_y[4].data.d; + tf_y[11].data.d = -tf_y[5].data.d; + } + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -1946,7 +1832,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1970,7 +1856,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -1981,84 +1867,92 @@ temp%s = temp;\n\ } PfMov(sc, &sc->locID[0], ®ID[0]); - uint64_t permute[13] = { 0, 1, 2, 4, 8, 3, 6, 12, 11, 9, 5, 10, 7 }; + pfUINT permute[13] = { 0, 1, 2, 4, 8, 3, 6, 12, 11, 9, 5, 10, 7 }; PfPermute(sc, permute, 13, 0, 0, &sc->w); - for (uint64_t i = 0; i < 6; i++) { - PfSub_x(sc, ®ID[i + 7], &sc->locID[i + 1], &sc->locID[i + 7]); + for (pfUINT i = 0; i < 6; i++) { + PfSub(sc, ®ID[i + 7].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 7].data.c[0]); - PfAdd_x(sc, ®ID[i + 1], &sc->locID[i + 1], &sc->locID[i + 7]); + PfAdd(sc, ®ID[i + 1].data.c[0], &sc->locID[i + 1].data.c[0], &sc->locID[i + 7].data.c[0]); - PfAdd_y(sc, ®ID[i + 7], &sc->locID[i + 1], &sc->locID[i + 7]); + PfAdd(sc, ®ID[i + 7].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 7].data.c[1]); - PfSub_y(sc, ®ID[i + 1], &sc->locID[i + 1], &sc->locID[i + 7]); + PfSub(sc, ®ID[i + 1].data.c[1], &sc->locID[i + 1].data.c[1], &sc->locID[i + 7].data.c[1]); } - for (uint64_t i = 0; i < 6; i++) { - PfAdd_x(sc, ®ID[0], ®ID[0], ®ID[i + 1]); + for (pfUINT i = 0; i < 6; i++) { + PfAdd(sc, ®ID[0].data.c[0], ®ID[0].data.c[0], ®ID[i + 1].data.c[0]); - PfAdd_y(sc, ®ID[0], ®ID[0], ®ID[i + 7]); + PfAdd(sc, ®ID[0].data.c[1], ®ID[0].data.c[1], ®ID[i + 7].data.c[1]); } - for (uint64_t i = 1; i < 7; i++) { + for (pfUINT i = 1; i < 7; i++) { PfMov(sc, &sc->locID[i], &sc->locID[0]); } - for (uint64_t i = 7; i < 13; i++) { + for (pfUINT i = 7; i < 13; i++) { PfSetToZero(sc, &sc->locID[i]); } - for (uint64_t i = 0; i < 6; i++) { - for (uint64_t j = 0; j < 6; j++) { - uint64_t id = ((12 - i) + j) % 12; - PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 7], ®ID[i + 1], &tf_x[id], &tf_y[id], ®ID[i + 7], &sc->w); + for (pfUINT i = 0; i < 6; i++) { + for (pfUINT j = 0; j < 6; j++) { + pfUINT id = ((12 - i) + j) % 12; + PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 7], ®ID[i + 1], &tf_x[id], &tf_y[id], ®ID[i + 7], &sc->w, &sc->locID[0]); } } - for (uint64_t i = 1; i < 7; i++) { - PfSub_x(sc, ®ID[i], &sc->locID[i], &sc->locID[i + 6]); + for (pfUINT i = 1; i < 7; i++) { + PfSub(sc, ®ID[i].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i + 6].data.c[0]); - PfAdd_y(sc, ®ID[i], &sc->locID[i], &sc->locID[i + 6]); + PfAdd(sc, ®ID[i].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 6].data.c[1]); } - for (uint64_t i = 1; i < 7; i++) { - PfAdd_x(sc, ®ID[i + 6], &sc->locID[i], &sc->locID[i + 6]); + for (pfUINT i = 1; i < 7; i++) { + PfAdd(sc, ®ID[i + 6].data.c[0], &sc->locID[i].data.c[0], &sc->locID[i 
+ 6].data.c[0]); - PfSub_y(sc, ®ID[i + 6], &sc->locID[i], &sc->locID[i + 6]); + PfSub(sc, ®ID[i + 6].data.c[1], &sc->locID[i].data.c[1], &sc->locID[i + 6].data.c[1]); } - uint64_t permute2[13] = { 0,1,12,9,11,4,8,2,10,5,3,6,7 }; + pfUINT permute2[13] = { 0,1,12,9,11,4,8,2,10,5,3,6,7 }; PfPermute(sc, permute2, 13, 1, regID, &sc->w); // break; } case 14: { - PfContainer tf[8]; - for (int64_t i = 0; i < 8; i++){ - tf[i].type = 32; - } - //PfAppendLine(sc, " {\n"); - - tf[0].data.d = -1.16666666666666651863693004997913; - tf[1].data.d = 0.79015646852540022404554065360571; - tf[2].data.d = 0.05585426728964774240049351305970; - tf[3].data.d = 0.73430220123575240531721419756650; + PfContainer tf_x[6] = VKFFT_ZERO_INIT; + PfContainer tf_y[6] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 6; i++){ + tf_x[i].type = 22; + tf_y[i].type = 22; + } + + tf_x[0].data.d = pfFPinit("0.6234898018587335305250048840042398106322747308964021053655"); + tf_x[1].data.d = pfFPinit("-0.222520933956314404288902564496794759466355568764544955311"); + tf_x[2].data.d = pfFPinit("-0.900968867902419126236102319507445051165919162131857150053"); + tf_x[3].data.d = tf_x[0].data.d; + tf_x[4].data.d = tf_x[1].data.d; + tf_x[5].data.d = tf_x[2].data.d; if (stageAngle < 0) { - tf[4].data.d = 0.44095855184409837868031445395900; - tf[5].data.d = 0.34087293062393136944265847887436; - tf[6].data.d = -0.53396936033772524066165487965918; - tf[7].data.d = 0.87484229096165666561546458979137; + tf_y[0].data.d = pfFPinit("-0.7818314824680298087084445266740577502323345187086875289806"); + tf_y[1].data.d = pfFPinit("0.9749279121818236070181316829939312172327858006199974376480"); + tf_y[2].data.d = pfFPinit("0.4338837391175581204757683328483587546099907277874598764445"); + tf_y[3].data.d = -tf_y[0].data.d; + tf_y[4].data.d = -tf_y[1].data.d; + tf_y[5].data.d = -tf_y[2].data.d; } else { - tf[4].data.d = -0.44095855184409837868031445395900; - tf[5].data.d = -0.34087293062393136944265847887436; - tf[6].data.d = 0.53396936033772524066165487965918; - tf[7].data.d = -0.87484229096165666561546458979137; + tf_y[0].data.d = pfFPinit("0.7818314824680298087084445266740577502323345187086875289806"); + tf_y[1].data.d = pfFPinit("-0.9749279121818236070181316829939312172327858006199974376480"); + tf_y[2].data.d = pfFPinit("-0.4338837391175581204757683328483587546099907277874598764445"); + tf_y[3].data.d = -tf_y[0].data.d; + tf_y[4].data.d = -tf_y[1].data.d; + tf_y[5].data.d = -tf_y[2].data.d; } - for (uint64_t i = radix - 1; i > 0; i--) { + + for (pfUINT i = radix - 1; i > 0; i--) { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2079,7 +1973,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -2103,7 +1997,7 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 2.0 * i / radix; + temp_double.data.d = pfFPinit("2.0") * i / radix; PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } @@ -2116,134 +2010,98 @@ temp%s = temp;\n\ //PfMov(sc, ®ID[1], &sc->locID[1]); // - uint64_t P = 7; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { - PfMov(sc, &sc->locID[0], ®ID[i]); - - PfMov(sc, &sc->locID[1], ®ID[i + Q]); - - PfMov(sc, &sc->locID[2], ®ID[i + 2 * Q]); - - PfMov(sc, 
&sc->locID[3], ®ID[i + 3 * Q]); - - PfMov(sc, &sc->locID[4], ®ID[i + 4 * Q]); - - PfMov(sc, &sc->locID[5], ®ID[i + 5 * Q]); - - PfMov(sc, &sc->locID[6], ®ID[i + 6 * Q]); - - - PfAdd(sc, ®ID[i], &sc->locID[1], &sc->locID[6]); - - PfSub(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[6]); - - PfAdd(sc, ®ID[i + 2 * Q], &sc->locID[2], &sc->locID[5]); - - PfSub(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[5]); - - PfAdd(sc, ®ID[i + 4 * Q], &sc->locID[4], &sc->locID[3]); - - PfSub(sc, ®ID[i + 5 * Q], &sc->locID[4], &sc->locID[3]); - - - PfAdd(sc, &sc->locID[5], ®ID[i + Q], ®ID[i + 3 * Q]); - - PfAdd(sc, &sc->locID[5], &sc->locID[5], ®ID[i + 5 * Q]); - - PfAdd(sc, &sc->locID[1], ®ID[i], ®ID[i + 2 * Q]); - - PfAdd(sc, &sc->locID[1], &sc->locID[1], ®ID[i + 4 * Q]); - - PfAdd(sc, &sc->locID[0], &sc->locID[0], &sc->locID[1]); - - - PfSub(sc, &sc->locID[2], ®ID[i], ®ID[i + 4 * Q]); - - PfSub(sc, &sc->locID[3], ®ID[i + 4 * Q], ®ID[i + 2 * Q]); - - PfSub(sc, &sc->locID[4], ®ID[i + 2 * Q], ®ID[i]); - + pfUINT P = 7; + pfUINT Q = 2; + PfContainer tempID[7] = VKFFT_ZERO_INIT; - PfSub(sc, ®ID[i], ®ID[i + Q], ®ID[i + 5 * Q]); - - PfSub(sc, ®ID[i + 2 * Q], ®ID[i + 5 * Q], ®ID[i + 3 * Q]); - - PfSub(sc, ®ID[i + 4 * Q], ®ID[i + 3 * Q], ®ID[i + Q]); - + for (int t = 0; t < 7; t++) { + tempID[t].type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &tempID[t], 50); + } - PfMul(sc, &sc->locID[1], &sc->locID[1], &tf[0], 0); - - PfMul(sc, &sc->locID[2], &sc->locID[2], &tf[1], 0); - - PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[2], 0); - - PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[3], 0); - - PfMul(sc, &sc->locID[5], &sc->locID[5], &tf[4], 0); - - PfMul(sc, ®ID[i], ®ID[i], &tf[5], 0); - - PfMul(sc, ®ID[i + 2 * Q], ®ID[i + 2 * Q], &tf[6], 0); - - PfMul(sc, ®ID[i + 4 * Q], ®ID[i + 4 * Q], &tf[7], 0); + for (pfUINT i = 0; i < Q; i++) { + pfUINT permute[7] = { 0, 1, 3, 2, 6, 4, 5 }; + for (pfUINT t = 0; t < 7; t++) + PfCopyContainer(sc, &tempID[t], ®ID[i + Q * t]); + for (pfUINT t = 0; t < 7; t++) + PfCopyContainer(sc, ®ID[i + Q * t], &tempID[permute[t]]); - PfSub(sc, ®ID[i + 5 * Q], ®ID[i + 4 * Q], ®ID[i + 2 * Q]); - - PfAddInv(sc, ®ID[i + 6 * Q], ®ID[i + 4 * Q], ®ID[i]); - - PfAdd(sc, ®ID[i + 4 * Q], ®ID[i], ®ID[i + 2 * Q]); - - PfAdd(sc, ®ID[i], &sc->locID[0], &sc->locID[1]); - - PfAdd(sc, ®ID[i + Q], &sc->locID[2], &sc->locID[3]); - - PfSub(sc, ®ID[i + 2 * Q], &sc->locID[4], &sc->locID[3]); - - PfAddInv(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[4]); - - PfAdd(sc, &sc->locID[1], ®ID[i], ®ID[i + Q]); - - PfAdd(sc, &sc->locID[2], ®ID[i], ®ID[i + 2 * Q]); + PfMov(sc, &sc->locID[0], ®ID[i]); + PfMov(sc, &sc->locID[1], ®ID[i + Q]); + PfMov(sc, &sc->locID[2], ®ID[i + 2 * Q]); + PfMov(sc, &sc->locID[3], ®ID[i + 3 * Q]); + PfMov(sc, &sc->locID[4], ®ID[i + 4 * Q]); + PfMov(sc, &sc->locID[5], ®ID[i + 5 * Q]); + PfMov(sc, &sc->locID[6], ®ID[i + 6 * Q]); + + for (pfUINT t = 0; t < 3; t++) { + PfSub(sc, ®ID[i + Q * (t + 4)].data.c[0], &sc->locID[t + 1].data.c[0], &sc->locID[t + 4].data.c[0]); - PfAdd(sc, &sc->locID[3], ®ID[i], ®ID[i + 3 * Q]); + PfAdd(sc, ®ID[i + Q * (t + 1)].data.c[0], &sc->locID[t + 1].data.c[0], &sc->locID[t + 4].data.c[0]); - PfAdd(sc, &sc->locID[4], ®ID[i + 4 * Q], &sc->locID[5]); + PfAdd(sc, ®ID[i + Q * (t + 4)].data.c[1], &sc->locID[t + 1].data.c[1], &sc->locID[t + 4].data.c[1]); - PfAdd(sc, &sc->locID[6], ®ID[i + 6 * Q], &sc->locID[5]); + PfSub(sc, ®ID[i + Q * (t + 1)].data.c[1], &sc->locID[t + 1].data.c[1], &sc->locID[t + 4].data.c[1]); - PfAdd(sc, &sc->locID[5], &sc->locID[5], ®ID[i + 5 * Q]); 
+ } + for (pfUINT t = 0; t < 3; t++) { + PfAdd(sc, ®ID[i].data.c[0], ®ID[i].data.c[0], ®ID[i + Q * (t + 1)].data.c[0]); - PfMov(sc, ®ID[i], &sc->locID[0]); + PfAdd(sc, ®ID[i].data.c[1], ®ID[i].data.c[1], ®ID[i + Q * (t + 4)].data.c[1]); - PfShuffleComplexInv(sc, ®ID[i + Q], &sc->locID[1], &sc->locID[4], 0); + } + for (pfUINT t = 1; t < 4; t++) { + PfMov(sc, &sc->locID[t], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[i + 2 * Q], &sc->locID[3], &sc->locID[6], 0); - PfShuffleComplex(sc, ®ID[i + 3 * Q], &sc->locID[2], &sc->locID[5], 0); + } + for (pfUINT t = 4; t < 7; t++) { + PfSetToZero(sc, &sc->locID[t]); + } + for (pfUINT t = 0; t < 3; t++) { + for (pfUINT j = 0; j < 3; j++) { + pfUINT id = ((6 - t) + j) % 6; + PfFMA3_const_w(sc, &sc->locID[j + 1], &sc->locID[j + 4], ®ID[i + Q * (t + 1)], &tf_x[id], &tf_y[id], ®ID[i + Q * (t + 4)], &sc->w, &sc->locID[0]); + + } + } + for (pfUINT t = 1; t < 4; t++) { + PfSub(sc, ®ID[i + Q * t].data.c[0], &sc->locID[t].data.c[0], &sc->locID[t + 3].data.c[0]); - PfShuffleComplexInv(sc, ®ID[i + 4 * Q], &sc->locID[2], &sc->locID[5], 0); + PfAdd(sc, ®ID[i + Q * t].data.c[1], &sc->locID[t].data.c[1], &sc->locID[t + 3].data.c[1]); - PfShuffleComplex(sc, ®ID[i + 5 * Q], &sc->locID[3], &sc->locID[6], 0); + } + for (pfUINT t = 1; t < 4; t++) { + PfAdd(sc, ®ID[i + Q * (t + 3)].data.c[0], &sc->locID[t].data.c[0], &sc->locID[t + 3].data.c[0]); - PfShuffleComplex(sc, ®ID[i + 6 * Q], &sc->locID[1], &sc->locID[4], 0); + PfSub(sc, ®ID[i + Q * (t + 3)].data.c[1], &sc->locID[t].data.c[1], &sc->locID[t + 3].data.c[1]); + } + pfUINT permute2[7] = { 0, 1, 5, 6, 3, 2, 4 }; + + for (pfUINT t = 0; t < 7; t++) + PfCopyContainer(sc, &tempID[t], ®ID[i + Q * t]); + for (pfUINT t = 0; t < 7; t++) + PfCopyContainer(sc, ®ID[i + Q * t], &tempID[permute2[t]]); } + for (int t = 0; t < 7; t++) { + PfDeallocateContainer(sc, &tempID[t]); + } - for (uint64_t i = 0; i < P; i++) { + for (pfUINT i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } @@ -2260,38 +2118,38 @@ temp%s = temp;\n\ } - uint64_t permute2[14] = { 0,2,4,6,8,10,12,1,3,5,7,9,11,13 }; + pfUINT permute2[14] = { 0,2,4,6,8,10,12,1,3,5,7,9,11,13 }; PfPermute(sc, permute2, 14, 1, regID, &sc->temp); break; } case 15: { - PfContainer tf[5]; - for (int64_t i = 0; i < 5; i++){ - tf[i].type = 32; + PfContainer tf[5] = VKFFT_ZERO_INIT; + for (pfINT i = 0; i < 5; i++){ + tf[i].type = 22; } //PfAppendLine(sc, " {\n"); - tf[0].data.d = -0.5; - tf[1].data.d = 1.538841768587626701285145288018455; - tf[2].data.d = -0.363271264002680442947733378740309; - tf[3].data.d = -0.809016994374947424102293417182819; - tf[4].data.d = -0.587785252292473129168705954639073; + tf[0].data.d = pfFPinit("-0.5"); + tf[1].data.d = pfFPinit("1.538841768587626701285145288018455"); + tf[2].data.d = pfFPinit("-0.363271264002680442947733378740309"); + tf[3].data.d = pfFPinit("-0.809016994374947424102293417182819"); + tf[4].data.d = 
pfFPinit("-0.587785252292473129168705954639073");
- PfContainer tf2[2];
- for (int64_t i = 0; i < 2; i++){
- tf2[i].type = 32;
+ PfContainer tf2[2] = VKFFT_ZERO_INIT;
+ for (pfINT i = 0; i < 2; i++){
+ tf2[i].type = 22;
}
//PfAppendLine(sc, " {\n");
- tf2[0].data.d = -0.5;
- tf2[1].data.d = -0.8660254037844386467637231707529;
+ tf2[0].data.d = pfFPinit("-0.5");
+ tf2[1].data.d = pfFPinit("-0.8660254037844386467637231707529361834714");
- for (uint64_t i = radix - 1; i > 0; i--) {
+ for (pfUINT i = radix - 1; i > 0; i--) {
if (stageSize == 1) {
- temp_complex.data.c[0] = 1;
- temp_complex.data.c[1] = 0;
+ temp_complex.data.c[0].data.d = pfFPinit("1.0");
+ temp_complex.data.c[1].data.d = pfFPinit("0.0");
PfMov(sc, &sc->w, &temp_complex);
}
@@ -2312,7 +2170,7 @@ temp%s = temp;\n\
}
}
else {
- temp_double.data.d = 2.0 * i / radix;
+ temp_double.data.d = pfFPinit("2.0") * i / radix;
PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0);
PfSinCos(sc, &sc->w, &sc->tempFloat);
}
@@ -2336,7 +2194,7 @@ temp%s = temp;\n\
}
}
else {
- temp_double.data.d = 2.0 * i / radix;
+ temp_double.data.d = pfFPinit("2.0") * i / radix;
PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0);
PfSinCos(sc, &sc->w, &sc->tempFloat);
}
@@ -2349,9 +2207,9 @@ temp%s = temp;\n\
//PfMov(sc, &regID[1], &sc->locID[1]);
//
- uint64_t P = 5;
- uint64_t Q = 3;
- for (uint64_t i = 0; i < Q; i++) {
+ pfUINT P = 5;
+ pfUINT Q = 3;
+ for (pfUINT i = 0; i < Q; i++) {
PfMov(sc, &sc->locID[0], &regID[i]);
PfMov(sc, &sc->locID[1], &regID[i + Q]);
@@ -2384,13 +2242,13 @@ temp%s = temp;\n\
PfFMA(sc, &sc->locID[2], &regID[i + 2 * Q], &tf[0], &regID[i]);
- PfMul(sc, &regID[i + 3 * Q], &regID[i + 3 * Q], &tf[1], 0);
+ PfMul(sc, &regID[i + 3 * Q], &regID[i + 3 * Q], &tf[1], &regID[i]);
- PfMul(sc, &regID[i + 4 * Q], &regID[i + 4 * Q], &tf[2], 0);
+ PfMul(sc, &regID[i + 4 * Q], &regID[i + 4 * Q], &tf[2], &regID[i]);
- PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], 0);
+ PfMul(sc, &sc->locID[3], &sc->locID[3], &tf[3], &regID[i]);
- PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], 0);
+ PfMul(sc, &sc->locID[4], &sc->locID[4], &tf[4], &regID[i]);
PfSub(sc, &sc->locID[1], &sc->locID[1], &sc->locID[3]);
@@ -2406,54 +2264,54 @@ temp%s = temp;\n\
if (stageAngle < 0) {
- PfShuffleComplex(sc, &regID[i + Q], &sc->locID[1], &sc->locID[4], 0);
+ PfShuffleComplex(sc, &regID[i + Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]);
- PfShuffleComplex(sc, &regID[i + 2 * Q], &sc->locID[2], &sc->locID[3], 0);
+ PfShuffleComplex(sc, &regID[i + 2 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]);
- PfShuffleComplexInv(sc, &regID[i + 3 * Q], &sc->locID[2], &sc->locID[3], 0);
+ PfShuffleComplexInv(sc, &regID[i + 3 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]);
- PfShuffleComplexInv(sc, &regID[i + 4 * Q], &sc->locID[1], &sc->locID[4], 0);
+ PfShuffleComplexInv(sc, &regID[i + 4 * Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]);
}
else {
- PfShuffleComplexInv(sc, &regID[i + Q], &sc->locID[1], &sc->locID[4], 0);
+ PfShuffleComplexInv(sc, &regID[i + Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]);
- PfShuffleComplexInv(sc, &regID[i + 2 * Q], &sc->locID[2], &sc->locID[3], 0);
+ PfShuffleComplexInv(sc, &regID[i + 2 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]);
- PfShuffleComplex(sc, &regID[i + 3 * Q], &sc->locID[2], &sc->locID[3], 0);
+ PfShuffleComplex(sc, &regID[i + 3 * Q], &sc->locID[2], &sc->locID[3], &sc->locID[0]);
- PfShuffleComplex(sc, &regID[i + 4 * Q], &sc->locID[1], &sc->locID[4], 0);
+ PfShuffleComplex(sc, &regID[i + 4 * Q], &sc->locID[1], &sc->locID[4], &sc->locID[0]);
}
}
- for (uint64_t i = 0; i < P; i++) {
+ for (pfUINT i = 0; i < P; i++) {
if (i > 0) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(2 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(2 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(2 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(2 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } PfMul(sc, &sc->locID[1], ®ID[Q * i + 1], &sc->w, &sc->temp); if (stageAngle < 0) { - temp_complex.data.c[0] = cos(4 * i * sc->double_PI / radix); - temp_complex.data.c[1] = -sin(4 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(4 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = -pfsin(4 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } else { - temp_complex.data.c[0] = cos(4 * i * sc->double_PI / radix); - temp_complex.data.c[1] = sin(4 * i * sc->double_PI / radix); + temp_complex.data.c[0].data.d = pfcos(4 * i * sc->double_PI / radix); + temp_complex.data.c[1].data.d = pfsin(4 * i * sc->double_PI / radix); PfMov(sc, &sc->w, &temp_complex); } @@ -2482,20 +2340,20 @@ temp%s = temp;\n\ if (stageAngle < 0) { - PfShuffleComplex(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplexInv(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } else { - PfShuffleComplexInv(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplexInv(sc, ®ID[Q * i + 1], &sc->locID[1], &sc->locID[2], &sc->locID[0]); - PfShuffleComplex(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], 0); + PfShuffleComplex(sc, ®ID[Q * i + 2], &sc->locID[1], &sc->locID[2], &sc->locID[0]); } } - uint64_t permute2[15] = { 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14 }; + pfUINT permute2[15] = { 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14 }; PfPermute(sc, permute2, 15, 1, regID, &sc->temp); break; @@ -2503,8 +2361,8 @@ temp%s = temp;\n\ case 16: { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2527,7 +2385,7 @@ temp%s = temp;\n\ PfSinCos(sc, &sc->w, &sc->angle); } } - for (uint64_t i = 0; i < 8; i++) { + for (pfUINT i = 0; i < 8; i++) { PfMul(sc, &sc->temp, ®ID[i + 8], &sc->w, 0); PfSub(sc, ®ID[i + 8], ®ID[i], &sc->temp); @@ -2536,8 +2394,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2560,12 +2418,12 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.5; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 4; i++) { + for (pfUINT i = 0; i < 4; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->w, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -2574,19 +2432,19 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - 
PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 8; i < 12; i++) { + for (pfUINT i = 8; i < 12; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->iw, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -2595,8 +2453,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2619,12 +2477,12 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.25; + temp_double.data.d = pfFPinit("0.25"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 2; i++) { + for (pfUINT i = 0; i < 2; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->w, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -2633,18 +2491,18 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 4; i < 6; i++) { + for (pfUINT i = 4; i < 6; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -2653,17 +2511,17 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 8; i < 10; i++) { + for (pfUINT i = 8; i < 10; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -2672,18 +2530,18 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->w, &sc->iw); - PfMov_y_Neg_x(sc, &sc->w, &sc->iw); + PfMov(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->w, &sc->iw); - PfMov_y_x(sc, &sc->w, &sc->iw); + PfMovNeg(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } - for (uint64_t i = 12; i < 14; i++) 
{ + for (pfUINT i = 12; i < 14; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->w, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -2693,8 +2551,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2717,13 +2575,13 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.125; + temp_double.data.d = pfFPinit("0.125"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 1; i++) { + for (pfUINT i = 0; i < 1; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->w, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2732,18 +2590,18 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 2; i < 3; i++) { + for (pfUINT i = 2; i < 3; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2754,17 +2612,17 @@ temp%s = temp;\n\ if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 4; i < 5; i++) { + for (pfUINT i = 4; i < 5; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2773,20 +2631,20 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 6; i < 7; i++) { + for (pfUINT i = 6; i < 7; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2796,18 +2654,18 @@ temp%s = temp;\n\ } - for (uint64_t j = 0; j < 2; j++) { + for (pfUINT j = 0; j < 2; j++) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = -sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = -pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 
0); } else { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { + for (pfUINT i = 8 + 4 * j; i < 9 + 4 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2816,20 +2674,20 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { + for (pfUINT i = 10 + 4 * j; i < 11 + 4 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -2839,7 +2697,7 @@ temp%s = temp;\n\ } } - uint64_t permute2[16] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; + pfUINT permute2[16] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; PfPermute(sc, permute2, 16, 1, regID, &sc->temp); @@ -2889,8 +2747,8 @@ temp%s = temp;\n\ case 32: { if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2913,7 +2771,7 @@ temp%s = temp;\n\ PfSinCos(sc, &sc->w, &sc->angle); } } - for (uint64_t i = 0; i < 16; i++) { + for (pfUINT i = 0; i < 16; i++) { PfMul(sc, &sc->temp, ®ID[i + 16], &sc->w, 0); PfSub(sc, ®ID[i + 16], ®ID[i], &sc->temp); @@ -2922,8 +2780,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -2945,12 +2803,12 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.5; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 8; i++) { + for (pfUINT i = 0; i < 8; i++) { PfMul(sc, &sc->temp, ®ID[i + 8], &sc->w, 0); PfSub(sc, ®ID[i + 8], ®ID[i], &sc->temp); @@ -2959,19 +2817,19 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 16; i < 24; i++) { + for (pfUINT i = 16; i < 24; i++) { PfMul(sc, &sc->temp, ®ID[i + 8], &sc->iw, 0); PfSub(sc, ®ID[i + 8], ®ID[i], &sc->temp); @@ -2980,8 +2838,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + 
temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -3003,12 +2861,12 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.25; + temp_double.data.d = pfFPinit("0.25"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 4; i++) { + for (pfUINT i = 0; i < 4; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->w, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -3017,18 +2875,18 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 8; i < 12; i++) { + for (pfUINT i = 8; i < 12; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->iw, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -3037,17 +2895,17 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 16; i < 20; i++) { + for (pfUINT i = 16; i < 20; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->iw, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -3056,18 +2914,18 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->w, &sc->iw); - PfMov_y_Neg_x(sc, &sc->w, &sc->iw); + PfMov(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->w, &sc->iw); - PfMov_y_x(sc, &sc->w, &sc->iw); + PfMovNeg(sc, &sc->w.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->w.data.c[1], &sc->iw.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } - for (uint64_t i = 24; i < 28; i++) { + for (pfUINT i = 24; i < 28; i++) { PfMul(sc, &sc->temp, ®ID[i + 4], &sc->w, 0); PfSub(sc, ®ID[i + 4], ®ID[i], &sc->temp); @@ -3077,8 +2935,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -3100,13 +2958,13 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.125; + temp_double.data.d = pfFPinit("0.125"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 2; i++) { + for (pfUINT i = 0; i < 2; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->w, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3115,18 +2973,18 @@ temp%s = temp;\n\ 
} if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 4; i < 6; i++) { + for (pfUINT i = 4; i < 6; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3137,17 +2995,17 @@ temp%s = temp;\n\ if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 8; i < 10; i++) { + for (pfUINT i = 8; i < 10; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3156,20 +3014,20 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 12; i < 14; i++) { + for (pfUINT i = 12; i < 14; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3179,18 +3037,18 @@ temp%s = temp;\n\ } - for (uint64_t j = 0; j < 2; j++) { + for (pfUINT j = 0; j < 2; j++) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = -sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = -pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 16 + 8 * j; i < 18 + 8 * j; i++) { + for (pfUINT i = 16 + 8 * j; i < 18 + 8 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3199,19 +3057,19 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, 
&sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 20 + 8 * j; i < 22 + 8 * j; i++) { + for (pfUINT i = 20 + 8 * j; i < 22 + 8 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 2], &sc->iw, 0); PfSub(sc, ®ID[i + 2], ®ID[i], &sc->temp); @@ -3222,8 +3080,8 @@ temp%s = temp;\n\ } if (stageSize == 1) { - temp_complex.data.c[0] = 1; - temp_complex.data.c[1] = 0; + temp_complex.data.c[0].data.d = pfFPinit("1.0"); + temp_complex.data.c[1].data.d = pfFPinit("0.0"); PfMov(sc, &sc->w, &temp_complex); } @@ -3244,13 +3102,13 @@ temp%s = temp;\n\ } } else { - temp_double.data.d = 0.0625; + temp_double.data.d = pfFPinit("0.0625"); PfMul(sc, &sc->tempFloat, &sc->angle, &temp_double, 0); PfSinCos(sc, &sc->w, &sc->tempFloat); } } - for (uint64_t i = 0; i < 1; i++) { + for (pfUINT i = 0; i < 1; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->w, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3259,19 +3117,19 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->iw, &sc->w); - PfMov_y_Neg_x(sc, &sc->iw, &sc->w); + PfMov(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMovNeg(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { - PfMov_x_Neg_y(sc, &sc->iw, &sc->w); - PfMov_y_x(sc, &sc->iw, &sc->w); + PfMovNeg(sc, &sc->iw.data.c[0], &sc->w.data.c[1]); + PfMov(sc, &sc->iw.data.c[1], &sc->w.data.c[0]); //&sc->tempLen = sprintf(&sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } - for (uint64_t i = 2; i < 3; i++) { + for (pfUINT i = 2; i < 3; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3282,17 +3140,17 @@ temp%s = temp;\n\ if (stageAngle < 0) { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = -0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("-0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = 0.70710678118654752440084436210485; - temp_complex.data.c[1] = 0.70710678118654752440084436210485; + temp_complex.data.c[0].data.d = pfFPinit("0.70710678118654752440084436210485"); + temp_complex.data.c[1].data.d = pfFPinit("0.70710678118654752440084436210485"); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 4; i < 5; i++) { + for (pfUINT i = 4; i < 5; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3301,21 +3159,21 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 6; i < 7; i++) { + for (pfUINT i = 6; i < 7; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3325,18 +3183,18 @@ temp%s = temp;\n\ } - for (uint64_t j = 0; j < 2; j++) { + for (pfUINT j = 0; j < 2; j++) { if (stageAngle < 0) { - 
temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = -sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = -pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 8); - temp_complex.data.c[1] = sin((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 8); + temp_complex.data.c[1].data.d = pfsin((2 * j + 1) * sc->double_PI / 8); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } - for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { + for (pfUINT i = 8 + 4 * j; i < 9 + 4 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3345,20 +3203,20 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } else { - PfMov_x_Neg_y(sc, &sc->temp, &sc->iw); - PfMov_y_x(sc, &sc->temp, &sc->iw); + PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); PfMov(sc, &sc->iw, &sc->temp); } - for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { + for (pfUINT i = 10 + 4 * j; i < 11 + 4 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3368,32 +3226,32 @@ temp%s = temp;\n\ } } - for (uint64_t j = 0; j < 4; j++) { + for (pfUINT j = 0; j < 4; j++) { if ((j == 1) || (j == 2)) { if (stageAngle < 0) { - temp_complex.data.c[0] = cos((7 - 2 * j) * sc->double_PI / 16); - temp_complex.data.c[1] = -sin((7 - 2 * j) * sc->double_PI / 16); + temp_complex.data.c[0].data.d = pfcos((7 - 2 * j) * sc->double_PI / 16); + temp_complex.data.c[1].data.d = -pfsin((7 - 2 * j) * sc->double_PI / 16); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = cos((7 - 2 * j) * sc->double_PI / 16); - temp_complex.data.c[1] = sin((7 - 2 * j) * sc->double_PI / 16); + temp_complex.data.c[0].data.d = pfcos((7 - 2 * j) * sc->double_PI / 16); + temp_complex.data.c[1].data.d = pfsin((7 - 2 * j) * sc->double_PI / 16); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } } else { if (stageAngle < 0) { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 16); - temp_complex.data.c[1] = -sin((2 * j + 1) * sc->double_PI / 16); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 16); + temp_complex.data.c[1].data.d = -pfsin((2 * j + 1) * sc->double_PI / 16); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } else { - temp_complex.data.c[0] = cos((2 * j + 1) * sc->double_PI / 16); - temp_complex.data.c[1] = sin((2 * j + 1) * sc->double_PI / 16); + temp_complex.data.c[0].data.d = pfcos((2 * j + 1) * sc->double_PI / 16); + temp_complex.data.c[1].data.d = pfsin((2 * j + 1) * sc->double_PI / 16); PfMul(sc, &sc->iw, &sc->w, &temp_complex, 0); } } - for (uint64_t i = 16 + 4 * j; i < 17 + 4 * j; i++) { + for (pfUINT i = 16 + 4 * j; i < 17 + 4 * j; i++) { PfMul(sc, &sc->temp, ®ID[i + 1], &sc->iw, 0); PfSub(sc, ®ID[i + 1], ®ID[i], &sc->temp); @@ -3402,20 +3260,20 @@ temp%s = temp;\n\ } if (stageAngle < 0) { - PfMov_x_y(sc, &sc->temp, &sc->iw); - PfMov_y_Neg_x(sc, &sc->temp, &sc->iw); + PfMov(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]); + PfMovNeg(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]); 
PfMov(sc, &sc->iw, &sc->temp);
}
else {
- PfMov_x_Neg_y(sc, &sc->temp, &sc->iw);
- PfMov_y_x(sc, &sc->temp, &sc->iw);
+ PfMovNeg(sc, &sc->temp.data.c[0], &sc->iw.data.c[1]);
+ PfMov(sc, &sc->temp.data.c[1], &sc->iw.data.c[0]);
PfMov(sc, &sc->iw, &sc->temp);
}
- for (uint64_t i = 18 + 4 * j; i < 19 + 4 * j; i++) {
+ for (pfUINT i = 18 + 4 * j; i < 19 + 4 * j; i++) {
PfMul(sc, &sc->temp, &regID[i + 1], &sc->iw, 0);
PfSub(sc, &regID[i + 1], &regID[i], &sc->temp);
@@ -3425,7 +3283,7 @@ temp%s = temp;\n\
}
}
- uint64_t permute2[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
+ pfUINT permute2[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
PfPermute(sc, permute2, 32, 1, regID, &sc->temp);
@@ -3516,6 +3374,7 @@ temp%s = temp;\n\
break;
}
}
+ PfDeallocateContainer(sc, &temp_complex);
return;
}
diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixShuffle.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixShuffle.h
index 10b1667a..2b696921 100644
--- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixShuffle.h
+++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixShuffle.h
@@ -38,17 +38,19 @@ static inline void appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayo
PfContainer temp_int1 = VKFFT_ZERO_INIT;
temp_int1.type = 31;
PfContainer temp_double = VKFFT_ZERO_INIT;
- temp_double.type = 32;
+ temp_double.type = 22;
- PfContainer stageNormalization;
- stageNormalization.type = 32;
- PfContainer normalizationValue;
+ PfContainer stageNormalization = VKFFT_ZERO_INIT;
+ stageNormalization.type = 22;
+ PfContainer normalizationValue = VKFFT_ZERO_INIT;
normalizationValue.type = 31;
normalizationValue.data.i = 1;
if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle->data.d > 0))) && (stageSize->data.i == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle->data.d < 0)))) {
- if ((sc->performDCT) && (sc->actualInverse)) {
- if (sc->performDCT == 1)
+ if (((sc->performDCT) || (sc->performDST)) && (sc->actualInverse)) {
+ if (sc->performDST == 1)
+ normalizationValue.data.i = (sc->sourceFFTSize.data.i + 1) * 2;
+ else if (sc->performDCT == 1)
normalizationValue.data.i = (sc->sourceFFTSize.data.i - 1) * 2;
else
normalizationValue.data.i = sc->sourceFFTSize.data.i * 2;
@@ -60,46 +62,46 @@ normalizationValue.data.i *= sc->fft_dim_full.data.i;
}
if (normalizationValue.data.i != 1) {
- stageNormalization.data.d = 1.0 / (long double)(normalizationValue.data.i);
+ stageNormalization.data.d = pfFPinit("1.0") / (pfLD)(normalizationValue.data.i);
}
- PfContainer logicalStoragePerThread;
+ PfContainer logicalStoragePerThread = VKFFT_ZERO_INIT;
logicalStoragePerThread.type = 31;
logicalStoragePerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
- PfContainer logicalStoragePerThreadNext;
+ PfContainer logicalStoragePerThreadNext = VKFFT_ZERO_INIT;
logicalStoragePerThreadNext.type = 31;
logicalStoragePerThreadNext.data.i = sc->registers_per_thread_per_radix[stageRadixNext->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext->data.i == 0) ?
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - PfContainer logicalRegistersPerThread; + PfContainer logicalRegistersPerThread = VKFFT_ZERO_INIT; logicalRegistersPerThread.type = 31; logicalRegistersPerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i];// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalRegistersPerThreadNext; + PfContainer logicalRegistersPerThreadNext = VKFFT_ZERO_INIT; logicalRegistersPerThreadNext.type = 31; logicalRegistersPerThreadNext.data.i = sc->registers_per_thread_per_radix[stageRadixNext->data.i];// (sc->registers_per_thread % stageRadixNext->data.i == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalGroupSize; + PfContainer logicalGroupSize = VKFFT_ZERO_INIT; logicalGroupSize.type = 31; PfDivCeil(sc, &logicalGroupSize, &sc->fftDim, &logicalStoragePerThread); - PfContainer logicalGroupSizeNext; + PfContainer logicalGroupSizeNext = VKFFT_ZERO_INIT; logicalGroupSizeNext.type = 31; PfDivCeil(sc, &logicalGroupSizeNext, &sc->fftDim, &logicalStoragePerThreadNext); - if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim.data.i < sc->fft_dim_full.data.i) && (sc->localSize[1].data.i > 1)) || (sc->localSize[1].data.i > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)))) || (sc->performDCT))) + if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim.data.i < sc->fft_dim_full.data.i) && (sc->localSize[1].data.i > 1)) || (sc->localSize[1].data.i > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)))) || ((sc->performDCT) || (sc->performDST)))) { appendBarrierVkFFT(sc); } //if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize->data.i < sc->fftDim / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle->data.d < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) { - if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d 
< 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim.data.i < sc->fft_dim_full.data.i) && (sc->localSize[1].data.i > 1)) || (sc->localSize[1].data.i > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || (sc->registerBoost > 1) || (sc->performDCT))) { + if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim.data.i < sc->fft_dim_full.data.i) && (sc->localSize[1].data.i > 1)) || (sc->localSize[1].data.i > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || (sc->registerBoost > 1) || ((sc->performDCT) || (sc->performDST)))) { if (!((sc->registerBoost > 1) && (stageSize->data.i * stageRadix->data.i == sc->fftDim.data.i / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { PfContainer* tempID; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; + for (pfUINT k = 0; k < sc->registerBoost; ++k) { + pfUINT t = 0; if (sc->registerBoost > 1) { appendBarrierVkFFT(sc); @@ -122,10 +124,10 @@ static inline void appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayo if (logicalGroupSize.data.i != sc->localSize[0].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * 
logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &current_group_cut); @@ -143,14 +145,14 @@ static inline void appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayo blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize->data.i, j * logicalGroupSize, stageRadix->data.i);*/ - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); t++; - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { temp_int.data.i = i * stageSize->data.i; PfAdd(sc, &sc->sdataID, &sc->inoutID, &temp_int); @@ -187,8 +189,8 @@ static inline void appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayo /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize->data.i, sc->regIDs[id], stageNormalization);*/ } - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -196,7 +198,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; if (logicalGroupSize.data.i != sc->localSize[0].data.i) { PfIf_end(sc); } - for (uint64_t j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[t + k * sc->registers_per_thread]); t++; } @@ -222,9 +224,9 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i; j++) { - for (uint64_t i = 0; i < (uint64_t)stageRadixNext->data.i; i++) { - PfContainer id; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i; j++) { + for (pfUINT i = 0; i < (pfUINT)stageRadixNext->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i + i * logicalStoragePerThreadNext.data.i / stageRadixNext->data.i; id.data.i = (id.data.i /
logicalRegistersPerThreadNext.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThreadNext.data.i; @@ -261,10 +263,10 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } } } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } free(tempID); @@ -281,32 +283,32 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } if (sc->useDisableThreads) { temp_int.data.i = 0; PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t k = 0; k < (uint64_t)sc->registerBoost; ++k) { - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT k = 0; k < (pfUINT)sc->registerBoost; ++k) { + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; PfCopyContainer(sc, &tempID[j + i * logicalRegistersPerThread.data.i / stageRadix->data.i + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); } } - for (uint64_t j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[j + k * sc->registers_per_thread], &sc->regIDs[j + k * sc->registers_per_thread]); } } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } free(tempID); @@ -326,8 +328,8 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) { - for (uint64_t i = 0; i < (uint64_t)logicalStoragePerThread.data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)logicalStoragePerThread.data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = (i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + i % 
logicalRegistersPerThread.data.i; @@ -353,17 +355,19 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; - PfContainer stageNormalization; - stageNormalization.type = 32; - PfContainer normalizationValue; + PfContainer stageNormalization = VKFFT_ZERO_INIT; + stageNormalization.type = 22; + PfContainer normalizationValue = VKFFT_ZERO_INIT; normalizationValue.type = 31; normalizationValue.data.i = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle->data.d > 0))) && (stageSize->data.i == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle->data.d < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) + if (((sc->performDCT) || (sc->performDST)) && (sc->actualInverse)) { + if (sc->performDST == 1) + normalizationValue.data.i = (sc->sourceFFTSize.data.i + 1) * 2; + else if (sc->performDCT == 1) normalizationValue.data.i = (sc->sourceFFTSize.data.i - 1) * 2; else normalizationValue.data.i = sc->sourceFFTSize.data.i * 2; @@ -375,31 +379,31 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* normalizationValue.data.i *= sc->fft_dim_full.data.i; } if (normalizationValue.data.i != 1) { - stageNormalization.data.d = 1.0 / (long double)(normalizationValue.data.i); + stageNormalization.data.d = pfFPinit("1.0") / (pfLD)(normalizationValue.data.i); } char tempNum[50] = ""; - PfContainer logicalStoragePerThread; + PfContainer logicalStoragePerThread = VKFFT_ZERO_INIT; logicalStoragePerThread.type = 31; logicalStoragePerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - PfContainer logicalStoragePerThreadNext; + PfContainer logicalStoragePerThreadNext = VKFFT_ZERO_INIT; logicalStoragePerThreadNext.type = 31; logicalStoragePerThreadNext.data.i = sc->registers_per_thread_per_radix[stageRadixNext->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext->data.i == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - PfContainer logicalRegistersPerThread; + PfContainer logicalRegistersPerThread = VKFFT_ZERO_INIT; logicalRegistersPerThread.type = 31; logicalRegistersPerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i];// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalRegistersPerThreadNext; + PfContainer logicalRegistersPerThreadNext = VKFFT_ZERO_INIT; logicalRegistersPerThreadNext.type = 31; logicalRegistersPerThreadNext.data.i = sc->registers_per_thread_per_radix[stageRadixNext->data.i];// (sc->registers_per_thread % stageRadixNext->data.i == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalGroupSize; + PfContainer logicalGroupSize = VKFFT_ZERO_INIT; logicalGroupSize.type = 31; PfDivCeil(sc, &logicalGroupSize, &sc->fftDim, &logicalStoragePerThread); - PfContainer logicalGroupSizeNext; + PfContainer logicalGroupSizeNext = VKFFT_ZERO_INIT; logicalGroupSizeNext.type = 31; PfDivCeil(sc, &logicalGroupSizeNext, &sc->fftDim, &logicalStoragePerThreadNext); - if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || (sc->performDCT))) + if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || ((sc->performDCT) || (sc->performDST)))) { appendBarrierVkFFT(sc); } @@ -407,18 +411,18 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* PfMov(sc, &sc->sharedStride, &sc->sharedStrideReadWriteConflict); } //if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize->data.i < sc->fftDim / stageRadix->data.i) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle->data.d < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) { - if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || (sc->performDCT))) { + if ((!((sc->writeFromRegisters == 1) && (stageSize->data.i == sc->fftDim.data.i / stageRadix->data.i) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i < sc->fftDim.data.i / stageRadix->data.i) || ((sc->convolutionStep) && ((sc->matrixConvolution 
> 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d < 0)) || ((sc->performDCT) || (sc->performDST)))) { if (!((sc->registerBoost > 1) && (stageSize->data.i * stageRadix->data.i == sc->fftDim.data.i / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { PfContainer* tempID; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; + for (pfUINT k = 0; k < sc->registerBoost; ++k) { + pfUINT t = 0; if (sc->registerBoost > 1) { appendBarrierVkFFT(sc); @@ -440,10 +444,10 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* if (logicalGroupSize.data.i != sc->localSize[1].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &current_group_cut); @@ -461,14 +465,14 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize->data.i, j * logicalGroupSize, stageRadix->data.i);*/ - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); t++; - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { temp_int.data.i = i * stageSize->data.i; PfAdd(sc, 
&sc->sdataID, &sc->inoutID, &temp_int); @@ -484,8 +488,8 @@ static inline void appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize->data.i, sc->regIDs[id], stageNormalization);*/ } - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) <= sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -493,7 +497,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; if (logicalGroupSize.data.i != sc->localSize[1].data.i) { PfIf_end(sc); } - for (uint64_t j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[t + k * sc->registers_per_thread], &sc->regIDs[t + k * sc->registers_per_thread]); t++; } @@ -518,9 +522,9 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; PfDivCeil(sc, &temp_int, &sc->fftDim, &logicalRegistersPerThreadNext); PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &temp_int); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i; j++) { - for (uint64_t i = 0; i < (uint64_t)stageRadixNext->data.i; i++) { - PfContainer id; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i; j++) { + for (pfUINT i = 0; i < (pfUINT)stageRadixNext->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThreadNext.data.i / stageRadixNext->data.i + i * logicalStoragePerThreadNext.data.i / stageRadixNext->data.i; id.data.i = (id.data.i / logicalRegistersPerThreadNext.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThreadNext.data.i; @@ -548,10 +552,10 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } free(tempID); @@ -568,32 +572,32 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; tempID = (PfContainer*)calloc(sc->registers_per_thread * sc->registerBoost, sizeof(PfContainer)); if (tempID) { //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - PfAllocateContainerFlexible(sc, &tempID[i], 50); + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i].type = sc->regIDs[0].type; + PfAllocateContainerFlexible(sc, &tempID[i], 50); } if (sc->useDisableThreads) { temp_int.data.i = 0; 
PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT k = 0; k < sc->registerBoost; ++k) { + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; PfCopyContainer(sc, &tempID[j + i * logicalRegistersPerThread.data.i / stageRadix->data.i + k * sc->registers_per_thread], &sc->regIDs[id.data.i]); } } - for (uint64_t j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { + for (pfUINT j = logicalRegistersPerThread.data.i; j < sc->registers_per_thread; j++) { PfCopyContainer(sc, &tempID[j + k * sc->registers_per_thread], &sc->regIDs[j + k * sc->registers_per_thread]); } } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfCopyContainer(sc, &sc->regIDs[i], &tempID[i]); } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { + for (pfUINT i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { PfDeallocateContainer(sc, &tempID[i]); } free(tempID); @@ -617,8 +621,8 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &temp_int); } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle->data.d > 0))) { - for (uint64_t i = 0; i < (uint64_t)logicalStoragePerThread.data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)logicalStoragePerThread.data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = (i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + i % logicalRegistersPerThread.data.i; @@ -639,7 +643,7 @@ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s; } return; } -static inline void appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, PfContainer* stageSize, PfContainer* stageSizeSum, PfContainer* stageAngle, PfContainer* stageRadix, PfContainer* stageRadixNext, uint64_t stageID, uint64_t shuffleType) { +static inline void appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, PfContainer* stageSize, PfContainer* stageSizeSum, PfContainer* stageAngle, PfContainer* stageRadix, PfContainer* stageRadixNext, pfUINT stageID, pfUINT shuffleType) { if (sc->res != VKFFT_SUCCESS) return; if (sc->rader_generator[stageID] == 0) { switch (shuffleType) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixStage.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixStage.h index 56c7aa19..40ebcc81 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixStage.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RadixStage.h @@ -39,7 +39,7 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - 
temp_double.type = 32; + temp_double.type = 22; /*char convolutionInverse[10] = ""; if (sc->convolutionStep) { if (stageAngle->data.d < 0) @@ -47,17 +47,17 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout else sprintf(convolutionInverse, ", 1"); }*/ - PfContainer logicalStoragePerThread; + PfContainer logicalStoragePerThread = VKFFT_ZERO_INIT; logicalStoragePerThread.type = 31; logicalStoragePerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - PfContainer logicalRegistersPerThread; + PfContainer logicalRegistersPerThread = VKFFT_ZERO_INIT; logicalRegistersPerThread.type = 31; logicalRegistersPerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i];// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalGroupSize; + PfContainer logicalGroupSize = VKFFT_ZERO_INIT; logicalGroupSize.type = 31; PfDivCeil(sc, &logicalGroupSize, &sc->fftDim, &logicalStoragePerThread); - if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->localSize[1].data.i > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || (sc->performDCT))) + if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->localSize[1].data.i > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || ((sc->performDCT) || (sc->performDST)))) { appendBarrierVkFFT(sc); @@ -68,7 +68,7 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } //upload second stage of LUT to sm - uint64_t numLUTelementsStage = 0; + pfUINT numLUTelementsStage = 0; switch (stageRadix->data.i) { case 2: numLUTelementsStage = 1; @@ -97,14 +97,14 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout else sc->useCoalescedLUTUploadToSM = 0; - for (uint64_t k = 0; k < sc->registerBoost; k++) { + for (pfUINT k = 0; k < sc->registerBoost; k++) { if (logicalGroupSize.data.i != sc->localSize[0].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)(logicalRegistersPerThread.data.i / stageRadix->data.i); j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + 
for (pfUINT j = 0; j < (pfUINT)(logicalRegistersPerThread.data.i / stageRadix->data.i); j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &current_group_cut); @@ -125,10 +125,10 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout PfMul(sc, &sc->angle, &sc->stageInvocationID, &temp_double, 0); } - if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->localSize[1].data.i > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || (sc->performDCT)))) { + if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->localSize[1].data.i > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || ((sc->performDCT) || (sc->performDST))))) { //if(sc->readToRegisters==0){ - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + i * logicalRegistersPerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -158,20 +158,20 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout if (!sc->useCoalescedLUTUploadToSM) { PfContainer* regID = (PfContainer*)calloc(stageRadix->data.i, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; - PfAllocateContainerFlexible(sc, &regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } inlineRadixKernelVkFFT(sc, stageRadix->data.i, stageSize->data.i, stageSizeSum->data.i, stageAngle->data.d, 
regID); - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -187,7 +187,7 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout } } - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -210,12 +210,12 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } - for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)); i++) { + for (pfUINT i = 0; i < (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)); i++) { if (i > 0) { temp_int.data.i = sc->localSize[0].data.i * sc->localSize[1].data.i; PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); } - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { + if (i == (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { temp_int.data.i = numLUTelementsStage * stageSize->data.i; PfIf_lt_start(sc, &sc->sdataID, &temp_int); } @@ -223,7 +223,7 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout PfAdd(sc, &sc->inoutID, &sc->sdataID, &temp_int); appendGlobalToShared(sc, &sc->sdataID, &sc->LUTStruct, &sc->inoutID); - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { + if (i == (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { PfIf_end(sc); } } @@ -237,23 +237,23 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout if (logicalGroupSize.data.i != sc->localSize[0].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; 
current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_x, &current_group_cut); } PfContainer* regID = (PfContainer*)calloc(stageRadix->data.i, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; - PfAllocateContainerFlexible(sc, &regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } @@ -264,8 +264,8 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout inlineRadixKernelVkFFT(sc, stageRadix->data.i, stageSize->data.i, stageSizeSum->data.i, stageAngle->data.d, regID); - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -280,7 +280,7 @@ static inline void appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout sc->res = VKFFT_ERROR_MALLOC_FAILED; return; } - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -304,7 +304,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; /*char convolutionInverse[10] = ""; if (sc->convolutionStep) { if (stageAngle->data.d < 0) @@ -312,17 +312,17 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s else sprintf(convolutionInverse, ", 1"); }*/ - PfContainer logicalStoragePerThread; + PfContainer logicalStoragePerThread = VKFFT_ZERO_INIT; logicalStoragePerThread.type = 31; logicalStoragePerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i] * sc->registerBoost;// (sc->registers_per_thread % stageRadix->data.i == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - PfContainer logicalRegistersPerThread; + PfContainer logicalRegistersPerThread = VKFFT_ZERO_INIT; logicalRegistersPerThread.type = 31; logicalRegistersPerThread.data.i = sc->registers_per_thread_per_radix[stageRadix->data.i];// (sc->registers_per_thread % stageRadix->data.i == 0) ?
sc->registers_per_thread : sc->min_registers_per_thread; - PfContainer logicalGroupSize; + PfContainer logicalGroupSize = VKFFT_ZERO_INIT; logicalGroupSize.type = 31; PfDivCeil(sc, &logicalGroupSize, &sc->fftDim, &logicalStoragePerThread); - if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || (sc->performDCT))) + if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || ((sc->performDCT) || (sc->performDST)))) { appendBarrierVkFFT(sc); @@ -332,7 +332,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s PfIf_gt_start(sc, &sc->disableThreads, &temp_int); } //upload second stage of LUT to sm - uint64_t numLUTelementsStage = 0; + pfUINT numLUTelementsStage = 0; switch (stageRadix->data.i) { case 2: numLUTelementsStage = 1; @@ -361,14 +361,14 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s else sc->useCoalescedLUTUploadToSM = 0; - for (uint64_t k = 0; k < sc->registerBoost; k++) { + for (pfUINT k = 0; k < sc->registerBoost; k++) { if (logicalGroupSize.data.i != sc->localSize[1].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &current_group_cut); @@ -389,10 +389,10 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s PfMul(sc, &sc->angle, &sc->stageInvocationID, &temp_double, 0); } - if ((!((sc->readToRegisters == 1) && 
(stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || (sc->performDCT)))) { + if ((!((sc->readToRegisters == 1) && (stageSize->data.i == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle->data.d > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1].data.i * logicalStoragePerThread.data.i > sc->fftDim.data.i) || (stageSize->data.i > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels.data.i > 1)) && (stageAngle->data.d > 0)) || ((sc->performDCT) || (sc->performDST))))) { //if(sc->readToRegisters==0){ - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + i * logicalRegistersPerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -409,20 +409,20 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s if (!sc->useCoalescedLUTUploadToSM) { PfContainer* regID = (PfContainer*)calloc(stageRadix->data.i, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; - PfAllocateContainerFlexible(sc, &regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } inlineRadixKernelVkFFT(sc, stageRadix->data.i, stageSize->data.i, stageSizeSum->data.i, stageAngle->data.d, regID); - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -438,7 +438,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s } } - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { 
PfIf_end(sc); } } @@ -459,12 +459,12 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s PfMul(sc, &sc->tempInt, &sc->localSize[0], &sc->gl_LocalInvocationID_y, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); - for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)); i++) { + for (pfUINT i = 0; i < (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)); i++) { if (i > 0) { temp_int.data.i = sc->localSize[0].data.i * sc->localSize[1].data.i; PfAdd(sc, &sc->sdataID, &sc->sdataID, &temp_int); } - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { + if (i == (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { temp_int.data.i = numLUTelementsStage * stageSize->data.i; PfIf_lt_start(sc, &sc->sdataID, &temp_int); } @@ -472,7 +472,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s PfAdd(sc, &sc->inoutID, &sc->sdataID, &temp_int); appendGlobalToShared(sc, &sc->sdataID, &sc->LUTStruct, &sc->inoutID); - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { + if (i == (pfUINT)pfceil(numLUTelementsStage * stageSize->data.i / ((double)sc->localSize[0].data.i * sc->localSize[1].data.i)) - 1) { PfIf_end(sc); } } @@ -486,23 +486,23 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s if (logicalGroupSize.data.i != sc->localSize[1].data.i) { PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &logicalGroupSize); } - for (uint64_t j = 0; j < (uint64_t)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { - if (logicalGroupSize.data.i * ((int64_t)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { - PfContainer current_group_cut; + for (pfUINT j = 0; j < (pfUINT)logicalRegistersPerThread.data.i / stageRadix->data.i; j++) { + if (logicalGroupSize.data.i * ((pfINT)(j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) continue; + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + PfContainer current_group_cut = VKFFT_ZERO_INIT; current_group_cut.type = 31; current_group_cut.data.i = sc->fftDim.data.i / stageRadix->data.i - (j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * logicalGroupSize.data.i; PfIf_lt_start(sc, &sc->gl_LocalInvocationID_y, &current_group_cut); } PfContainer* regID = (PfContainer*)calloc(stageRadix->data.i, sizeof(PfContainer)); if (regID) { - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; - PfAllocateContainerFlexible(sc, 
&regID[i], 50); regID[i].type = sc->regIDs[id.data.i].type; + PfAllocateContainerFlexible(sc, &regID[i], 50); PfCopyContainer(sc, &regID[i], &sc->regIDs[id.data.i]); } @@ -513,8 +513,8 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s inlineRadixKernelVkFFT(sc, stageRadix->data.i, stageSize->data.i, stageSizeSum->data.i, stageAngle->data.d, regID); - for (uint64_t i = 0; i < (uint64_t)stageRadix->data.i; i++) { - PfContainer id; + for (pfUINT i = 0; i < (pfUINT)stageRadix->data.i; i++) { + PfContainer id = VKFFT_ZERO_INIT; id.type = 31; id.data.i = j + k * logicalRegistersPerThread.data.i / stageRadix->data.i + i * logicalStoragePerThread.data.i / stageRadix->data.i; id.data.i = (id.data.i / logicalRegistersPerThread.data.i) * sc->registers_per_thread + id.data.i % logicalRegistersPerThread.data.i; @@ -529,7 +529,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s sc->res = VKFFT_ERROR_MALLOC_FAILED; return; } - if (logicalGroupSize.data.i * ((int64_t)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { + if (logicalGroupSize.data.i * ((pfINT)(1 + j + k * logicalRegistersPerThread.data.i / stageRadix->data.i) * stageRadix->data.i) > sc->fftDim.data.i) { PfIf_end(sc); } } @@ -552,7 +552,7 @@ static inline void appendRadixStageStrided(VkFFTSpecializationConstantsLayout* s static inline void appendRadixStage(VkFFTSpecializationConstantsLayout* sc, PfContainer* stageSize, PfContainer* stageSizeSum, PfContainer* stageAngle, PfContainer* stageRadix, int stageID, int shuffleType) { if (sc->res != VKFFT_SUCCESS) return; if (sc->rader_generator[stageID]) { - for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { + for (pfUINT i = 0; i < sc->numRaderPrimes; i++) { if (sc->raderContainer[i].prime == stageRadix->data.i) { sc->currentRaderContainer = &sc->raderContainer[i]; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h index f8a978c6..d7c16e04 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h @@ -79,11 +79,15 @@ static inline void setReadToRegisters(VkFFTSpecializationConstantsLayout* sc, in } case 142: case 143: { + if (sc->performDST == 4) + sc->readToRegisters = 1; + else { #if(((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))) - sc->readToRegisters = 1; + sc->readToRegisters = 1; #else - sc->readToRegisters = 0; + sc->readToRegisters = 0; #endif + } break; } case 120: case 121: @@ -193,7 +197,7 @@ static inline void appendOffset(VkFFTSpecializationConstantsLayout* sc, int read } } } - int64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { PfDiv(sc, &sc->tempInt, &sc->gl_GlobalInvocationID_z, &sc->dispatchZactualFFTSize); temp_int.data.i = maxCoordinate; @@ -346,7 +350,7 @@ static inline void appendKernelOffset(VkFFTSpecializationConstantsLayout* sc, in } } } - int64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; + pfINT maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { PfDiv(sc, &sc->tempInt, &sc->gl_GlobalInvocationID_z, &sc->dispatchZactualFFTSize); temp_int.data.i = maxCoordinate; @@ -462,7 +466,10 @@ static inline void 
appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta PfInc(sc, &fftDim); } else if (type == 110) { - fftDim.data.i = (fftDim.data.i + 2) / 2; + if(sc->performDST > 0) + fftDim.data.i = (fftDim.data.i - 2) / 2; + else + fftDim.data.i = (fftDim.data.i + 2) / 2; } else if ((type == 142) && (readWrite == 0)) { fftDim.data.i = 2 * fftDim.data.i; @@ -588,12 +595,12 @@ static inline void appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta PfMul(sc, &sc->tempInt, &sc->tempInt, &temp_int, 0); PfAdd(sc, &sc->tempInt2, &sc->tempInt2, &sc->tempInt); - //sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (uint64_t)ceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); + //sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((pfUINT)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (pfUINT)pfceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((pfUINT)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); temp_int.data.i = sc->firstStageStartSize.data.i / sc->fftDim.data.i; PfDiv(sc, &sc->tempInt, &sc->gl_WorkGroupID_x, &temp_int); - temp_int1.data.i = ((int64_t)floor(sc->fft_dim_full.data.i / ((long double)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i); + temp_int1.data.i = ((pfINT)pffloor(sc->fft_dim_full.data.i / ((pfLD)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i); PfIf_eq_start(sc, &sc->tempInt, &temp_int1); - temp_int1.data.i = ((sc->fft_dim_full.data.i - (sc->firstStageStartSize.data.i / sc->fftDim.data.i) * ((((int64_t)floor(sc->fft_dim_full.data.i / ((long double)batching_localSize.data.i * sc->fftDim.data.i))) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)) * batching_localSize.data.i * sc->fftDim.data.i)) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)); + temp_int1.data.i = ((sc->fft_dim_full.data.i - (sc->firstStageStartSize.data.i / sc->fftDim.data.i) * ((((pfINT)pffloor(sc->fft_dim_full.data.i / ((pfLD)batching_localSize.data.i * sc->fftDim.data.i))) / 
(sc->firstStageStartSize.data.i / sc->fftDim.data.i)) * batching_localSize.data.i * sc->fftDim.data.i)) / (sc->firstStageStartSize.data.i / sc->fftDim.data.i)); PfMov(sc, &sc->blockInvocationID, &temp_int1); PfIf_else(sc); temp_int1.data.i = fftDim.data.i * batching_localSize.data.i; @@ -634,9 +641,9 @@ static inline void appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->combinedID); } } - //for (uint64_t k = 0; k < &sc->registerBoost; k++) { + //for (pfUINT k = 0; k < &sc->registerBoost; k++) { for (int k = 0; k < sc->registerBoost; k++) { - //for (uint64_t i = 0; i < used_registers; i++) { + //for (pfUINT i = 0; i < used_registers; i++) { for (int i = 0; i < used_registers.data.i; i++) { //combined thread numeration if (sc->localSize[1].data.i == 1) { @@ -801,6 +808,8 @@ static inline void appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta PfDiv(sc, &sc->tempInt, &sc->combinedID, &temp_int); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } + if (sc->performDST == 1) + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride); } else { if ((sc->reorderFourStep) && (readWrite == 1)) { @@ -826,6 +835,8 @@ static inline void appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta PfMul(sc, &sc->tempInt, &sc->tempInt, &sc->sharedStride, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->tempInt); } + if (sc->performDST == 1) + PfInc(sc, &sc->sdataID); } } if ((sc->zeropad[readWrite]) || ((sc->numAxisUploads > 1) && (sc->zeropadBluestein[readWrite]))) { @@ -962,14 +973,21 @@ static inline void appendReadWriteDataVkFFT_nonstrided(VkFFTSpecializationConsta } else { appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - appendRegistersToGlobal_x(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + if (sc->performDST == 1){ + PfMovNeg(sc, &sc->temp.data.c[1], &sc->temp.data.c[1]); + appendRegistersToGlobal_y(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + }else + appendRegistersToGlobal_x(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); if (sc->mergeSequencesR2C) { if ((sc->size[1].data.i % 2) != 0) { temp_int.data.i = sc->size[1].data.i - 1; PfIf_lt_start(sc, &sc->inoutID_y, &temp_int); } PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->outputStride[1]); - appendRegistersToGlobal_y(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + if (sc->performDST == 1) + appendRegistersToGlobal_x(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + else + appendRegistersToGlobal_y(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); if ((sc->size[1].data.i % 2) != 0) { PfIf_end(sc); } @@ -1093,7 +1111,10 @@ static inline void appendReadWriteDataVkFFT_strided(VkFFTSpecializationConstants fftDim.data.i = sc->fftDim.data.i; if (type == 111) { - fftDim.data.i = (fftDim.data.i + 2) / 2; + if(sc->performDST > 0) + fftDim.data.i = (fftDim.data.i - 2) / 2; + else + fftDim.data.i = (fftDim.data.i + 2) / 2; } else if ((type == 143) && (readWrite == 0)) { fftDim.data.i = 2 * fftDim.data.i; @@ -1275,9 +1296,9 @@ static inline void appendReadWriteDataVkFFT_strided(VkFFTSpecializationConstants } PfAdd(sc, &sc->inoutID, &sc->inoutID, &sc->shiftZ); - //for (uint64_t k = 0; k < &sc->registerBoost; k++) { + //for (pfUINT k = 0; k < &sc->registerBoost; k++) { for (int k = 0; k < sc->registerBoost; k++) { - //for (uint64_t i = 0; i < used_registers; i++) { + //for (pfUINT i = 0; i < used_registers; i++) { for (int i = 0; i < used_registers.data.i; i++) { temp_int1.data.i = (k * used_registers.data.i + i + 1) * sc->localSize[1].data.i; @@ -1339,6 +1360,8 @@ 
static inline void appendReadWriteDataVkFFT_strided(VkFFTSpecializationConstants } PfMul(sc, &sc->sdataID, &sc->sharedStride, &sc->sdataID, 0); PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->gl_LocalInvocationID_x); + if (sc->performDST == 1) + PfAdd(sc, &sc->sdataID, &sc->sdataID, &sc->sharedStride); } if ((sc->zeropad[readWrite]) || ((sc->numAxisUploads > 1) && (sc->zeropadBluestein[readWrite]))) { //sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); @@ -1461,7 +1484,11 @@ static inline void appendReadWriteDataVkFFT_strided(VkFFTSpecializationConstants } else { appendSharedToRegisters(sc, &sc->temp, &sc->sdataID); - appendRegistersToGlobal_x(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + if (sc->performDST == 1){ + PfMovNeg(sc, &sc->temp.data.c[1], &sc->temp.data.c[1]); + appendRegistersToGlobal_y(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); + }else + appendRegistersToGlobal_x(sc, &sc->outputsStruct, &sc->inoutID, &sc->temp); } } else if (type == 143) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RegisterBoost.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RegisterBoost.h index 7575d02e..eaf9db19 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RegisterBoost.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_RegisterBoost.h @@ -36,7 +36,7 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; PfContainer localSize = VKFFT_ZERO_INIT; localSize.type = 31; @@ -59,16 +59,16 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo localInvocationID = &sc->gl_LocalInvocationID_x; batchingInvocationID = &sc->gl_LocalInvocationID_y; } - int64_t logicalStoragePerThread; + pfINT logicalStoragePerThread; if (start == 1) { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } else { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } - int64_t logicalGroupSize = sc->fftDim.data.i / logicalStoragePerThread; + pfINT logicalGroupSize = sc->fftDim.data.i / logicalStoragePerThread; if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) { - for (int64_t k = 0; k < sc->registerBoost; k++) { + for (pfINT k = 0; k < sc->registerBoost; k++) { if (k > 0) { appendBarrierVkFFT(sc); } @@ -81,7 +81,7 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo PfDivCeil(sc, &temp_int1, &sc->fftDim, &temp_int); PfIf_lt_start(sc, localInvocationID, &temp_int1); - for (uint64_t i = 0; i < (uint64_t)logicalStoragePerThread / sc->registerBoost; i++) { + for (pfUINT i = 0; i < (pfUINT)logicalStoragePerThread / sc->registerBoost; i++) { temp_int.data.i = i * logicalGroupSize; PfAdd(sc, &sc->sdataID, localInvocationID, &temp_int); if (sc->stridedSharedLayout) { @@ -94,7 +94,7 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo } else { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { + for (pfUINT i = 0; i < sc->min_registers_per_thread; i++) { temp_int.data.i = i * localSize.data.i; PfAdd(sc, &sc->sdataID, localInvocationID, &temp_int); if (sc->stridedSharedLayout) { @@ -119,7 +119,7 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo PfDivCeil(sc, &temp_int1, &sc->fftDim, &temp_int); PfIf_lt_start(sc, localInvocationID, &temp_int1); - for (uint64_t i = 0; i < (uint64_t)logicalStoragePerThread / sc->registerBoost; i++) { + for (pfUINT i = 0; i < (pfUINT)logicalStoragePerThread / sc->registerBoost; i++) { temp_int.data.i = i * logicalGroupSize; PfAdd(sc, &sc->sdataID, localInvocationID, &temp_int); if (sc->stridedSharedLayout) { @@ -132,7 +132,7 @@ static inline void appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayo PfIf_end(sc); } else { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { + for (pfUINT i = 0; i < sc->min_registers_per_thread; i++) { temp_int.data.i = i * localSize.data.i; PfAdd(sc, &sc->sdataID, localInvocationID, &temp_int); if (sc->stridedSharedLayout) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_FFT.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_FFT.h index 0c079ac7..e65d899e 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_FFT.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_FFT.h @@ -57,7 +57,10 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, appendConstantsVkFFT(sc); //additional functions - if (((!sc->LUT) || (!sc->LUT_4step)) && (sc->floatTypeCode == 2)) { + if ((((sc->floatTypeCode/10)%10) == 3) || (((sc->floatTypeInputMemoryCode/10)%10) == 3) || (((sc->floatTypeOutputMemoryCode/10)%10) == 3)) { + appendQuadDoubleDoubleStruct(sc); + } + if (((!sc->LUT) || (!sc->LUT_4step)) && (((sc->floatTypeCode/10)%10) == 2)) { appendSinCos20(sc); } if ((sc->floatTypeCode != sc->floatTypeInputMemoryCode) || (sc->floatTypeCode != sc->floatTypeOutputMemoryCode)) { @@ -98,14 +101,14 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, appendRegisterInitialization(sc, type); - PfContainer stageSize; + PfContainer stageSize = VKFFT_ZERO_INIT; stageSize.type = 31; - PfContainer stageSizeSum; + PfContainer stageSizeSum = VKFFT_ZERO_INIT; stageSizeSum.type = 31; - PfContainer stageAngle; - stageAngle.type = 32; + PfContainer stageAngle = 
VKFFT_ZERO_INIT; + stageAngle.type = 22; - int64_t max_coordinate = 0; + pfINT max_coordinate = 0; if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) { max_coordinate = sc->matrixConvolution - 1; } @@ -119,7 +122,10 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, appendC2R_read(sc, type, 0); } if ((type == 110) || (type == 111)) { - appendDCTI_read(sc, type, 0); + if(sc->performDST==1) + appendDSTI_read(sc, type, 0); + else + appendDCTI_read(sc, type, 0); } if ((type == 120) || (type == 121)) { appendDCTII_read_III_write(sc, type, 0); @@ -174,7 +180,7 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, stageSizeSum.data.i += stageSize.data.i * 5; break; case 7: - stageSizeSum.data.i += stageSize.data.i * 6; + stageSizeSum.data.i += stageSize.data.i * 6; break; case 8: stageSizeSum.data.i += stageSize.data.i * 3; @@ -186,13 +192,19 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, stageSizeSum.data.i += stageSize.data.i * 9; break; case 11: - stageSizeSum.data.i += stageSize.data.i * 10; + if (sc->precision == 3) + stageSizeSum.data.i += stageSize.data.i * 11; + else + stageSizeSum.data.i += stageSize.data.i * 10; break; case 12: stageSizeSum.data.i += stageSize.data.i * 11; break; case 13: - stageSizeSum.data.i += stageSize.data.i * 12; + if (sc->precision == 3) + stageSizeSum.data.i += stageSize.data.i * 13; + else + stageSizeSum.data.i += stageSize.data.i * 12; break; case 14: stageSizeSum.data.i += stageSize.data.i * 13; @@ -258,7 +270,7 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, stageSizeSum.data.i = 0; stageAngle.data.d = sc->double_PI; sc->inverse = 1; - for (uint64_t i = 0; i < (uint64_t)sc->numStages; i++) { + for (pfUINT i = 0; i < (pfUINT)sc->numStages; i++) { temp_int.data.i = sc->stageRadix[i]; appendRadixStage(sc, &stageSize, &stageSizeSum, &stageAngle, &temp_int, (int)i, locType); if (i > 0) { @@ -279,7 +291,7 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, stageSizeSum.data.i += stageSize.data.i * 5; break; case 7: - stageSizeSum.data.i += stageSize.data.i * 6; + stageSizeSum.data.i += stageSize.data.i * 6; break; case 8: stageSizeSum.data.i += stageSize.data.i * 3; @@ -291,13 +303,19 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, stageSizeSum.data.i += stageSize.data.i * 9; break; case 11: - stageSizeSum.data.i += stageSize.data.i * 10; + if (sc->precision == 3) + stageSizeSum.data.i += stageSize.data.i * 11; + else + stageSizeSum.data.i += stageSize.data.i * 10; break; case 12: stageSizeSum.data.i += stageSize.data.i * 11; break; case 13: - stageSizeSum.data.i += stageSize.data.i * 12; + if (sc->precision == 3) + stageSizeSum.data.i += stageSize.data.i * 13; + else + stageSizeSum.data.i += stageSize.data.i * 12; break; case 14: stageSizeSum.data.i += stageSize.data.i * 13; @@ -316,7 +334,7 @@ static inline VkFFTResult shaderGen_FFT(VkFFTSpecializationConstantsLayout* sc, break; } } - if (i == sc->numStages - 1) { + if ((i == sc->numStages - 1) || (sc->registerBoost == 1)) { temp_int.data.i = sc->stageRadix[i]; temp_int1.data.i = sc->stageRadix[i]; diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_R2C_even_decomposition.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_R2C_even_decomposition.h index 9fc6782e..9a19a4cb 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_R2C_even_decomposition.h +++ 
b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_R2C_even_decomposition.h @@ -44,11 +44,13 @@ static inline VkFFTResult shaderGen_R2C_even_decomposition(VkFFTSpecializationCo PfContainer temp_int1 = VKFFT_ZERO_INIT; temp_int1.type = 31; PfContainer temp_double = VKFFT_ZERO_INIT; - temp_double.type = 32; + temp_double.type = 22; appendVersion(sc); appendExtensions(sc); appendLayoutVkFFT(sc); - + if ((((sc->floatTypeCode/10)%10) == 3) || (((sc->floatTypeInputMemoryCode/10)%10) == 3) || (((sc->floatTypeOutputMemoryCode/10)%10) == 3)) { + appendQuadDoubleDoubleStruct(sc); + } if (((!sc->LUT) || (!sc->LUT_4step)) && (sc->floatTypeCode == 2)) { appendSinCos20(sc); } @@ -132,20 +134,20 @@ static inline VkFFTResult shaderGen_R2C_even_decomposition(VkFFTSpecializationCo if (sc->size[0].data.i % 4 == 0) { if (!sc->inverse) { - PfMov_x_y(sc, &sc->regIDs[2], &sc->regIDs[0]); - PfMov_x_Neg_y(sc, &sc->regIDs[3], &sc->regIDs[0]); - PfAdd_x(sc, &sc->regIDs[2], &sc->regIDs[2], &sc->regIDs[0]); - PfAdd_x(sc, &sc->regIDs[3], &sc->regIDs[3], &sc->regIDs[0]); + PfMov(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[1]); + PfMovNeg(sc, &sc->regIDs[3].data.c[0], &sc->regIDs[0].data.c[1]); + PfAdd(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfAdd(sc, &sc->regIDs[3].data.c[0], &sc->regIDs[3].data.c[0], &sc->regIDs[0].data.c[0]); } else { - PfSub_x(sc, &sc->regIDs[2], &sc->regIDs[0], &sc->regIDs[1]); - PfMov_y_x(sc, &sc->regIDs[2], &sc->regIDs[2]); - PfAdd_x(sc, &sc->regIDs[2], &sc->regIDs[0], &sc->regIDs[1]); + PfSub(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[0]); + PfMov(sc, &sc->regIDs[2].data.c[1], &sc->regIDs[2].data.c[0]); + PfAdd(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[0]); } PfConjugate(sc, &sc->w, &sc->w); if (sc->inverse) { - temp_double.data.d = 2; + temp_double.data.d = pfFPinit("2.0"); PfMul(sc, &sc->w, &sc->w, &temp_double, 0); } @@ -159,15 +161,15 @@ static inline VkFFTResult shaderGen_R2C_even_decomposition(VkFFTSpecializationCo } else { if (!sc->inverse) { - PfMov_x_y(sc, &sc->regIDs[2], &sc->regIDs[0]); - PfMov_x_Neg_y(sc, &sc->regIDs[3], &sc->regIDs[0]); - PfAdd_x(sc, &sc->regIDs[2], &sc->regIDs[2], &sc->regIDs[0]); - PfAdd_x(sc, &sc->regIDs[3], &sc->regIDs[3], &sc->regIDs[0]); + PfMov(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[1]); + PfMovNeg(sc, &sc->regIDs[3].data.c[0], &sc->regIDs[0].data.c[1]); + PfAdd(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfAdd(sc, &sc->regIDs[3].data.c[0], &sc->regIDs[3].data.c[0], &sc->regIDs[0].data.c[0]); } else { - PfSub_x(sc, &sc->regIDs[2], &sc->regIDs[0], &sc->regIDs[1]); - PfMov_y_x(sc, &sc->regIDs[2], &sc->regIDs[2]); - PfAdd_x(sc, &sc->regIDs[2], &sc->regIDs[0], &sc->regIDs[1]); + PfSub(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[0]); + PfMov(sc, &sc->regIDs[2].data.c[1], &sc->regIDs[2].data.c[0]); + PfAdd(sc, &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[0]); } appendRegistersToGlobal(sc, &sc->outputsStruct, &sc->inoutID, &sc->regIDs[2]); @@ -180,7 +182,7 @@ static inline VkFFTResult shaderGen_R2C_even_decomposition(VkFFTSpecializationCo PfSub(sc, &sc->regIDs[3], &sc->regIDs[0], &sc->regIDs[1]); if (!sc->inverse) { - temp_double.data.d = 0.5l; + temp_double.data.d = pfFPinit("0.5"); PfMul(sc, &sc->regIDs[2], &sc->regIDs[2], &temp_double,0); PfMul(sc, &sc->regIDs[3], &sc->regIDs[3], &temp_double, 0); @@ -197,31 +199,31 @@ 
static inline VkFFTResult shaderGen_R2C_even_decomposition(VkFFTSpecializationCo } if (!sc->inverse) { PfConjugate(sc, &sc->w, &sc->w); - PfMov_x(sc, &sc->regIDs[0], &sc->regIDs[3]); - PfMov_y(sc, &sc->regIDs[0], &sc->regIDs[2]); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[3].data.c[0]); + PfMov(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[2].data.c[1]); PfMul(sc, &sc->regIDs[1], &sc->regIDs[0], &sc->w, 0); - PfMov_x_y(sc, &sc->regIDs[0], &sc->regIDs[1]); - PfMov_y_Neg_x(sc, &sc->regIDs[0], &sc->regIDs[1]); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[1]); + PfMovNeg(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[1].data.c[0]); - PfSub_x(sc, &sc->regIDs[1], &sc->regIDs[2], &sc->regIDs[0]); - PfSub_y(sc, &sc->regIDs[1], &sc->regIDs[0], &sc->regIDs[3]); + PfSub(sc, &sc->regIDs[1].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfSub(sc, &sc->regIDs[1].data.c[1], &sc->regIDs[0].data.c[1], &sc->regIDs[3].data.c[1]); - PfAdd_x(sc, &sc->regIDs[0], &sc->regIDs[2], &sc->regIDs[0]); - PfAdd_y(sc, &sc->regIDs[0], &sc->regIDs[0], &sc->regIDs[3]); + PfAdd(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfAdd(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[0].data.c[1], &sc->regIDs[3].data.c[1]); } else { - PfMov_x(sc, &sc->regIDs[0], &sc->regIDs[3]); - PfMov_y(sc, &sc->regIDs[0], &sc->regIDs[2]); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[3].data.c[0]); + PfMov(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[2].data.c[1]); PfMul(sc, &sc->regIDs[1], &sc->regIDs[0], &sc->w, 0); - PfMov_x_y(sc, &sc->regIDs[0], &sc->regIDs[1]); - PfMov_y_x(sc, &sc->regIDs[0], &sc->regIDs[1]); + PfMov(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[1].data.c[1]); + PfMov(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[1].data.c[0]); - PfAdd_x(sc, &sc->regIDs[1], &sc->regIDs[2], &sc->regIDs[0]); - PfSub_y(sc, &sc->regIDs[1], &sc->regIDs[0], &sc->regIDs[3]); + PfAdd(sc, &sc->regIDs[1].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfSub(sc, &sc->regIDs[1].data.c[1], &sc->regIDs[0].data.c[1], &sc->regIDs[3].data.c[1]); - PfSub_x(sc, &sc->regIDs[0], &sc->regIDs[2], &sc->regIDs[0]); - PfAdd_y(sc, &sc->regIDs[0], &sc->regIDs[0], &sc->regIDs[3]); + PfSub(sc, &sc->regIDs[0].data.c[0], &sc->regIDs[2].data.c[0], &sc->regIDs[0].data.c[0]); + PfAdd(sc, &sc->regIDs[0].data.c[1], &sc->regIDs[0].data.c[1], &sc->regIDs[3].data.c[1]); } appendRegistersToGlobal(sc, &sc->outputsStruct, &sc->inoutID, &sc->regIDs[0]); diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h index 148682d6..3ddec3a9 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h @@ -24,16 +24,27 @@ #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_StringManagement/vkFFT_StringManager.h" +static inline void PfPrintReg(VkFFTSpecializationConstantsLayout* sc, PfContainer* inoutID, PfContainer* in); //register manipulation functions: mov, add, sub, etc. 
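[Editor's aside, not part of the patch] The vkFFT_MathUtils.h helpers rewritten below dispatch on the PfContainer type code throughout: named registers versus immediate constants, scalar versus complex containers, and half/float/double versus the new double-double "quad" layout. As a reading aid only, inferred from the switch statements in this diff and not an official VkFFT definition, the code appears to decode as in this hypothetical sketch (helper names invented here):

/* Hypothetical reading aid, not VkFFT code: how the PfContainer type codes used below appear to decode. */
static int pf_is_named_register(int type) { return type > 100; }  /* <= 100 means an immediate constant */
static int pf_is_complex(int type) { return (type % 10) == 3; }   /* two components, stored in data.c[0..1] */
static int pf_is_double_double(int type) { return (((type % 100) / 10) == 3) && ((type % 10) == 2); } /* hi/lo pair in data.dd[0..1] */
/* The tens digit selects the width: for integers 0 uint / 1 int / 2 uint64 / 3 int64; for reals 0 half / 1 float / 2 double / 3 double-double. Under this reading, 31 is an int64 immediate, 22 a double immediate, and 13 a float2. */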
static inline void PfCopyContainer(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; - + if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfCopyContainer(sc, &out->data.dd[0], &in->data.dd[0]); + PfCopyContainer(sc, &out->data.dd[1], &in->data.dd[1]); + } if (out->type > 100) { if (in->type > 100) { if (out->type == in->type) { int len = 0; - len = sprintf(out->data.s, "%s", in->data.s); + len = sprintf(out->name, "%s", in->name); if (len > out->size) sc->res = VKFFT_ERROR_MATH_FAILED; + + switch (out->type % 10) { + case 3: + PfCopyContainer(sc, &out->data.c[0], &in->data.c[0]); + PfCopyContainer(sc, &out->data.c[1], &in->data.c[1]); + return; + } return; } } @@ -53,8 +64,8 @@ static inline void PfCopyContainer(VkFFTSpecializationConstantsLayout* sc, PfCon out->data.d = in->data.d; return; case 3: - out->data.c[0] = in->data.c[0]; - out->data.c[1] = in->data.c[1]; + out->data.c[0].data.d = in->data.c[0].data.d; + out->data.c[1].data.d = in->data.c[1].data.d; return; } } @@ -67,20 +78,94 @@ static inline void PfAllocateContainerFlexible(VkFFTSpecializationConstantsLayou if (sc->res != VKFFT_SUCCESS) return; if (container->size != 0) return; - container->type = 100; - container->data.s = (char*)calloc(size, sizeof(char)); - container->size = size; + if (container->type > 100){ + container->name = (char*)calloc(size, sizeof(char)); + container->size = size; - if (container->data.s == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; + if (container->name == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; + } + if(container->type < 200){ + if ((((container->type % 100) / 10) == 3) && ((container->type % 10) == 2)) { + if (container->data.dd == 0) container->data.dd = (PfContainer*) calloc(2, sizeof(PfContainer)); + if (container->data.dd == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; + container->data.dd[0].type = container->type-10; + container->data.dd[1].type = container->type-10; + PfAllocateContainerFlexible(sc, &container->data.dd[0], 50); + PfAllocateContainerFlexible(sc, &container->data.dd[1], 50); + } + else if ((container->type % 10) == 3){ + if (container->data.c == 0) container->data.c = (PfContainer*) calloc(2, sizeof(PfContainer)); + if (container->data.c == 0) sc->res = VKFFT_ERROR_MALLOC_FAILED; + container->data.c[0].type = container->type-1; + container->data.c[1].type = container->type-1; + PfAllocateContainerFlexible(sc, &container->data.c[0], 50); + PfAllocateContainerFlexible(sc, &container->data.c[1], 50); + } + } return; } + static inline void PfDeallocateContainer(VkFFTSpecializationConstantsLayout* sc, PfContainer* container) { if (container->type > 0) { - free(container->data.s); - container->data.s = 0; + if (container->type > 100) { + if (container->name) + free(container->name); + container->name = 0; + } container->size = 0; container->type = 0; + if(container->type < 200){ + if ((((container->type % 100) / 10) == 3) && ((container->type % 10) == 2)) { + PfDeallocateContainer(sc, &container->data.dd[0]); + PfDeallocateContainer(sc, &container->data.dd[1]); + if (container->data.dd) + free(container->data.dd); + container->data.dd = 0; + } + else if ((container->type % 10) == 3){ + PfDeallocateContainer(sc, &container->data.c[0]); + PfDeallocateContainer(sc, &container->data.c[1]); + if (container->data.c) + free(container->data.c); + container->data.c = 0; + } + } + } + return; +} + +static inline void PfConvToDoubleDouble(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { + 
if (sc->res != VKFFT_SUCCESS) return; + if ((in->type > 100) || (((in->type % 100) / 10) == 3)) { + if(out->type==0){ + out->type = in->type; + PfAllocateContainerFlexible(sc, out, 50); + } + PfCopyContainer(sc, out, in); + return; + }else{ + if(out->type==0){ + out->type = in->type + 10; + PfAllocateContainerFlexible(sc, out, 50); + } + double high, low; + if ((in->type % 10)== 2) { + high = (double) in->data.d; + if (isnan (high) || isinf (high)){ + low = 0.0; + }else{ + low = (double) (in->data.d - (pfLD)high); + double temp = high + low; + low = (high - temp) + low; + high = temp; + } + out->data.dd[0].data.d = high; + out->data.dd[1].data.d = low; + } + return; } + sc->res = VKFFT_ERROR_MATH_FAILED; + return; } static inline void PfGetTypeFromCode(VkFFTSpecializationConstantsLayout* sc, int code, PfContainer** type) { @@ -113,6 +198,9 @@ static inline void PfGetTypeFromCode(VkFFTSpecializationConstantsLayout* sc, int case 2: type[0] = &sc->doubleDef; return; + case 3: + type[0] = &sc->quadDef; + return; } break; case 3: @@ -126,6 +214,9 @@ static inline void PfGetTypeFromCode(VkFFTSpecializationConstantsLayout* sc, int case 2: type[0] = &sc->double2Def; return; + case 3: + type[0] = &sc->quad2Def; + return; } break; } @@ -137,15 +228,19 @@ static inline void PfAppendNumberLiteral(VkFFTSpecializationConstantsLayout* sc, if (((number->type % 10) == 2) || ((number->type % 10) == 3)) { switch ((number->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s", sc->halfLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", sc->halfLiteral.name); PfAppendLine(sc); return; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s", sc->floatLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", sc->floatLiteral.name); PfAppendLine(sc); return; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", sc->doubleLiteral.data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", sc->doubleLiteral.name); + PfAppendLine(sc); + return; + case 3: + sc->tempLen = sprintf(sc->tempStr, "%s", sc->doubleLiteral.name); PfAppendLine(sc); return; } @@ -164,12 +259,15 @@ static inline void PfAppendConversionStart(VkFFTSpecializationConstantsLayout* s case 2: switch ((out->type % 100) / 10) { case 0: -#if((VKFFT_BACKEND==0)||(VKFFT_BACKEND==5)) - sc->tempLen = sprintf(sc->tempStr, "half("); +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "float16_t("); PfAppendLine(sc); #elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "(half)"); PfAppendLine(sc); +#elif(VKFFT_BACKEND==5) + sc->tempLen = sprintf(sc->tempStr, "half("); + PfAppendLine(sc); #endif return; case 1: @@ -182,13 +280,24 @@ static inline void PfAppendConversionStart(VkFFTSpecializationConstantsLayout* s #endif return; case 2: + switch ((in->type % 100) / 10) { + case 0: case 1: case 2: #if((VKFFT_BACKEND==0)||(VKFFT_BACKEND==5)) - sc->tempLen = sprintf(sc->tempStr, "double("); - PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "double("); + PfAppendLine(sc); #elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "(double)"); - PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "(double)"); + PfAppendLine(sc); #endif + return; + case 3: + sc->tempLen = sprintf(sc->tempStr, "conv_pf_quad_to_double("); + PfAppendLine(sc); + return; + } + case 3: + sc->tempLen = sprintf(sc->tempStr, "conv_double_to_pf_quad("); + PfAppendLine(sc); return; } case 3: @@ -212,6 +321,8 @@ static inline void 
PfAppendConversionStart(VkFFTSpecializationConstantsLayout* s #endif return; case 2: + switch ((in->type % 100) / 10) { + case 0: case 1: case 2: #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "dvec2("); PfAppendLine(sc); @@ -220,6 +331,15 @@ static inline void PfAppendConversionStart(VkFFTSpecializationConstantsLayout* s PfAppendLine(sc); #endif return; + case 3: + sc->tempLen = sprintf(sc->tempStr, "conv_pf_quad2_to_double2("); + PfAppendLine(sc); + return; + } + case 3: + sc->tempLen = sprintf(sc->tempStr, "conv_double2_to_pf_quad2("); + PfAppendLine(sc); + return; } } sc->res = VKFFT_ERROR_MATH_FAILED; @@ -251,12 +371,23 @@ static inline void PfAppendConversionEnd(VkFFTSpecializationConstantsLayout* sc, #endif return; case 2: + switch ((in->type % 100) / 10) { + case 0: case 1: case 2: #if((VKFFT_BACKEND==0)||(VKFFT_BACKEND==5)) sc->tempLen = sprintf(sc->tempStr, ")"); PfAppendLine(sc); #elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) #endif return; + case 3: + sc->tempLen = sprintf(sc->tempStr, ")"); + PfAppendLine(sc); + return; + } + case 3: + sc->tempLen = sprintf(sc->tempStr, ")"); + PfAppendLine(sc); + return; } case 3: switch ((out->type % 100) / 10) { @@ -279,6 +410,8 @@ static inline void PfAppendConversionEnd(VkFFTSpecializationConstantsLayout* sc, #endif return; case 2: + switch ((in->type % 100) / 10) { + case 0: case 1: case 2: #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, ")"); PfAppendLine(sc); @@ -287,64 +420,107 @@ static inline void PfAppendConversionEnd(VkFFTSpecializationConstantsLayout* sc, PfAppendLine(sc); #endif return; + case 3: + sc->tempLen = sprintf(sc->tempStr, ")"); + PfAppendLine(sc); + return; + } + case 3: + sc->tempLen = sprintf(sc->tempStr, ")"); + PfAppendLine(sc); + return; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfDefine(VkFFTSpecializationConstantsLayout* sc, PfContainer* name) { +static inline void PfSetContainerName(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, const char* name) { if (sc->res != VKFFT_SUCCESS) return; - if (name->type > 100) { - switch (name->type % 10) { + if (out->type > 100) { + sprintf(out->name, "%s", name); + if(out->type < 200){ + if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)){ + sprintf(out->data.dd[0].name, "%s.x", name); + sprintf(out->data.dd[1].name, "%s.y", name); + }else{ + if (((out->type % 10) == 3) && (out->type > 100)) { + sprintf(out->data.c[0].name, "%s.x", name); + sprintf(out->data.c[1].name, "%s.y", name); + if (((out->type % 100) / 10) == 3){ + sprintf(out->data.c[0].data.dd[0].name, "%s.x.x", name); + sprintf(out->data.c[0].data.dd[1].name, "%s.x.y", name); + sprintf(out->data.c[1].data.dd[0].name, "%s.y.x", name); + sprintf(out->data.c[1].data.dd[1].name, "%s.y.y", name); + } + } + } + } + return; + } + return; +} +static inline void PfDefine(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, const char* name) { + if (sc->res != VKFFT_SUCCESS) return; + if (out->type > 100) { + PfSetContainerName(sc, out, name); + switch (out->type % 10) { case 1: - switch ((name->type % 100) / 10) { + switch ((out->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->uintDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->uintDef.name, name); PfAppendLine(sc); return; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->intDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->intDef.name, name); 
PfAppendLine(sc); return; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->uint64Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->uint64Def.name, name); PfAppendLine(sc); return; case 3: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->int64Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->int64Def.name, name); PfAppendLine(sc); return; } break; case 2: - switch ((name->type % 100) / 10) { + switch ((out->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->halfDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->halfDef.name, name); PfAppendLine(sc); return; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->floatDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->floatDef.name, name); PfAppendLine(sc); return; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->doubleDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->doubleDef.name, name); + PfAppendLine(sc); + return; + case 3: + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->quadDef.name, name); PfAppendLine(sc); return; } break; case 3: - switch ((name->type % 100) / 10) { + switch ((out->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->half2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->half2Def.name, name); PfAppendLine(sc); return; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->float2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->float2Def.name, name); PfAppendLine(sc); return; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->double2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->double2Def.name, name); + PfAppendLine(sc); + return; + case 3: + sc->tempLen = sprintf(sc->tempStr, "%s %s;\n", sc->quad2Def.name, name); PfAppendLine(sc); return; } @@ -355,25 +531,26 @@ static inline void PfDefine(VkFFTSpecializationConstantsLayout* sc, PfContainer* return; } static inline void PfDefineConstant(VkFFTSpecializationConstantsLayout* sc, PfContainer* name, PfContainer* value) { + //needs to be fixed for double-double if (sc->res != VKFFT_SUCCESS) return; if (name->type > 100) { switch (name->type % 10) { case 1: switch ((name->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->uintDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->uintDef.name, name->name); PfAppendLine(sc); break; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->intDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->intDef.name, name->name); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->uint64Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->uint64Def.name, name->name); PfAppendLine(sc); break; case 3: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->int64Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->int64Def.name, name->name); PfAppendLine(sc); break; } @@ -381,15 +558,15 @@ static inline void PfDefineConstant(VkFFTSpecializationConstantsLayout* sc, PfCo case 2: switch ((name->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s 
%s", sc->constDef.data.s, sc->halfDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->halfDef.name, name->name); PfAppendLine(sc); break; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->floatDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->floatDef.name, name->name); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->doubleDef.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->doubleDef.name, name->name); PfAppendLine(sc); break; } @@ -397,28 +574,30 @@ static inline void PfDefineConstant(VkFFTSpecializationConstantsLayout* sc, PfCo case 3: switch ((name->type % 100) / 10) { case 0: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->half2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->half2Def.name, name->name); PfAppendLine(sc); break; case 1: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->float2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->float2Def.name, name->name); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.data.s, sc->double2Def.data.s, name->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s %s %s", sc->constDef.name, sc->double2Def.name, name->name); PfAppendLine(sc); break; } break; } if (value->type < 100) { + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); switch (value->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", value->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", value->data.d); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)value->data.d); PfAppendLine(sc); break; case 3: @@ -427,6 +606,8 @@ static inline void PfDefineConstant(VkFFTSpecializationConstantsLayout* sc, PfCo break; } PfAppendNumberLiteral(sc, name); + sc->tempLen = sprintf(sc->tempStr, ";"); + PfAppendLine(sc); return; } } @@ -437,133 +618,212 @@ static inline void PfDefineConstant(VkFFTSpecializationConstantsLayout* sc, PfCo static inline void PfSetToZero(VkFFTSpecializationConstantsLayout* sc, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; //out - if (out->type > 100) { - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - switch (out->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "0"); - PfAppendLine(sc); - break; - case 2: case 3: - sc->tempLen = sprintf(sc->tempStr, "0.0"); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); + if ((out->type % 10) == 3){ + PfSetToZero(sc, &out->data.c[0]); + PfSetToZero(sc, &out->data.c[1]); + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfSetToZero(sc, &out->data.dd[0]); + PfSetToZero(sc, &out->data.dd[1]); + return; + } + else{ + if (out->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = 
"); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "0.0"); - PfAppendLine(sc); + switch (out->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "0"); + PfAppendLine(sc); + break; + case 2: case 3: + sc->tempLen = sprintf(sc->tempStr, "0.0"); + PfAppendLine(sc); + break; + } PfAppendNumberLiteral(sc, out); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); - break; - } - return; - } - else { - switch (out->type % 10) { - case 1: - out->data.i = 0; - return; - case 2: - out->data.d = 0; - return; - case 3: - out->data.c[0] = 0; - out->data.c[1] = 0; return; } + else { + switch (out->type % 10) { + case 1: + out->data.i = 0; + return; + case 2: + out->data.d = 0; + return; + } + } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } static inline void PfSetToZeroShared(VkFFTSpecializationConstantsLayout* sc, PfContainer* sdataID) { if (sc->res != VKFFT_SUCCESS) return; - if (sdataID->type > 100) { - switch (sdataID->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "\ + if(sc->storeSharedComplexComponentsSeparately){ + if ((((sc->sdataStruct.type % 100) / 10) == 3) && ((sc->sdataStruct.type % 10) > 1)) { + if (sdataID->type > 100) { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].x = 0;\n\ -sdata[%s].y = 0;\n", sdataID->data.s, sdataID->data.s); - PfAppendLine(sc); - return; +sdata[%s].y = 0;\n", sdataID->name, sdataID->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s + %" PRIi64 "].x = 0;\n\ +sdata[%s + %" PRIi64 "].y = 0;\n", sdataID->name, sc->offsetImaginaryShared.data.i, sdataID->name, sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + return; + } + } + else { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%" PRIi64 "].x = 0;\n\ +sdata[%" PRIi64 "].y = 0;\n", sdataID->data.i, sdataID->data.i); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%" PRIi64 "].x = 0;\n\ +sdata[%" PRIi64 "].y = 0;\n", sdataID->data.i + sc->offsetImaginaryShared.data.i, sdataID->data.i + sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + return; + } + } } - } - else { - switch (sdataID->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "\ + if (sdataID->type > 100) { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s] = 0;\n\ +sdata[%s + %" PRIi64 "] = 0;\n", sdataID->name, sdataID->name, sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + return; + } + } + else { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%" PRIi64 "] = 0;\n\ +sdata[%" PRIi64 "] = 0;\n", sdataID->data.i, sdataID->data.i + sc->offsetImaginaryShared.data.i); + PfAppendLine(sc); + return; + } + } + }else{ + if ((((sc->sdataStruct.type % 100) / 10) == 3) && ((sc->sdataStruct.type % 10) > 1)) { + if (sdataID->type > 100) { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].x.x = 0;\n\ +sdata[%s].x.y = 0;\n", sdataID->name, sdataID->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].y.x = 0;\n\ +sdata[%s].y.y = 0;\n", sdataID->name, sdataID->name); + PfAppendLine(sc); + return; + } + } + else { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%" PRIi64 "].x.x = 0;\n\ +sdata[%" PRIi64 "].x.y = 0;\n", sdataID->data.i, sdataID->data.i); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%" PRIi64 "].y.x = 0;\n\ +sdata[%" PRIi64 "].y.y = 0;\n", 
sdataID->data.i, sdataID->data.i); + PfAppendLine(sc); + return; + } + } + } + if (sdataID->type > 100) { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +sdata[%s].x = 0;\n\ +sdata[%s].y = 0;\n", sdataID->name, sdataID->name); + PfAppendLine(sc); + return; + } + } + else { + switch (sdataID->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ sdata[%" PRIi64 "].x = 0;\n\ sdata[%" PRIi64 "].y = 0;\n", sdataID->data.i, sdataID->data.i); - PfAppendLine(sc); - return; + PfAppendLine(sc); + return; + } } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } - static inline void PfMov(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { + if ((out->type % 10) == 3){ if ((out->type > 100) && (in->type > 100) && ((out->type % 10) == (in->type % 10))) { //packed instructions workaround if all values are in registers - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); + sc->tempLen = sprintf(sc->tempStr, "%s", in->name); PfAppendLine(sc); PfAppendConversionEnd(sc, out, in); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); return; } - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); + PfMov(sc, &out->data.c[0], &in->data.c[0]); + PfMov(sc, &out->data.c[1], &in->data.c[1]); + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfContainer temp = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp, in); + PfMov(sc, &out->data.dd[0], &temp.data.dd[0]); + PfMov(sc, &out->data.dd[1], &temp.data.dd[1]); + PfDeallocateContainer(sc, &temp); + return; + } + if (out->type > 100) { + if ((out->type > 100) && (in->type > 100) && ((out->type % 10) == (in->type % 10))) { + //packed instructions workaround if all values are in registers + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); + sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - break; + PfAppendConversionStart(sc, out, in); + sc->tempLen = sprintf(sc->tempStr, "%s", in->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + return; } + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, out, in); if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in->name); + PfAppendLine(sc); } else { switch (in->type % 10) { @@ -572,11 +832,7 @@ static inline void PfMov(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in->data.d); PfAppendLine(sc); break; } @@ -585,48 +841,6 @@ static inline void 
PfMov(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou PfAppendConversionEnd(sc, out, in); sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); - - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - break; - } return; } else { @@ -640,7 +854,7 @@ static inline void PfMov(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou out->data.i = in->data.i; return; case 2: - out->data.i = (int64_t)in->data.d; + out->data.i = (pfINT)in->data.d; return; } return; @@ -654,57 +868,45 @@ static inline void PfMov(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou return; } return; - case 3: - out->data.c[0] = in->data.c[0]; - out->data.c[1] = in->data.c[1]; - return; } } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } - -static inline void PfMov_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { +static inline void PfMovNeg(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { if (sc->res != VKFFT_SUCCESS) return; + if ((out->type % 10) == 3){ + PfMovNeg(sc, &out->data.c[0], &in->data.c[0]); + PfMovNeg(sc, &out->data.c[1], &in->data.c[1]); + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfContainer temp = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp, in); + PfMovNeg(sc, &out->data.dd[0], &temp.data.dd[0]); + PfMovNeg(sc, &out->data.dd[1], &temp.data.dd[1]); + PfDeallocateContainer(sc, &temp); + return; + } if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); PfAppendConversionStart(sc, out, in); if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "-%s", in->name); + PfAppendLine(sc); } else { switch (in->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in->data.i); + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (-in->data.d)); 
PfAppendLine(sc); break; } @@ -716,4641 +918,249 @@ static inline void PfMov_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* return; } else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { + if (in->type > 100) { + } + else { + switch (out->type % 10) { + case 1: switch (in->type % 10) { case 1: - out->data.c[0] = (long double)in->data.i; + out->data.i = -in->data.i; return; case 2: - out->data.c[0] = in->data.d; + out->data.i = (pfINT)-in->data.d; return; - case 3: - out->data.c[0] = in->data.c[0]; + } + return; + case 2: + switch (in->type % 10) { + case 1: + out->data.d = (double)-in->data.i; + return; + case 2: + out->data.d = -in->data.d; return; } + return; } } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfMov_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[1]); - PfAppendLine(sc); - break; + +static inline void PfAdd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2); + +static inline void PfSub(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2); + +static inline void PfQuadQuickSum(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) {// double-double, double, double + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + if ((in_2->type % 10) == 3){ + PfQuadQuickSum(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0]); + PfQuadQuickSum(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1]); + }else{ + PfQuadQuickSum(sc, &out->data.c[0], &in_1->data.c[0], in_2); + PfQuadQuickSum(sc, &out->data.c[1], &in_1->data.c[1], in_2); + } + }else{ + if ((in_2->type % 10) == 3){ + PfQuadQuickSum(sc, &out->data.c[0], in_1, &in_2->data.c[0]); + PfQuadQuickSum(sc, &out->data.c[1], in_1, &in_2->data.c[1]); + }else{ + PfQuadQuickSum(sc, &out->data.c[0], in_1, in_2); + PfMov(sc, &out->data.c[1], &out->data.c[0]); } - PfAppendNumberLiteral(sc, out); } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); return; } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 1: - out->data.c[1] = (long double)in->data.i; - return; - case 2: - out->data.c[1] = in->data.d; - return; - case 3: - out->data.c[1] = in->data.c[1]; - return; - } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfAdd(sc, &out->data.dd[0], in_1, in_2); + PfSub(sc, &out->data.dd[1], &out->data.dd[0], in_1); + PfSub(sc, 
&out->data.dd[1], in_2, &out->data.dd[1]); + } + return; +} + +static inline void PfQuadSum(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) {// double-double, double, double + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + if ((in_2->type % 10) == 3){ + PfQuadSum(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0], temp); + PfQuadSum(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1], temp); + }else{ + PfQuadSum(sc, &out->data.c[0], &in_1->data.c[0], in_2, temp); + PfQuadSum(sc, &out->data.c[1], &in_1->data.c[1], in_2, temp); + } + }else{ + if ((in_2->type % 10) == 3){ + PfQuadSum(sc, &out->data.c[0], in_1, &in_2->data.c[0], temp); + PfQuadSum(sc, &out->data.c[1], in_1, &in_2->data.c[1], temp); + }else{ + PfQuadSum(sc, &out->data.c[0], in_1, in_2, temp); + PfMov(sc, &out->data.c[1], &out->data.c[0]); } } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfAdd(sc, &out->data.dd[0], in_1, in_2); + PfSub(sc, &out->data.dd[1], &out->data.dd[0], in_1); + PfSub(sc, temp, &out->data.dd[0], &out->data.dd[1]); + PfSub(sc, temp, in_1, temp); + PfSub(sc, &out->data.dd[1], in_2, &out->data.dd[1]); + PfAdd(sc, &out->data.dd[1], &out->data.dd[1], temp); } - sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfMov_x_Neg_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { +static inline void PfAdd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); + if ((out->type % 10) == 3){ +#if(VKFFT_BACKEND == 2) + if ((in_1->type > 100) && (in_2->type > 100) && (((out->type % 100) / 10) != 3)) { + //packed instructions workaround if all values are in registers + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; return; } +#endif + if ((in_2->type % 10) == 3){ + PfAdd(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0]); + PfAdd(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1]); + }else{ + PfAdd(sc, &out->data.c[0], &in_1->data.c[0], in_2); + PfAdd(sc, &out->data.c[1], &in_1->data.c[1], in_2); + } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfContainer temp1 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp1, in_1); + PfContainer temp2 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp2, in_2); + PfQuadSum(sc, &sc->tempQuad.data.c[0], &temp1.data.dd[0], &temp2.data.dd[0], &sc->tempQuad3.data.c[0].data.dd[0]); + PfAdd(sc, &out->data.dd[0], &temp1.data.dd[1], &temp2.data.dd[1]); + PfAdd(sc, &sc->tempQuad.data.c[0].data.dd[1], &sc->tempQuad.data.c[0].data.dd[1], &out->data.dd[0]); + PfQuadQuickSum(sc, out, &sc->tempQuad.data.c[0].data.dd[0], &sc->tempQuad.data.c[0].data.dd[1]); + 
PfDeallocateContainer(sc, &temp1); + PfDeallocateContainer(sc, &temp2); + return; + } + if (out->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "-%s", in->data.s); - PfAppendLine(sc); + if ((in_1->type < 100) && (in_2->type < 100)) { + switch (in_1->type % 10) { + case 1: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i + in_2->data.d)); + PfAppendLine(sc); + break; + } break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.x", in->data.s); - PfAppendLine(sc); + case 2: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d + (pfLD)in_2->data.i)); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d + in_2->data.d)); + PfAppendLine(sc); + break; + } break; } + PfAppendNumberLiteral(sc, out); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); } else { - switch (in->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.d); + PfAppendConversionStart(sc, out, in_1); + if (in_1->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.c[0]); + } + else { + switch (in_1->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); + PfAppendLine(sc); + break; + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + if (in_2->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); PfAppendLine(sc); - break; } - PfAppendNumberLiteral(sc, out); + else { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); + PfAppendLine(sc); + break; + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); return; } else { switch (out->type % 10) { - case 3: - if (in->type > 100) { + case 1: + if (in_1->type > 100) { } else { - switch (in->type % 10) { - case 1: - out->data.c[0] = (long double)-in->data.i; - return; - case 2: - out->data.c[0] = -in->data.d; - return; - case 3: - out->data.c[0] = -in->data.c[0]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfMov_y_Neg_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, 
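/*
 * Illustration, not part of the diff: the double-double path of PfAdd above
 * applies PfQuadSum to the high words, folds both low words into the error
 * term, and renormalizes with PfQuadQuickSum (not shown here; presumably the
 * usual quick two-sum, which assumes the first operand dominates). A plain-C
 * sketch of the same scheme, reusing two_sum() from the sketch above;
 * dd_t and dd_add are hypothetical names.
 */
typedef struct { double hi, lo; } dd_t;   /* hypothetical double-double pair   */

static inline dd_t dd_add(dd_t a, dd_t b) {
    double s, err;
    two_sum(a.hi, b.hi, &s, &err);        /* error-free sum of the high words  */
    err += a.lo + b.lo;                   /* fold both low words into the error*/
    dd_t r;
    r.hi = s + err;                       /* quick two-sum renormalization     */
    r.lo = err - (r.hi - s);
    return r;
}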
"%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "-%s", in->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.y", in->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 1: - out->data.c[1] = (long double)-in->data.i; - return; - case 2: - out->data.c[1] = -in->data.d; - return; - case 3: - out->data.c[1] = -in->data.c[1]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfMov_x_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - } - else { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[1]); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 3: - out->data.c[0] = in->data.c[1]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfMov_x_Neg_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.y", in->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - } - else { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.c[1]); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } 
- PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 3: - out->data.c[0] = -in->data.c[1]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfMov_y_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - } - else { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in->data.c[0]); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 3: - out->data.c[1] = in->data.c[0]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfMov_y_Neg_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in); - if (in->type > 100) { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.x", in->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - } - else { - switch (in->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in->data.c[0]); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - switch (out->type % 10) { - case 3: - if (in->type > 100) { - } - else { - switch (in->type % 10) { - case 3: - out->data.c[1] = -in->data.c[0]; - return; - } - } - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfAdd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { -#if(VKFFT_BACKEND == 2) - if ((in_1->type > 100) && (in_2->type > 100)) { - //packed instructions workaround if all values are in registers - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, 
"%s", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } -#endif - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = 
sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - break; - } - return; - } - else { - switch (out->type % 
10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = in_1->data.i + in_2->data.i; - return; - case 2: - out->data.i = in_1->data.i + (int64_t)in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = (int64_t)in_1->data.d + in_2->data.i; - return; - case 2: - out->data.i = (int64_t)(in_1->data.d + in_2->data.d); - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = (long double)(in_1->data.i + in_2->data.i); - return; - case 2: - out->data.d = (long double)in_1->data.i + in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d + (long double)in_2->data.i; - return; - case 2: - out->data.d = in_1->data.d + in_2->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] + (long double)in_2->data.i; - out->data.c[1] = in_1->data.c[1] + (long double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] + in_2->data.d; - out->data.c[1] = in_1->data.c[1] + in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] + in_2->data.c[0]; - out->data.c[1] = in_1->data.c[1] + in_2->data.c[1]; - return; - } - } - break; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfAdd_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, 
"%.17Le", in_1->data.c[0] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] + (double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] + in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] + in_2->data.c[0]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfAdd_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = 
sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[1] = in_1->data.c[1] + (double)in_2->data.i; - return; - case 2: - out->data.c[1] = in_1->data.c[1] + in_2->data.d; - return; - case 3: - out->data.c[1] = in_1->data.c[1] + in_2->data.c[1]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfAdd_x_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 
100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); -} - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] + (double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] + in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] + in_2->data.c[1]; - 
return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfAdd_y_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i + in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; -} - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); -} - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - 
sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[1] = in_1->data.c[1] + (double)in_2->data.i; - return; - case 2: - out->data.c[1] = in_1->data.c[1] + in_2->data.d; - return; - case 3: - out->data.c[1] = in_1->data.c[1] + in_2->data.c[0]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfAddInv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { -#if(VKFFT_BACKEND == 2) - if ((in_1->type > 100) && (in_2->type > 100)) { - //packed instructions workaround if all values are in registers - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "-%s", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } -#endif - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -(long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -(long double)in_1->data.i - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[0] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[0] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[0] - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "-%s", in_1->data.s); - 
PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "(-%s)", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "(-%s.x)", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "(%" PRIi64 ")", -in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "(%.17Le)", -in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "(%.17Le)", -in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -(long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -(long double)in_1->data.i - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[1] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[1] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[1] - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "-%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "-%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", -in_1->data.i); - 
PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", -in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "(-%s)", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "(-%s.y)", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "(%" PRIi64 ")", -in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "(%.17Le)", -in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "(%.17Le)", -in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - break; - } - return; - } - else { - switch (out->type % 10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = -in_1->data.i - in_2->data.i; - return; - case 2: - out->data.i = -in_1->data.i - (int64_t)in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = -(int64_t)in_1->data.d - in_2->data.i; - return; - case 2: - out->data.i = -(int64_t)(in_1->data.d + in_2->data.d); - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = -(long double)(in_1->data.i + in_2->data.i); - return; - case 2: - out->data.d = -(long double)in_1->data.i - in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = -in_1->data.d - (long double)in_2->data.i; - return; - case 2: - out->data.d = -in_1->data.d - in_2->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = -in_1->data.c[0] - (long double)in_2->data.i; - out->data.c[1] = -in_1->data.c[1] - (long double)in_2->data.i; - return; - case 2: - out->data.c[0] = -in_1->data.c[0] - in_2->data.d; - out->data.c[1] = -in_1->data.c[1] - in_2->data.d; - return; - case 3: - out->data.c[0] = -in_1->data.c[0] - in_2->data.c[0]; - out->data.c[1] = -in_1->data.c[1] - in_2->data.c[1]; - return; - } - } - break; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfInc(VkFFTSpecializationConstantsLayout* sc, PfContainer* out) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 1: - case 2: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s + 1;\n", out->data.s, out->data.s); 
- PfAppendLine(sc); - return; - case 3: - break; - } - } - else { - switch (out->type % 10) { - case 1: - out->data.i = out->data.i + 1; - return; - case 2: - out->data.d = out->data.d + 1; - return; - break; - case 3: - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfSub(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { -#if(VKFFT_BACKEND == 2) - if ((in_1->type > 100) && (in_2->type > 100)) { - //packed instructions workaround if all values are in registers - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } -#endif - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; 
- case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, 
in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - break; - } - return; - } - else { - switch (out->type % 10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = in_1->data.i - in_2->data.i; - return; - case 2: - out->data.i = in_1->data.i - (int64_t)in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = (int64_t)in_1->data.d - in_2->data.i; - return; - case 2: - out->data.i = (int64_t)(in_1->data.d - in_2->data.d); - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = (long double)(in_1->data.i - in_2->data.i); - return; - case 2: - out->data.d = (long double)in_1->data.i - in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d - (long double)in_2->data.i; - return; - case 2: - out->data.d = in_1->data.d - in_2->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] - (long double)in_2->data.i; - out->data.c[1] = in_1->data.c[1] - (long double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] - in_2->data.d; - out->data.c[1] = in_1->data.c[1] - in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] - in_2->data.c[0]; - out->data.c[1] = in_1->data.c[1] - in_2->data.c[1]; - return; - } - } - break; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfSub_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i 
- in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] - (double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] - in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] - in_2->data.c[0]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfSub_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch 
(out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 
100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[1] = in_1->data.c[1] - (double)in_2->data.i; - return; - case 2: - out->data.c[1] = in_1->data.c[1] - in_2->data.d; - return; - case 3: - out->data.c[1] = in_1->data.c[1] - in_2->data.c[1]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfSub_x_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); -} - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen 
= sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] - (double)in_2->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] - in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] - in_2->data.c[1]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfSub_y_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - break; - default: - sc->res = VKFFT_ERROR_MATH_FAILED; - return; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i - in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - 
PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - else { - switch (out->type % 10) { - case 1: - break; - case 2: - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.c[1] = in_1->data.c[1] - (double)in_2->data.i; - return; - case 2: - out->data.c[1] = in_1->data.c[1] - in_2->data.d; - return; - case 3: - out->data.c[1] = in_1->data.c[1] - in_2->data.c[0]; - return; - } - } - } - } - break; - - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfFMA(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* in_3) { - //fma inlining is not correct if all three numbers are complex for now - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { -#if(VKFFT_BACKEND == 2) - if ((in_1->type > 100) && (in_2->type > 100) && (in_3->type > 100)) { - //packed instructions workaround if all values are in registers - if (((in_1->type % 10) != 3) || ((in_2->type % 10) != 3)) { - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", in_3->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - } -#endif - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i + in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)(in_1->data.i * 
in_2->data.i) + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)(in_1->data.i * in_2->data.i) + in_3->data.c[0]); - PfAppendLine(sc); - break; -} - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0] + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.i + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.i + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0] + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * (long double)in_2->data.i + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.i + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.i + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - 
sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1] + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type > 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_3); - switch (in_3->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_3->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_3->data.s); - PfAppendLine(sc); - break; - } - PfAppendConversionEnd(sc, out, in_3); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "fma("); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - 
PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ", "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ", "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_3); - if (in_3->type > 100) { - switch (in_3->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_3->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_3->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_3); - sc->tempLen = sprintf(sc->tempStr, ");\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i + in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)(in_1->data.i * in_2->data.i) + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)(in_1->data.i * in_2->data.i) + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[1] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[1] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[1] + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - break; - case 2: - 
switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.i + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.i + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1] + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * (long double)in_2->data.i + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.i + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.i + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d + in_3->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0] + (long double)in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0] + in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0] + in_3->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - } - else if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type > 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long 
double)in_1->data.i * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_3); - switch (in_3->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_3->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_3->data.s); - PfAppendLine(sc); - break; - } - PfAppendConversionEnd(sc, out, in_3); - } - else { - sc->tempLen = sprintf(sc->tempStr, "fma("); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ", "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ", "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_3); - if (in_3->type > 100) { - switch (in_3->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_3->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_3->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_3->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = 
sprintf(sc->tempStr, "%.17Le", in_3->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_3->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_3); - sc->tempLen = sprintf(sc->tempStr, ");\n"); - PfAppendLine(sc); - } - } - - return; - } - else { - switch (out->type % 10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.i = in_1->data.i * in_2->data.i + in_3->data.i; - return; - case 2: - out->data.i = in_1->data.i * in_2->data.i + (int64_t)in_3->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.i = (int64_t)(in_1->data.i * in_2->data.d + in_3->data.i); - return; - case 2: - out->data.i = (int64_t)(in_1->data.i * in_2->data.d + in_3->data.d); - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.i = (int64_t)(in_1->data.d * in_2->data.i + in_3->data.i); - return; - case 2: - out->data.i = (int64_t)(in_1->data.d * in_2->data.i + in_3->data.d); - return; - case 3: - break; - } - } - break; - case 2: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.i = (int64_t)(in_1->data.d * in_2->data.d + in_3->data.i); - return; - case 2: - out->data.i = (int64_t)(in_1->data.d * in_2->data.d + in_3->data.d); - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.d = (long double)(in_1->data.i * in_2->data.i + in_3->data.i); - return; - case 2: - out->data.d = (long double)(in_1->data.i * in_2->data.i + in_3->data.d); - return; - case 3: - break; - } - } - break; - case 2: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.d = in_1->data.i * in_2->data.d + in_3->data.i; - return; - case 2: - out->data.d = in_1->data.i * in_2->data.d + in_3->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.d = in_1->data.d * in_2->data.i + in_3->data.i; - return; - case 2: - out->data.d = in_1->data.d * in_2->data.i + in_3->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.d = in_1->data.d * in_2->data.d + in_3->data.i; - return; - case 2: - out->data.d = in_1->data.d * in_2->data.d + in_3->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - case 2: - break; - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { 
- case 1: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] * in_2->data.i + in_3->data.i; - out->data.c[1] = in_1->data.c[1] * in_2->data.i + in_3->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] * in_2->data.i + in_3->data.d; - out->data.c[1] = in_1->data.c[1] * in_2->data.i + in_3->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_3->type > 100) { - } - else { - switch (in_3->type % 10) { - case 1: - out->data.c[0] = in_1->data.c[0] * in_2->data.d + in_3->data.i; - out->data.c[1] = in_1->data.c[1] * in_2->data.d + in_3->data.i; - return; - case 2: - out->data.c[0] = in_1->data.c[0] * in_2->data.d + in_3->data.d; - out->data.c[1] = in_1->data.c[1] * in_2->data.d + in_3->data.d; - return; - case 3: - break; - } - } - break; - case 3: - break; - } - } - break; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - -static inline void PfMul(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { -#if(VKFFT_BACKEND == 2) - if ((in_1->type > 100) && (in_2->type > 100)) { - //packed instructions workaround if all values are in registers - if (((in_1->type % 10) != 3) || ((in_2->type % 10) != 3)) { - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; - } - else { - if ((((out->type % 100) / 10) < 2) && (out->type == in_1->type) && (out->type == in_2->type)) { - if ((strcmp(out->data.s, in_1->data.s)) && (strcmp(out->data.s, in_2->data.s))) { - PfMov_x_Neg_y(sc, out, in_1); - PfMov_y_x(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfMov_x_Neg_y(sc, temp, in_1); - PfMov_y_x(sc, temp, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s", temp->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", temp->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " 
* "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s", temp->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - return; - } - } - } -#endif - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - if ((in_1->type < 100) || (in_2->type < 100) || ((in_1->type % 10) != 3) || ((in_2->type % 10) != 3) || ((strcmp(out->data.s, in_1->data.s)) && (strcmp(out->data.s, in_2->data.s)))) { - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.x", temp->data.s); - } - PfAppendLine(sc); - break; - } - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - 
PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - } - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - /*if ((in_1->type < 100) || (in_2->type < 100) || ((in_1->type % 10) != 3) || ((in_2->type % 10) != 3) || ((strcmp(out->data.s, in_1->data.s)) && (strcmp(out->data.s, in_2->data.s)))) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.y", temp->data.s); - }*/ - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", 
in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - else - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - else - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - } - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - if ((in_1->type < 100) || (in_2->type < 100) || ((in_1->type % 10) != 3) || ((in_2->type % 10) != 3) || ((strcmp(out->data.s, in_1->data.s)) && (strcmp(out->data.s, in_2->data.s)))) { - } - else { - PfMov_x(sc, out, temp); - } - break; - } - - return; - } - else { - switch (out->type % 10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { + switch (in_1->type % 10) { case 1: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i * in_2->data.i; + out->data.i = in_1->data.i + in_2->data.i; return; case 2: - out->data.i = (int64_t)(in_1->data.i * in_2->data.d); + out->data.i = 
in_1->data.i + (pfINT)in_2->data.d; return; - case 3: - break; } } break; @@ -5360,21 +1170,17 @@ static inline void PfMul(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.i = (int64_t)(in_1->data.d * in_2->data.i); + out->data.i = (pfINT)in_1->data.d + in_2->data.i; return; case 2: - out->data.i = (int64_t)(in_1->data.d * in_2->data.d); + out->data.i = (pfINT)(in_1->data.d + in_2->data.d); return; - case 3: - break; } } break; - case 3: - break; } } - break; + break; case 2: if (in_1->type > 100) { } @@ -5386,87 +1192,156 @@ static inline void PfMul(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.d = (long double)(in_1->data.i * in_2->data.i); - return; - case 2: - out->data.d = (long double)in_1->data.i * in_2->data.d; - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d * (long double)in_2->data.i; + out->data.d = (pfLD)(in_1->data.i + in_2->data.i); return; case 2: - out->data.d = in_1->data.d * in_2->data.d; + out->data.d = (pfLD)in_1->data.i + in_2->data.d; return; - case 3: - break; } } break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: case 2: - break; - case 3: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.c[0] = in_1->data.c[0] * (long double)in_2->data.i; - out->data.c[1] = in_1->data.c[1] * (long double)in_2->data.i; + out->data.d = in_1->data.d + (pfLD)in_2->data.i; return; case 2: - out->data.c[0] = in_1->data.c[0] * in_2->data.d; - out->data.c[1] = in_1->data.c[1] * in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1]; - out->data.c[1] = in_1->data.c[1] * in_2->data.c[0] + in_1->data.c[0] * in_2->data.c[1]; + out->data.d = in_1->data.d + in_2->data.d; return; } } break; } } - break; + break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { +static inline void PfInc(VkFFTSpecializationConstantsLayout* sc, PfContainer* out) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { + //in_1 has to be same type as out switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); + case 1: + case 2: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %s + 1;\n", out->name, out->name); PfAppendLine(sc); + return; + } + } + else { + switch (out->type % 10) { + case 1: + out->data.i = out->data.i + 1; + return; + case 2: + out->data.d = out->data.d + 1; + return; break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); + } + } + sc->res = VKFFT_ERROR_MATH_FAILED; + return; +} + +static inline void PfQuadQuickDiff(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) {// double-double, double, double + if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfSub(sc, &out->data.dd[0], in_1, in_2); + PfSub(sc, &out->data.dd[1], in_1, &out->data.dd[0]); + PfSub(sc, &out->data.dd[1], &out->data.dd[1], in_2); + } + return; +} + +static inline void PfQuadDiff(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) {// double-double, double, double + if 
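
The PfQuadQuickDiff and PfQuadDiff helpers introduced above generate the standard error-free subtraction transforms of double-double arithmetic. A minimal plain-C sketch of the arithmetic the emitted code performs (the helper names here are illustrative, not part of VkFFT):

/* quick_two_diff: exact when |a| >= |b|; on return s + e == a - b */
static inline void quick_two_diff(double a, double b, double* s, double* e) {
    *s = a - b;
    *e = (a - *s) - b;              /* rounding error of the subtraction */
}

/* two_diff: no ordering assumption (TwoSum applied to a and -b) */
static inline void two_diff(double a, double b, double* s, double* e) {
    *s = a - b;
    double v = *s - a;              /* the "virtual" value of b actually subtracted */
    *e = (a - (*s - v)) - (b + v);  /* recovered rounding error */
}

PfQuadQuickDiff appears to correspond to the first form (it relies on the operands already being ordered by magnitude), and the scalar branch of PfQuadDiff to the second.
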
((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + if ((in_2->type % 10) == 3){ + PfQuadDiff(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0], temp); + PfQuadDiff(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1], temp); + }else{ + PfQuadDiff(sc, &out->data.c[0], &in_1->data.c[0], in_2, temp); + PfQuadDiff(sc, &out->data.c[1], &in_1->data.c[1], in_2, temp); + } + }else{ + if ((in_2->type % 10) == 3){ + PfQuadDiff(sc, &out->data.c[0], in_1, &in_2->data.c[0], temp); + PfQuadDiff(sc, &out->data.c[1], in_1, &in_2->data.c[1], temp); + }else{ + PfQuadDiff(sc, &out->data.c[0], in_1, in_2, temp); + PfMov(sc, &out->data.c[1], &out->data.c[0]); + } + } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfSub(sc, &out->data.dd[0], in_1, in_2); + PfSub(sc, &out->data.dd[1], &out->data.dd[0], in_1); + PfSub(sc, temp, &out->data.dd[0], &out->data.dd[1]); + PfSub(sc, temp, in_1, temp); + PfAdd(sc, &out->data.dd[1], in_2, &out->data.dd[1]); + PfSub(sc, &out->data.dd[1], temp, &out->data.dd[1]); + } + return; +} + +static inline void PfSub(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { + if (sc->res != VKFFT_SUCCESS) return; + if ((out->type % 10) == 3){ +#if(VKFFT_BACKEND == 2) + if ((in_1->type > 100) && (in_2->type > 100) && (((out->type % 100) / 10) != 3)) { + //packed instructions workaround if all values are in registers + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); - break; + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " - "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + return; } +#endif + if ((in_2->type % 10) == 3){ + PfSub(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0]); + PfSub(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1]); + }else{ + PfSub(sc, &out->data.c[0], &in_1->data.c[0], in_2); + PfSub(sc, &out->data.c[1], &in_1->data.c[1], in_2); + } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfContainer temp1 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp1, in_1); + PfContainer temp2 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp2, in_2); + PfQuadDiff(sc, &sc->tempQuad.data.c[0], &temp1.data.dd[0], &temp2.data.dd[0], &sc->tempQuad3.data.c[0].data.dd[0]); + PfSub(sc, &out->data.dd[0], &temp1.data.dd[1], &temp2.data.dd[1]); + PfAdd(sc, &sc->tempQuad.data.c[0].data.dd[1], &sc->tempQuad.data.c[0].data.dd[1], &out->data.dd[0]); + PfQuadQuickSum(sc, out, &sc->tempQuad.data.c[0].data.dd[0], &sc->tempQuad.data.c[0].data.dd[1]); + PfDeallocateContainer(sc, &temp1); + PfDeallocateContainer(sc, &temp2); + return; + } + if (out->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); if ((in_1->type < 100) && (in_2->type < 100)) { @@ -5474,15 +1349,11 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* case 1: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", 
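
The quad (double-double) branch of the new PfSub above composes these transforms into a full double-double difference: an exact two_diff of the high words, the low words folded into the error term, then a renormalizing quick sum. A hedged sketch in plain C, reusing two_diff from the earlier sketch (dd_t and the helper names are illustrative, not VkFFT types):

/* quick_two_sum: exact when |a| >= |b|; on return s + e == a + b */
static inline void quick_two_sum(double a, double b, double* s, double* e) {
    *s = a + b;
    *e = b - (*s - a);
}

typedef struct { double hi, lo; } dd_t;   /* illustrative double-double value */

static inline dd_t dd_sub(dd_t a, dd_t b) {
    double s, e;
    two_diff(a.hi, b.hi, &s, &e);      /* exact difference of the high words */
    e += a.lo - b.lo;                  /* fold in the low-order words */
    dd_t r;
    quick_two_sum(s, e, &r.hi, &r.lo); /* renormalize so |lo| stays below ulp(hi) */
    return r;
}

This is the "sloppy" double-double subtraction; the generated PfQuadDiff/PfQuadQuickSum sequence follows the same structure.
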
in_1->data.i - in_2->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i - in_2->data.d)); PfAppendLine(sc); break; } @@ -5490,31 +1361,11 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* case 2: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * (long double)in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d - (pfLD)in_2->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d - in_2->data.d)); PfAppendLine(sc); break; } @@ -5527,16 +1378,8 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { PfAppendConversionStart(sc, out, in_1); if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); } else { switch (in_1->type % 10) { @@ -5545,31 +1388,19 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); PfAppendLine(sc); break; } PfAppendNumberLiteral(sc, out); } PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); + sc->tempLen = sprintf(sc->tempStr, " - "); PfAppendLine(sc); PfAppendConversionStart(sc, out, in_2); if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); } else { switch (in_2->type % 10) { @@ -5578,49 +1409,16 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); PfAppendLine(sc); break; } PfAppendNumberLiteral(sc, out); } PfAppendConversionEnd(sc, out, 
in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - } sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); } - return; } else { @@ -5636,13 +1434,11 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i * in_2->data.i; + out->data.i = in_1->data.i - in_2->data.i; return; case 2: - out->data.i = (int64_t)(in_1->data.i * in_2->data.d); + out->data.i = in_1->data.i - (pfINT)in_2->data.d; return; - case 3: - break; } } break; @@ -5652,18 +1448,15 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.i = (int64_t)(in_1->data.d * in_2->data.i); + out->data.i = (pfINT)in_1->data.d - in_2->data.i; return; case 2: - out->data.i = (int64_t)(in_1->data.d * in_2->data.d); + out->data.i = (pfINT)(in_1->data.d - in_2->data.d); return; - case 3: - break; + } } break; - case 3: - break; } } break; @@ -5678,58 +1471,24 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.d = (long double)(in_1->data.i * in_2->data.i); + out->data.d = (pfLD)(in_1->data.i - in_2->data.i); return; case 2: - out->data.d = (long double)in_1->data.i * in_2->data.d; + out->data.d = (pfLD)in_1->data.i - in_2->data.d; return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d * (long double)in_2->data.i; - return; - case 2: - out->data.d = in_1->data.d * in_2->data.d; - return; - case 3: - break; } } break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: case 2: - break; - case 3: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.c[0] = in_1->data.c[0] * (long double)in_2->data.i; + out->data.d = in_1->data.d - (pfLD)in_2->data.i; return; case 2: - out->data.c[0] = in_1->data.c[0] * in_2->data.d; - return; - case 3: - out->data.c[0] = in_1->data.c[0] * in_2->data.c[0] - in_1->data.c[1] * in_2->data.c[1]; + out->data.d = in_1->data.d - in_2->data.d; return; } } @@ -5743,181 +1502,458 @@ static inline void PfMul_x(VkFFTSpecializationConstantsLayout* sc, PfContainer* return; } -static inline void PfMul_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { +static inline void PfIf_eq_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right); +static inline void PfIf_gt_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right); +static inline void 
PfIf_lt_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right); +static inline void PfIf_else(VkFFTSpecializationConstantsLayout* sc); +static inline void PfIf_end(VkFFTSpecializationConstantsLayout* sc); + +static inline void PfMul(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp); +static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2); +static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2); + +static inline void PfQuadSplit(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* temp) {// double-double, double, double + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + PfQuadSplit(sc, &out->data.c[0], &in_1->data.c[0], temp); + PfQuadSplit(sc, &out->data.c[1], &in_1->data.c[1], temp); + }else{ + PfQuadSplit(sc, &out->data.c[0], in_1, temp); + PfQuadSplit(sc, &out->data.c[1], in_1, temp); + } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + if (in_1->type > 100){ + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + temp_double.data.d = pfFPinit("3.7252902984619140625e-09"); // 2^-28 + PfContainer temp_int = VKFFT_ZERO_INIT; + temp_int.type = 31; + /*PfSetToZero(sc, &sc->tempIntQuad); + temp_int.data.i = 134217729; // 2^27+1 + PfIf_gt_start(sc, in_1, &temp_int); + temp_int.data.i = 1; + PfMov(sc, &sc->tempIntQuad, &temp_int); + PfIf_else(sc); + temp_int.data.i = -134217729; // 2^27+1 + PfIf_lt_start(sc, in_1, &temp_int); + temp_int.data.i = 1; + PfMov(sc, &sc->tempIntQuad, &temp_int); + PfIf_end(sc); + PfIf_end(sc); + PfIf_eq_start(sc, &sc->tempIntQuad, &temp_int); + + temp_double.data.d = pfFPinit("3.7252902984619140625e-09"); // 2^-28 + PfMul(sc, &out->data.dd[1], in_1, &temp_double, 0); + temp_double.data.d = pfFPinit("134217729.0"); // 2^27+1 + PfMul(sc, temp, &out->data.dd[1], &temp_double, 0); + PfSub(sc, &out->data.dd[0], temp, &out->data.dd[1]); + PfSub(sc, &out->data.dd[0], temp, &out->data.dd[0]); + PfSub(sc, &out->data.dd[1], &out->data.dd[1], &out->data.dd[0]); + temp_int.data.i = 268435456; // 2^27+1 + PfMul(sc, &out->data.dd[0], &out->data.dd[0], &temp_int, 0); + PfMul(sc, &out->data.dd[1], &out->data.dd[1], &temp_int, 0); + + PfIf_else(sc);*/ + + temp_double.data.d = pfFPinit("134217729.0"); // 2^27+1 + PfMul(sc, temp, in_1, &temp_double, 0); + PfSub(sc, &out->data.dd[0], temp, in_1); + PfSub(sc, &out->data.dd[0], temp, &out->data.dd[0]); + PfSub(sc, &out->data.dd[1], in_1, &out->data.dd[0]); + + //PfIf_end(sc); + }else{ + PfContainer temp_double = VKFFT_ZERO_INIT; + temp_double.type = 22; + temp_double.data.d = pfFPinit("3.7252902984619140625e-09"); // 2^-28 + PfContainer temp_int = VKFFT_ZERO_INIT; + temp_int.type = 31; + double temp_double2; + double temp_double3; + double temp_double4; + + if ((in_1->data.d > 134217729) || (in_1->data.d < -134217729)){ + + temp_double.data.d = pfFPinit("3.7252902984619140625e-09"); // 2^-28 + temp_double2 = ((double)in_1->data.d) * (double)temp_double.data.d; + temp_double3 = temp_double2 * 134217729; + temp_double4 = temp_double3 - temp_double2; + temp_double4 = temp_double3 - temp_double4; + temp_double2 = temp_double2 - temp_double4; + temp_double.data.d = temp_double4 * 268435456; + PfMov(sc, &out->data.dd[0], &temp_double); + temp_double.data.d = temp_double3 * 268435456; + PfMov(sc, &out->data.dd[1], 
&temp_double); + + }else{ + + temp_double3 = ((double)in_1->data.d) * 134217729; + temp_double4 = temp_double3 - ((double)in_1->data.d); + temp_double3 = temp_double3 - temp_double4; + temp_double.data.d = temp_double3; + PfMov(sc, &out->data.dd[0], &temp_double); + temp_double3 = ((double)in_1->data.d) - temp_double3; + temp_double.data.d = temp_double3; + PfMov(sc, &out->data.dd[1], &temp_double); + + } + } + } + return; +} + +static inline void PfFMA(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* in_3); + +static inline void PfQuadProd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) {// double-double, double, double + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + if ((in_2->type % 10) == 3){ + PfQuadProd(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0], &temp->data.c[0]); + PfQuadProd(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[1], &temp->data.c[1]); + }else{ + PfQuadProd(sc, &out->data.c[0], &in_1->data.c[0], in_2, &temp->data.c[0]); + PfQuadProd(sc, &out->data.c[1], &in_1->data.c[1], in_2, &temp->data.c[1]); + } + }else{ + if ((in_2->type % 10) == 3){ + PfQuadProd(sc, &out->data.c[0], in_1, &in_2->data.c[0], &temp->data.c[0]); + PfQuadProd(sc, &out->data.c[1], in_1, &in_2->data.c[1], &temp->data.c[1]); + }else{ + PfQuadProd(sc, &out->data.c[0], in_1, in_2, &temp->data.c[0]); + PfMov(sc, &out->data.c[1], &out->data.c[0]); + } + } + return; + } + if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + //v1 + /*PfMul(sc, &out->data.dd[0], in_1, in_2, 0); + PfQuadSplit(sc, &temp->data.c[0], in_1, &out->data.dd[1]); + PfQuadSplit(sc, &sc->tempQuad2.data.c[0], in_2, &out->data.dd[1]); + + //PfMovNeg(sc, &sc->tempQuad2.data.c[1].data.dd[1], &out->data.dd[0]); + //PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[0], &sc->tempQuad2.data.c[0].data.dd[0], &sc->tempQuad2.data.c[1].data.dd[1]); + PfMul(sc, &out->data.dd[1], &temp->data.c[0].data.dd[0], &sc->tempQuad2.data.c[0].data.dd[0], 0); + PfSub(sc, &out->data.dd[1], &out->data.dd[1], &out->data.dd[0]); + + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[0], &sc->tempQuad2.data.c[0].data.dd[1], &out->data.dd[1]); + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[1], &sc->tempQuad2.data.c[0].data.dd[0], &out->data.dd[1]); + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[1], &sc->tempQuad2.data.c[0].data.dd[1], &out->data.dd[1]);*/ + + //v2 + /*PfMul(sc, &out->data.dd[0], in_1, in_2, 0); + PfQuadSplit(sc, &temp->data.c[0], in_1, &out->data.dd[1]); + PfQuadSplit(sc, &sc->tempQuad2.data.c[0], in_2, &out->data.dd[1]); + //important + PfMovNeg(sc, &sc->tempQuad2.data.c[1].data.dd[1], in_2); + PfFMA(sc, &sc->tempQuad2.data.c[1].data.dd[0], &sc->tempQuad2.data.c[1].data.dd[1], in_1, &out->data.dd[0]); + + PfMovNeg(sc, &sc->tempQuad2.data.c[1].data.dd[1], &out->data.dd[0]); + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[0], &sc->tempQuad2.data.c[0].data.dd[0], &sc->tempQuad2.data.c[1].data.dd[1]); + //PfPrintReg(sc, &sc->inoutID, &sc->tempQuad2.data.c[1].data.dd[0]); + PfAdd(sc, &out->data.dd[1], &out->data.dd[1], &sc->tempQuad2.data.c[1].data.dd[0]); + //PfSub(sc, &out->data.dd[1], &out->data.dd[1], &sc->tempQuad2.data.c[1].data.dd[0]); + //PfPrintReg(sc, &sc->inoutID, &out->data.dd[1]); + + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[0], &sc->tempQuad2.data.c[0].data.dd[1], &out->data.dd[1]); + PfFMA(sc, &out->data.dd[1], 
&temp->data.c[0].data.dd[1], &sc->tempQuad2.data.c[0].data.dd[0], &out->data.dd[1]); + PfFMA(sc, &out->data.dd[1], &temp->data.c[0].data.dd[1], &sc->tempQuad2.data.c[0].data.dd[1], &out->data.dd[1]);*/ + + //v3 + PfMul(sc, &out->data.dd[0], in_1, in_2, 0); + PfMovNeg(sc, &out->data.dd[1], &out->data.dd[0]); + PfFMA(sc, &out->data.dd[1], in_1, in_2, &out->data.dd[1]); + } + return; +} + + +static inline void PfFMA(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* in_3) { + //fma inlining is not correct if all three numbers are complex for now if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - /*if ((in_1->type < 100) || (in_2->type < 100) || ((in_1->type % 10) != 3) || ((in_2->type % 10) != 3) || ((strcmp(out->data.s, in_1->data.s)) && (strcmp(out->data.s, in_2->data.s)))) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); + if ((out->type % 10) == 3){ +#if(VKFFT_BACKEND == 2) + if ((in_1->type > 100) && (in_2->type > 100) && (in_3->type > 100) && (((out->type % 100) / 10) != 3)) { + + //packed instructions workaround if all values are in registers + if (((in_1->type % 10) != 3) || ((in_2->type % 10) != 3)) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " * "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, "%s", in_3->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + return; } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.y", temp->data.s); - }*/ - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { + } +#endif + if ((in_3->type % 10) == 3){ + if ((in_2->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + sc->res = VKFFT_ERROR_MATH_FAILED; + }else{ + PfFMA(sc, &out->data.c[0], in_1, &in_2->data.c[0], &in_3->data.c[0]); + PfFMA(sc, &out->data.c[1], in_1, &in_2->data.c[1], &in_3->data.c[1]); + } + }else{ + if ((in_1->type % 10) == 3){ + PfFMA(sc, &out->data.c[0], &in_1->data.c[0], in_2, &in_3->data.c[0]); + PfFMA(sc, &out->data.c[1], &in_1->data.c[1], in_2, &in_3->data.c[1]); + }else{ + if ((((out->type % 100) / 10) == 3)){ + PfMul(sc, &out->data.c[0], in_1, in_2, &out->data.c[1]); + PfAdd(sc, &out->data.c[1], &out->data.c[0], &in_3->data.c[1]); + PfAdd(sc, &out->data.c[0], &out->data.c[0], &in_3->data.c[0]); + }else { + PfFMA(sc, &out->data.c[0], in_1, in_2, &in_3->data.c[0]); + PfFMA(sc, &out->data.c[1], in_1, in_2, &in_3->data.c[1]); + } + } + } + }else{ + if ((in_2->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + sc->res = VKFFT_ERROR_MATH_FAILED; + }else{ + PfFMA(sc, &out->data.c[0], in_1, &in_2->data.c[0], in_3); + PfFMA(sc, &out->data.c[1], in_1, &in_2->data.c[1], in_3); + } + }else{ + if ((in_1->type % 10) == 3){ + PfFMA(sc, &out->data.c[0], 
&in_1->data.c[0], in_2, in_3); + PfFMA(sc, &out->data.c[1], &in_1->data.c[1], in_2, in_3); + }else{ + PfFMA(sc, &out->data.c[0], in_1, in_2, in_3); + PfMov(sc, &out->data.c[1], &out->data.c[0]); + } + } + } + return; + } + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfMul(sc, &sc->tempQuad.data.c[1], in_1, in_2, 0); + PfMov(sc, &sc->tempQuad2.data.c[1], &sc->tempQuad.data.c[1]); + PfAdd(sc, out, &sc->tempQuad2.data.c[1], in_3); + return; + } + if (out->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type < 100)) { + switch (in_1->type % 10) { + case 1: + switch (in_2->type % 10) { case 1: - switch (in_2->type % 10) { + switch (in_3->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i + in_3->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i * in_2->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)(in_1->data.i * in_2->data.i) + in_3->data.d)); PfAppendLine(sc); break; } break; case 2: - switch (in_2->type % 10) { + switch (in_3->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * (long double)in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i * in_2->data.d + (pfLD)in_3->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d * in_2->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i * in_2->data.d + in_3->data.d)); PfAppendLine(sc); break; } break; - case 3: - switch (in_2->type % 10) { + } + break; + case 2: + switch (in_2->type % 10) { + case 1: + switch (in_3->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * (long double)in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * (pfLD)in_2->data.i + (pfLD)in_3->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] * in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] * in_2->data.c[1] + in_1->data.c[1] * in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * in_2->data.i + in_3->data.d)); PfAppendLine(sc); break; } break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - else - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { + case 2: + switch (in_3->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); + sc->tempLen = 
sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * in_2->data.d + (pfLD)in_3->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - else - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * in_2->data.d + in_3->data.d)); PfAppendLine(sc); break; } - PfAppendNumberLiteral(sc, out); + break; } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); + break; + case 3: + switch (in_2->type % 10) { + case 1: + switch (in_3->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.c[0].data.d * (pfLD)in_2->data.i + (pfLD)in_3->data.i)); PfAppendLine(sc); break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.c[0].data.d * in_2->data.i + in_3->data.d)); PfAppendLine(sc); break; } - } - else { - switch (in_2->type % 10) { + break; + case 2: + switch (in_3->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.c[0].data.d * in_2->data.d + (pfLD)in_3->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.c[0].data.d * in_2->data.d + in_3->data.d)); PfAppendLine(sc); break; } - PfAppendNumberLiteral(sc, out); + break; } - PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->tempLen = sprintf(sc->tempStr, " + "); + break; + } + PfAppendNumberLiteral(sc, out); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + } + else if ((in_1->type < 100) && (in_2->type < 100) && (in_3->type > 100)) { + switch (in_1->type % 10) { + case 1: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " * "); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i * in_2->data.d)); PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); + break; } - sc->tempLen = sprintf(sc->tempStr, ";\n"); + break; + case 2: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * 
(pfLD)in_2->data.i)); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * in_2->data.d)); + PfAppendLine(sc); + break; + } + break; + } + PfAppendNumberLiteral(sc, out); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_3); + sc->tempLen = sprintf(sc->tempStr, "%s", in_3->name); + PfAppendLine(sc); + PfAppendConversionEnd(sc, out, in_3); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + } + else { + sc->tempLen = sprintf(sc->tempStr, "fma("); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); + if (in_1->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); PfAppendLine(sc); } - break; + else { + switch (in_1->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); + PfAppendLine(sc); + break; + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, ", "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + if (in_2->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); + } + else { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); + PfAppendLine(sc); + break; + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, ", "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_3); + if (in_3->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_3->name); + PfAppendLine(sc); + } + else { + switch (in_3->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_3->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_3->data.d); + PfAppendLine(sc); + break; + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_3); + sc->tempLen = sprintf(sc->tempStr, ");\n"); + PfAppendLine(sc); } - return; } else { @@ -5933,12 +1969,32 @@ static inline void PfMul_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i * in_2->data.i; - return; + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.i = in_1->data.i * in_2->data.i + in_3->data.i; + return; + case 2: + out->data.i = in_1->data.i * in_2->data.i + (pfINT)in_3->data.d; + return; + } + } + break; case 2: - out->data.i = (int64_t)(in_1->data.i * in_2->data.d); - return; - case 3: + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.i = (pfINT)(in_1->data.i * in_2->data.d + in_3->data.i); + return; + case 2: + out->data.i = (pfINT)(in_1->data.i * in_2->data.d + in_3->data.d); + return; + } + } break; } } @@ -5949,12 +2005,32 @@ static inline void PfMul_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.i = (int64_t)(in_1->data.d * in_2->data.i); - return; + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.i = (pfINT)(in_1->data.d * in_2->data.i + in_3->data.i); + return; + case 2: + out->data.i = (pfINT)(in_1->data.d * in_2->data.i + in_3->data.d); + return; + } + 
} + break; case 2: - out->data.i = (int64_t)(in_1->data.d * in_2->data.d); - return; - case 3: + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.i = (pfINT)(in_1->data.d * in_2->data.d + in_3->data.i); + return; + case 2: + out->data.i = (pfINT)(in_1->data.d * in_2->data.d + in_3->data.d); + return; + } + } break; } } @@ -5975,59 +2051,69 @@ static inline void PfMul_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* else { switch (in_2->type % 10) { case 1: - out->data.d = (long double)(in_1->data.i * in_2->data.i); - return; - case 2: - out->data.d = (long double)in_1->data.i * in_2->data.d; - return; - case 3: + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.d = (pfLD)(in_1->data.i * in_2->data.i + in_3->data.i); + return; + case 2: + out->data.d = (pfLD)(in_1->data.i * in_2->data.i + in_3->data.d); + return; + } + } break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d * (long double)in_2->data.i; - return; case 2: - out->data.d = in_1->data.d * in_2->data.d; - return; - case 3: + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.d = in_1->data.i * in_2->data.d + in_3->data.i; + return; + case 2: + out->data.d = in_1->data.i * in_2->data.d + in_3->data.d; + return; + } + } break; } } break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: case 2: - break; - case 3: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.c[1] = in_1->data.c[1] * (long double)in_2->data.i; - return; + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.d = in_1->data.d * in_2->data.i + in_3->data.i; + return; + case 2: + out->data.d = in_1->data.d * in_2->data.i + in_3->data.d; + return; + } + } + break; case 2: - out->data.c[1] = in_1->data.c[1] * in_2->data.d; - return; - case 3: - out->data.c[1] = in_1->data.c[1] * in_2->data.c[0] + in_1->data.c[0] * in_2->data.c[1]; - return; + if (in_3->type > 100) { + } + else { + switch (in_3->type % 10) { + case 1: + out->data.d = in_1->data.d * in_2->data.d + in_3->data.i; + return; + case 2: + out->data.d = in_1->data.d * in_2->data.d + in_3->data.d; + return; + } + } + break; } } break; @@ -6040,150 +2126,155 @@ static inline void PfMul_y(VkFFTSpecializationConstantsLayout* sc, PfContainer* return; } -static inline void PfFMA3(VkFFTSpecializationConstantsLayout* sc, PfContainer* out_1, PfContainer* out_2, PfContainer* in_1, PfContainer* in_num, PfContainer* in_conj) { - if (sc->res != VKFFT_SUCCESS) return; - if (out_1->type > 100) { - //in_1 has to be same type as out - switch (out_1->type % 10) { - case 1: - case 2: - break; - case 3: - switch (in_num->type % 10) { - case 1: - break; - case 2: - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = fma(%s.x, %s.x, %s.x);\n\ -%s.y = fma(%s.y, %s.x, %s.y);\n", out_1->data.s, in_1->data.s, in_num->data.s, out_1->data.s, out_1->data.s, in_conj->data.s, in_num->data.s, out_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = fma(%s.y, %s.y, %s.x);\n\ -%s.y = fma(%s.x, %s.y, %s.y);\n", out_2->data.s, in_1->data.s, in_num->data.s, out_2->data.s, out_2->data.s, in_conj->data.s, in_num->data.s, out_2->data.s); - PfAppendLine(sc); - return; - } - break; - } - } - else { - switch (out_1->type % 10) { - case 1: - break; - case 2: - break; - case 
3: - switch (in_num->type % 10) { - case 1: - break; - case 2: - break; - case 3: - out_1->data.c[0] = in_1->data.c[0] * in_num->data.c[0] + out_1->data.c[0]; - out_1->data.c[1] = in_conj->data.c[1] * in_num->data.c[0] + out_1->data.c[1]; - out_2->data.c[0] = in_1->data.c[1] * in_num->data.c[1] + out_2->data.c[0]; - out_2->data.c[1] = in_conj->data.c[0] * in_num->data.c[1] + out_2->data.c[1]; - return; - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfFMA3_const_w(VkFFTSpecializationConstantsLayout* sc, PfContainer* out_1, PfContainer* out_2, PfContainer* in_1, PfContainer* in_num_x, PfContainer* in_num_y, PfContainer* in_conj, PfContainer* temp) { +static inline void PfMul(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { if (sc->res != VKFFT_SUCCESS) return; - if (out_1->type > 100) { -#if(VKFFT_BACKEND==2) - if (((out_1->type%100)/10) < 2) { - PfMov_x(sc, temp, in_1); - PfMov_y(sc, temp, in_conj); - PfFMA(sc, out_1, temp, in_num_x, out_1); - - PfMov_x_y(sc, temp, in_1); - PfMov_y_x(sc, temp, in_conj); - PfFMA(sc, out_2, temp, in_num_y, out_2); - return; - } -#endif - //in_1 has to be same type as out - switch (out_1->type % 10) { - case 1: - case 2: - break; - case 3: - switch (in_num_x->type % 10) { - case 1: - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%s.x = fma(%s.x, %.17Le", out_1->data.s, in_1->data.s, in_num_x->data.d); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out_1); - sc->tempLen = sprintf(sc->tempStr, ", %s.x);\n", out_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.y = fma(%s.y, %.17Le", out_1->data.s, in_conj->data.s, in_num_x->data.d); + if ((out->type % 10) == 3){ +#if(VKFFT_BACKEND == 2) + if ((in_1->type > 100) && (in_2->type > 100) && (((out->type % 100) / 10) != 3)) { + //packed instructions workaround if all values are in registers + if (((in_1->type % 10) != 3) || ((in_2->type % 10) != 3)) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); PfAppendLine(sc); - PfAppendNumberLiteral(sc, out_1); - sc->tempLen = sprintf(sc->tempStr, ", %s.y);\n", out_1->data.s); + sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.x = fma(%s.y, %.17Le", out_2->data.s, in_1->data.s, in_num_y->data.d); + PfAppendConversionStart(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); PfAppendLine(sc); - PfAppendNumberLiteral(sc, out_1); - sc->tempLen = sprintf(sc->tempStr, ", %s.x);\n", out_2->data.s); + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " * "); PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.y = fma(%s.x, %.17Le", out_2->data.s, in_conj->data.s, in_num_y->data.d); + PfAppendConversionStart(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); PfAppendLine(sc); - PfAppendNumberLiteral(sc, out_1); - sc->tempLen = sprintf(sc->tempStr, ", %s.y);\n", out_2->data.s); + PfAppendConversionEnd(sc, out, in_2); + sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); return; - case 3: - break; } - break; - } - } - else { - switch (out_1->type % 10) { - case 1: - break; - case 2: - break; - case 3: - switch (in_num_x->type % 10) { - case 1: - break; - case 2: - out_1->data.c[0] = in_1->data.c[0] * in_num_x->data.d + out_1->data.c[0]; - out_1->data.c[1] = in_conj->data.c[1] * in_num_x->data.d + out_1->data.c[1]; - out_2->data.c[0] = in_1->data.c[1] * in_num_y->data.d + out_2->data.c[0]; - out_2->data.c[1] 
= in_conj->data.c[0] * in_num_y->data.d + out_2->data.c[1]; - return; - case 3: - break; + else { + if ((((out->type % 100) / 10) < 2) && (out->type == in_1->type) && (out->type == in_2->type)) { + if ((strcmp(out->name, in_1->name)) && (strcmp(out->name, in_2->name))) { + PfMovNeg(sc, &out->data.c[0], &in_1->data.c[1]); + PfMov(sc, &out->data.c[1], &in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " * "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.c[1].name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " * "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.c[0].name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + } + else { + PfMovNeg(sc, &temp->data.c[0], &in_1->data.c[1]); + PfMov(sc, &temp->data.c[1], &in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%s", temp->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", temp->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " * "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.c[1].name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " * "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.c[0].name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " + "); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "%s", temp->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + } + return; + } + } + } +#endif + if ((in_2->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + if ((in_1->type < 100) || (in_2->type < 100) || ((strcmp(out->name, in_1->name)) && (strcmp(out->name, in_2->name)))) { + PfMul(sc, &out->data.c[0], &in_1->data.c[1], &in_2->data.c[1], 0); + PfMovNeg(sc, &out->data.c[0], &out->data.c[0]); + PfFMA(sc, &out->data.c[0], &in_1->data.c[0], &in_2->data.c[0], &out->data.c[0]); + + PfMul(sc, &out->data.c[1], &in_1->data.c[1], &in_2->data.c[0], 0); + PfFMA(sc, &out->data.c[1], &in_1->data.c[0], &in_2->data.c[1], &out->data.c[1]); + }else{ + PfMul(sc, &temp->data.c[0], &in_1->data.c[1], &in_2->data.c[1], 0); + PfMovNeg(sc, &temp->data.c[0], &temp->data.c[0]); + PfFMA(sc, &temp->data.c[0], &in_1->data.c[0], &in_2->data.c[0], &temp->data.c[0]); + + PfMul(sc, &temp->data.c[1], &in_1->data.c[1], &in_2->data.c[0], 0); + PfFMA(sc, &out->data.c[1], &in_1->data.c[0], &in_2->data.c[1], &temp->data.c[1]); + PfMov(sc, &out->data.c[0], &temp->data.c[0]); + } + }else{ + PfMul(sc, &out->data.c[0], in_1, 
&in_2->data.c[0], 0); + PfMul(sc, &out->data.c[1], in_1, &in_2->data.c[1], 0); + } + }else{ + if ((in_1->type % 10) == 3){ + PfMul(sc, &out->data.c[0], &in_1->data.c[0], in_2, 0); + PfMul(sc, &out->data.c[1], &in_1->data.c[1], in_2, 0); + }else{ + PfMul(sc, &out->data.c[0], in_1, in_2, 0); + PfMov(sc, &out->data.c[1], &out->data.c[0]); } - break; } + return; } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} + else if ((((out->type % 100) / 10) == 3) && ((out->type % 10) == 2)) { + PfContainer temp1 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp1, in_1); + PfContainer temp2 = VKFFT_ZERO_INIT; + PfConvToDoubleDouble(sc, &temp2, in_2); + + PfQuadProd(sc, &sc->tempQuad.data.c[0], &temp1.data.dd[0], &temp2.data.dd[0], &sc->tempQuad3); + PfFMA(sc, &sc->tempQuad.data.c[0].data.dd[1], &temp1.data.dd[0], &temp2.data.dd[1], &sc->tempQuad.data.c[0].data.dd[1]); + PfFMA(sc, &sc->tempQuad.data.c[0].data.dd[1], &temp1.data.dd[1], &temp2.data.dd[0], &sc->tempQuad.data.c[0].data.dd[1]); + PfQuadQuickSum(sc, out, &sc->tempQuad.data.c[0].data.dd[0], &sc->tempQuad.data.c[0].data.dd[1]); -static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { - if (sc->res != VKFFT_SUCCESS) return; + PfDeallocateContainer(sc, &temp1); + PfDeallocateContainer(sc, &temp2); + return; + } if (out->type > 100) { - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); if ((in_1->type < 100) && (in_2->type < 100)) { @@ -6191,15 +2282,11 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou case 1: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i / in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i * in_2->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i / in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i / in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i * in_2->data.d)); PfAppendLine(sc); break; } @@ -6207,32 +2294,13 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou case 2: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] / (long double)in_2->data.i); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * (pfLD)in_2->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] / in_2->data.d); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d * in_2->data.d)); PfAppendLine(sc); break; - case 3: - sc->res = VKFFT_ERROR_MATH_FAILED; - break; } break; } @@ -6243,16 +2311,8 @@ 
static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { PfAppendConversionStart(sc, out, in_1); if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); } else { switch (in_1->type % 10) { @@ -6261,31 +2321,19 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); PfAppendLine(sc); break; } PfAppendNumberLiteral(sc, out); } PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " / "); + sc->tempLen = sprintf(sc->tempStr, " * "); PfAppendLine(sc); PfAppendConversionStart(sc, out, in_2); if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); } else { switch (in_2->type % 10) { @@ -6294,162 +2342,17 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); PfAppendLine(sc); break; } PfAppendNumberLiteral(sc, out); } PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->res = VKFFT_ERROR_MATH_FAILED; - } sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i / in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i / in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)in_1->data.i / in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d / in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] / (long double)in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] / in_2->data.d); - PfAppendLine(sc); - break; - case 
3: - sc->res = VKFFT_ERROR_MATH_FAILED; - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - else - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - else - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " / "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->res = VKFFT_ERROR_MATH_FAILED; - } - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - break; - } + return; } else { @@ -6465,13 +2368,11 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i / in_2->data.i; + out->data.i = in_1->data.i * in_2->data.i; return; case 2: - out->data.i = (int64_t)(in_1->data.i / in_2->data.d); + out->data.i = (pfINT)(in_1->data.i * in_2->data.d); return; - case 3: - break; } } break; @@ -6481,18 +2382,14 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.i = (int64_t)(in_1->data.d / in_2->data.i); + out->data.i = (pfINT)(in_1->data.d * in_2->data.i); return; case 2: - out->data.i = (int64_t)(in_1->data.d / in_2->data.d); + out->data.i = (pfINT)(in_1->data.d * in_2->data.d); return; - case 3: - break; } } break; - case 3: - break; } } break; @@ -6507,60 +2404,24 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.d = (long double)(in_1->data.i / in_2->data.i); + out->data.d = (pfLD)(in_1->data.i * in_2->data.i); return; case 2: - out->data.d = (long double)in_1->data.i / in_2->data.d; + out->data.d = (pfLD)in_1->data.i * in_2->data.d; return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = in_1->data.d / (long double)in_2->data.i; - 
return; - case 2: - out->data.d = in_1->data.d / in_2->data.d; - return; - case 3: - break; } } break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: case 2: - break; - case 3: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.c[0] = in_1->data.c[0] / (long double)in_2->data.i; - out->data.c[1] = in_1->data.c[1] / (long double)in_2->data.i; + out->data.d = in_1->data.d * (pfLD)in_2->data.i; return; case 2: - out->data.c[0] = in_1->data.c[0] / in_2->data.d; - out->data.c[1] = in_1->data.c[1] / in_2->data.d; - return; - case 3: - sc->res = VKFFT_ERROR_MATH_FAILED; + out->data.d = in_1->data.d * in_2->data.d; return; } } @@ -6573,68 +2434,90 @@ static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { + +static inline void PfFMA3(VkFFTSpecializationConstantsLayout* sc, PfContainer* out_1, PfContainer* out_2, PfContainer* in_1, PfContainer* in_num, PfContainer* in_conj) { if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - switch (out->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", out->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - PfAppendLine(sc); - break; + PfFMA(sc, &out_1->data.c[0], &in_1->data.c[0], &in_num->data.c[0], &out_1->data.c[0]); + PfFMA(sc, &out_1->data.c[1], &in_conj->data.c[1], &in_num->data.c[0], &out_1->data.c[1]); + PfFMA(sc, &out_2->data.c[0], &in_1->data.c[1], &in_num->data.c[1], &out_2->data.c[0]); + PfFMA(sc, &out_2->data.c[1], &in_conj->data.c[0], &in_num->data.c[1], &out_2->data.c[1]); + /*out_1->data.c[0].data.d = in_1->data.c[0].data.d * in_num->data.c[0].data.d + out_1->data.c[0].data.d; + out_1->data.c[1].data.d = in_conj->data.c[1].data.d * in_num->data.c[0].data.d + out_1->data.c[1].data.d; + out_2->data.c[0].data.d = in_1->data.c[1].data.d * in_num->data.c[1].data.d + out_2->data.c[0].data.d; + out_2->data.c[1].data.d = in_conj->data.c[0].data.d * in_num->data.c[1].data.d + out_2->data.c[1].data.d; + */ + return; +} +static inline void PfFMA3_const_w(VkFFTSpecializationConstantsLayout* sc, PfContainer* out_1, PfContainer* out_2, PfContainer* in_1, PfContainer* in_num_x, PfContainer* in_num_y, PfContainer* in_conj, PfContainer* temp, PfContainer* tempx) { + if (sc->res != VKFFT_SUCCESS) return; + if (out_1->type > 100) { +#if(VKFFT_BACKEND==2) + if (((out_1->type%100)/10) < 2) { + PfMov(sc, &temp->data.c[0], &in_1->data.c[0]); + PfMov(sc, &temp->data.c[1], &in_conj->data.c[1]); + PfFMA(sc, out_1, temp, in_num_x, out_1); + + PfMov(sc, &temp->data.c[0], &in_1->data.c[1]); + PfMov(sc, &temp->data.c[1], &in_conj->data.c[0]); + PfFMA(sc, out_2, temp, in_num_y, out_2); + return; + } +#endif + //in_1 has to be same type as out + } + PfFMA(sc, &out_1->data.c[0], &in_1->data.c[0], in_num_x, &out_1->data.c[0]); + PfFMA(sc, &out_1->data.c[1], &in_conj->data.c[1], in_num_x, &out_1->data.c[1]); + PfFMA(sc, &out_2->data.c[0], &in_1->data.c[1], in_num_y, &out_2->data.c[0]); + PfFMA(sc, &out_2->data.c[1], &in_conj->data.c[0], in_num_y, &out_2->data.c[1]); + /*out_1->data.c[0].data.d = in_1->data.c[0].data.d * in_num_x->data.d + out_1->data.c[0].data.d; + out_1->data.c[1].data.d = in_conj->data.c[1].data.d * in_num_x->data.d + out_1->data.c[1].data.d; + out_2->data.c[0].data.d = 
in_1->data.c[1].data.d * in_num_y->data.d + out_2->data.c[0].data.d; + out_2->data.c[1].data.d = in_conj->data.c[0].data.d * in_num_y->data.d + out_2->data.c[1].data.d;*/ + return; +} + +//no quad implementation needed so far, will add later +static inline void PfDiv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { + if (sc->res != VKFFT_SUCCESS) return; + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + PfDiv(sc, &out->data.c[0], &in_1->data.c[0], in_2); + PfDiv(sc, &out->data.c[1], &in_1->data.c[1], in_2); + }else{ + PfDiv(sc, &out->data.c[0], in_1, in_2); + PfMov(sc, &out->data.c[1], &out->data.c[0]); } + return; + } + if (out->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, " = "); PfAppendLine(sc); if ((in_1->type < 100) && (in_2->type < 100)) { switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", (int64_t)ceil(in_1->data.i / (long double)in_2->data.i)); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil((long double)in_1->data.i / in_2->data.d)); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil((long double)in_1->data.i / in_2->data.c[0])); - PfAppendLine(sc); - break; - } - break; - case 2: + case 1: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / (long double)in_2->data.i)); + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i / in_2->data.i); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / in_2->data.d)); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / in_2->data.c[0])); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)((pfLD)in_1->data.i / in_2->data.d)); PfAppendLine(sc); break; } break; - case 3: + case 2: switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.c[0] / (long double)in_2->data.i)); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d / (pfLD)in_2->data.i)); PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.c[0] / in_2->data.d)); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) (in_1->data.d / in_2->data.d)); PfAppendLine(sc); break; - case 3: - sc->res = VKFFT_ERROR_MATH_FAILED; - break; } break; } @@ -6643,20 +2526,10 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer PfAppendLine(sc); } else { - sc->tempLen = sprintf(sc->tempStr, "ceil("); - PfAppendLine(sc); PfAppendConversionStart(sc, out, in_1); if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); } else { switch (in_1->type % 10) { @@ -6665,11 +2538,7 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); 
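+ // note: scalar floating-point constants are emitted via "%.17Le", which carries enough significant digits for the generated literal to round-trip an IEEE double exactly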
PfAppendLine(sc); break; } @@ -6680,16 +2549,8 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer PfAppendLine(sc); PfAppendConversionStart(sc, out, in_2); if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; - } + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); } else { switch (in_2->type % 10) { @@ -6698,11 +2559,7 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer PfAppendLine(sc); break; case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); PfAppendLine(sc); break; } @@ -6712,149 +2569,8 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { sc->res = VKFFT_ERROR_MATH_FAILED; } - sc->tempLen = sprintf(sc->tempStr, ");\n"); - PfAppendLine(sc); - } - switch (out->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); + sc->tempLen = sprintf(sc->tempStr, ";\n"); PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 1: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", (int64_t)ceil(in_1->data.i / (long double)in_2->data.i)); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil((long double)in_1->data.i / in_2->data.d)); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil((long double)in_1->data.i / in_2->data.c[1])); - PfAppendLine(sc); - break; - } - break; - case 2: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / (long double)in_2->data.i)); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / in_2->data.d)); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.d / in_2->data.c[1])); - PfAppendLine(sc); - break; - } - break; - case 3: - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.c[1] / (long double)in_2->data.i)); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", ceil(in_1->data.c[1] / in_2->data.d)); - PfAppendLine(sc); - break; - case 3: - sc->res = VKFFT_ERROR_MATH_FAILED; - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - else { - sc->tempLen = sprintf(sc->tempStr, "ceil("); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_1->data.s); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - else - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); - PfAppendLine(sc); - 
break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.d); - PfAppendLine(sc); - break; - case 3: - if ((in_2->type % 10) == 3) - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - else - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " / "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: case 2: - sc->tempLen = sprintf(sc->tempStr, "%s", in_2->data.s); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); - PfAppendLine(sc); - break; - case 2: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.d); - PfAppendLine(sc); - break; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { - sc->res = VKFFT_ERROR_MATH_FAILED; - } - sc->tempLen = sprintf(sc->tempStr, ");\n"); - PfAppendLine(sc); - } - break; } return; } @@ -6871,13 +2587,11 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i / in_2->data.i + (in_1->data.i % in_2->data.i != 0); + out->data.i = in_1->data.i / in_2->data.i; return; case 2: - out->data.i = (int64_t)ceil(in_1->data.i / in_2->data.d); + out->data.i = (pfINT)(in_1->data.i / in_2->data.d); return; - case 3: - break; } } break; @@ -6887,18 +2601,14 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer else { switch (in_2->type % 10) { case 1: - out->data.i = (int64_t)ceil(in_1->data.d / in_2->data.i); + out->data.i = (pfINT)(in_1->data.d / in_2->data.i); return; case 2: - out->data.i = (int64_t)ceil(in_1->data.d / in_2->data.d); + out->data.i = (pfINT)(in_1->data.d / in_2->data.d); return; - case 3: - break; } } break; - case 3: - break; } } break; @@ -6913,60 +2623,25 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer else { switch (in_2->type % 10) { case 1: - out->data.d = (long double)(in_1->data.i / in_2->data.i + (in_1->data.i % in_2->data.i != 0)); - return; - case 2: - out->data.d = (long double)ceil(in_1->data.i / in_2->data.d); - return; - case 3: - break; - } - } - break; - case 2: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.d = ceil(in_1->data.d / in_2->data.i); + out->data.d = (pfLD)(in_1->data.i / in_2->data.i); return; case 2: - out->data.d = ceil(in_1->data.d / in_2->data.d); + out->data.d = (pfLD)in_1->data.i / in_2->data.d; return; - case 3: - break; } } break; - case 3: - break; - } - } - break; - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: case 2: - break; - case 3: if (in_2->type > 100) { } else { switch (in_2->type % 10) { case 1: - out->data.c[0] = ceil(in_1->data.c[0] / in_2->data.i); - out->data.c[1] = ceil(in_1->data.c[1] / in_2->data.i); + out->data.d = in_1->data.d / (pfLD)in_2->data.i; return; case 2: - out->data.c[0] = ceil(in_1->data.c[0] / in_2->data.d); - out->data.c[1] = ceil(in_1->data.c[1] / in_2->data.d); + out->data.d = 
in_1->data.d / in_2->data.d; return; - case 3: - break; } } break; @@ -6978,95 +2653,146 @@ static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer sc->res = VKFFT_ERROR_MATH_FAILED; return; } - -static inline void PfMod(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { +static inline void PfDivCeil(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { if (sc->res != VKFFT_SUCCESS) return; + if ((out->type % 10) == 3){ + if ((in_1->type % 10) == 3){ + PfDivCeil(sc, &out->data.c[0], &in_1->data.c[0], in_2); + PfDivCeil(sc, &out->data.c[1], &in_1->data.c[1], in_2); + }else{ + PfDivCeil(sc, &out->data.c[0], in_1, in_2); + PfMov(sc, &out->data.c[1], &out->data.c[0]); + } + return; + } if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 1: + sc->tempLen = sprintf(sc->tempStr, "%s", out->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, " = "); + PfAppendLine(sc); + if ((in_1->type < 100) && (in_2->type < 100)) { + switch (in_1->type % 10) { + case 1: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", (pfINT)pfceil(in_1->data.i / (pfLD)in_2->data.i)); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)pfceil((pfLD)in_1->data.i / in_2->data.d)); + PfAppendLine(sc); + break; + } + break; + case 2: + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)pfceil(in_1->data.d / (pfLD)in_2->data.i)); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double)pfceil(in_1->data.d / in_2->data.d)); + PfAppendLine(sc); + break; + } + break; + } + PfAppendNumberLiteral(sc, out); + sc->tempLen = sprintf(sc->tempStr, ";\n"); + PfAppendLine(sc); + } + else { + sc->tempLen = sprintf(sc->tempStr, "ceil("); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_1); if (in_1->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_1->name); + PfAppendLine(sc); + } + else { switch (in_1->type % 10) { case 1: - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s %% %s;\n", out->data.s, in_1->data.s, in_2->data.s); - PfAppendLine(sc); - return; - case 2: - break; - case 3: - break; - } - } - else { - switch (in_2->type % 10) { - case 1: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s %% %" PRIi64 ";\n", out->data.s, in_1->data.s, in_2->data.i); - PfAppendLine(sc); - return; - case 2: - break; - case 3: - break; - } - } + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_1->data.i); + PfAppendLine(sc); break; case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_1->data.d); + PfAppendLine(sc); break; - case 3: + } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_1); + sc->tempLen = sprintf(sc->tempStr, " / "); + PfAppendLine(sc); + PfAppendConversionStart(sc, out, in_2); + if (in_2->type > 100) { + sc->tempLen = sprintf(sc->tempStr, "%s", in_2->name); + PfAppendLine(sc); + } + else { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "%" PRIi64 "", in_2->data.i); + PfAppendLine(sc); + break; + case 2: + sc->tempLen = sprintf(sc->tempStr, "%.17Le", (long double) in_2->data.d); + PfAppendLine(sc); break; } + PfAppendNumberLiteral(sc, out); + } + PfAppendConversionEnd(sc, out, in_2); + if (((in_1->type % 10) == 3) && ((in_2->type % 10) == 3)) { + 
sc->res = VKFFT_ERROR_MATH_FAILED; + } + sc->tempLen = sprintf(sc->tempStr, ");\n"); + PfAppendLine(sc); + } + return; + } + else { + switch (out->type % 10) { + case 1: + if (in_1->type > 100) { } else { switch (in_1->type % 10) { case 1: if (in_2->type > 100) { + } + else { switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %" PRIi64 " %% %s;\n", out->data.s, in_1->data.i, in_2->data.s); - PfAppendLine(sc); + out->data.i = in_1->data.i / in_2->data.i + (in_1->data.i % in_2->data.i != 0); return; case 2: - break; - case 3: - break; + out->data.i = (pfINT)pfceil(in_1->data.i / in_2->data.d); + return; } } + break; + case 2: + if (in_2->type > 100) { + } else { switch (in_2->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %" PRIi64 ";\n", out->data.s, in_1->data.i % in_2->data.i); - PfAppendLine(sc); + out->data.i = (pfINT)pfceil(in_1->data.d / in_2->data.i); return; case 2: - break; - case 3: - break; + out->data.i = (pfINT)pfceil(in_1->data.d / in_2->data.d); + return; } } break; - case 2: - break; case 3: break; } } - break; - case 2: - break; - case 3: break; - } - } - else { - switch (out->type % 10) { - case 1: + case 2: if (in_1->type > 100) { } else { @@ -7077,25 +2803,40 @@ static inline void PfMod(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i % in_2->data.i; + out->data.d = (pfLD)(in_1->data.i / in_2->data.i + (in_1->data.i % in_2->data.i != 0)); + return; + case 2: + out->data.d = (pfLD)pfceil(in_1->data.i / in_2->data.d); return; } } - break; + break; + case 2: + if (in_2->type > 100) { + } + else { + switch (in_2->type % 10) { + case 1: + out->data.d = pfceil(in_1->data.d / in_2->data.i); + return; + case 2: + out->data.d = pfceil(in_1->data.d / in_2->data.d); + return; + } + } + break; + case 3: + break; } } break; - case 2: - break; - case 3: - break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { +static inline void PfMod(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { //in_1 has to be same type as out @@ -7108,7 +2849,7 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s && %s;\n", out->data.s, in_1->data.s, in_2->data.s); +%s = %s %% %s;\n", out->name, in_1->name, in_2->name); PfAppendLine(sc); return; case 2: @@ -7121,7 +2862,7 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s && %" PRIi64 ";\n", out->data.s, in_1->data.s, in_2->data.i); +%s = %s %% %" PRIi64 ";\n", out->name, in_1->name, in_2->data.i); PfAppendLine(sc); return; case 2: @@ -7144,7 +2885,7 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %" PRIi64 " && %s;\n", out->data.s, in_1->data.i, in_2->data.s); +%s = %" PRIi64 " %% %s;\n", out->name, in_1->data.i, in_2->name); PfAppendLine(sc); return; case 2: @@ -7157,7 +2898,7 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %d;\n", out->data.s, 
in_1->data.i && in_2->data.i); +%s = %" PRIi64 ";\n", out->name, in_1->data.i % in_2->data.i); PfAppendLine(sc); return; case 2: @@ -7173,7 +2914,7 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou break; } } - break; + break; case 2: break; case 3: @@ -7193,11 +2934,11 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou else { switch (in_2->type % 10) { case 1: - out->data.i = in_1->data.i && in_2->data.i; + out->data.i = in_1->data.i % in_2->data.i; return; } } - break; + break; } } break; @@ -7210,7 +2951,8 @@ static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* ou sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { + +static inline void PfAnd(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { //in_1 has to be same type as out @@ -7223,7 +2965,7 @@ static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s || %s;\n", out->data.s, in_1->data.s, in_2->data.s); +%s = %s && %s;\n", out->name, in_1->name, in_2->name); PfAppendLine(sc); return; case 2: @@ -7236,7 +2978,7 @@ static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s || %" PRIi64 ";\n", out->data.s, in_1->data.s, in_2->data.i); +%s = %s && %" PRIi64 ";\n", out->name, in_1->name, in_2->data.i); PfAppendLine(sc); return; case 2: @@ -7259,7 +3001,7 @@ static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %" PRIi64 " || %s;\n", out->data.s, in_1->data.i, in_2->data.s); +%s = %" PRIi64 " && %s;\n", out->name, in_1->data.i, in_2->name); PfAppendLine(sc); return; case 2: @@ -7272,7 +3014,7 @@ static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out switch (in_2->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -%s = %d;\n", out->data.s, in_1->data.i || in_2->data.i); +%s = %d;\n", out->name, in_1->data.i && in_2->data.i); PfAppendLine(sc); return; case 2: @@ -7297,436 +3039,220 @@ static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out } else { switch (out->type % 10) { - case 1: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 1: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 1: - out->data.i = in_1->data.i || in_2->data.i; - return; - } - } - break; - } - } - break; - case 2: - break; - case 3: - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} - - -static inline void PfSinCos(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 3: - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 2: - switch ((out->type / 10) % 10) { - case 0: case 1: -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = cos(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = sin(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#elif ((VKFFT_BACKEND == 1) || (VKFFT_BACKEND 
== 2)) - sc->tempLen = sprintf(sc->tempStr, "\ -__sincosf(%s, &%s.y, &%s.x);\n", in_1->data.s, out->data.s, out->data.s); - PfAppendLine(sc); -#elif ((VKFFT_BACKEND == 3) || (VKFFT_BACKEND == 4)) - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = native_cos(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = native_sin(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#elif (VKFFT_BACKEND == 5) - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = cos(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = sin(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#endif - return; - case 2: -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "\ -%s = sincos20(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#elif ((VKFFT_BACKEND == 1) || (VKFFT_BACKEND == 2)) - sc->tempLen = sprintf(sc->tempStr, "\ -sincos(%s, &%s.y, &%s.x);\n", in_1->data.s, out->data.s, out->data.s); - PfAppendLine(sc); -#elif ((VKFFT_BACKEND == 3) || (VKFFT_BACKEND == 4) || (VKFFT_BACKEND == 5)) - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = sincos(%s, &%s.x);\n", out->data.s, in_1->data.s, out->data.s); - PfAppendLine(sc); -#endif - return; - } - } - } - else { - switch (in_1->type % 10) { - case 2: - sc->tempLen = sprintf(sc->tempStr, "\ -%s.x = %.17Le;\n", out->data.s, cos(in_1->data.d)); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ -%s.y = %.17Le;\n", out->data.s, sin(in_1->data.d)); - PfAppendLine(sc); - return; - } - } - } - } - else { - switch (out->type % 10) { - case 3: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 2: - out->data.c[0] = cos(in_1->data.d); - out->data.c[1] = sin(in_1->data.d); - return; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfNorm(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 2: - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %s.x*%s.x + %s.y * %s.y;\n", out->data.s, in_1->data.s, in_1->data.s, in_1->data.s, in_1->data.s); - PfAppendLine(sc); - } - } - else { - switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %.17Le;\n", out->data.s, in_1->data.c[0] * in_1->data.c[0] + in_1->data.c[1] * in_1->data.c[1]); - PfAppendLine(sc); - return; - } - } - } - } - else { - switch (out->type % 10) { - case 2: - if (in_1->type > 100) { - } - else { - switch (in_1->type % 10) { - case 3: - out->data.d = in_1->data.c[0] * in_1->data.c[0] + in_1->data.c[1] * in_1->data.c[1]; - return; - } - } - break; - } - } - sc->res = VKFFT_ERROR_MATH_FAILED; - return; -} -static inline void PfRsqrt(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { - if (sc->res != VKFFT_SUCCESS) return; - if (out->type > 100) { - //in_1 has to be same type as out - switch (out->type % 10) { - case 2: - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 2: -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "\ -%s = inversesqrt(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#else - sc->tempLen = sprintf(sc->tempStr, "\ -%s = rsqrt(%s);\n", out->data.s, in_1->data.s); - PfAppendLine(sc); -#endif - } - } - else { - switch (in_1->type % 10) { - case 2: - sc->tempLen = sprintf(sc->tempStr, "\ -%s = %.17Le;\n", 
out->data.s, 1.0l / sqrt(in_1->data.d)); - PfAppendLine(sc); - return; - } - } - } - } - else { - switch (out->type % 10) { - case 2: + case 1: if (in_1->type > 100) { } else { switch (in_1->type % 10) { - case 2: - out->data.d = 1.0l / sqrt(in_1->data.d); - return; + case 1: + if (in_2->type > 100) { + } + else { + switch (in_2->type % 10) { + case 1: + out->data.i = in_1->data.i && in_2->data.i; + return; + } + } + break; } } break; + case 2: + break; + case 3: + break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } - -static inline void PfConjugate(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { +static inline void PfOr(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { //in_1 has to be same type as out switch (out->type % 10) { - case 3: + case 1: if (in_1->type > 100) { switch (in_1->type % 10) { - case 3: - if (strcmp(out->data.s, in_1->data.s)) { - sc->tempLen = sprintf(sc->tempStr, "%s.x = ", out->data.s); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.y = ", out->data.s); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "-%s.y", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); + case 1: + if (in_2->type > 100) { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %s || %s;\n", out->name, in_1->name, in_2->name); + PfAppendLine(sc); + return; + case 2: + break; + case 3: + break; + } } else { - sc->tempLen = sprintf(sc->tempStr, "%s.y = ", out->data.s); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, "-%s.y", in_1->data.s); - PfAppendLine(sc); - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %s || %" PRIi64 ";\n", out->name, in_1->name, in_2->data.i); + PfAppendLine(sc); + return; + case 2: + break; + case 3: + break; + } } - return; + break; + case 2: + break; + case 3: + break; } } else { switch (in_1->type % 10) { + case 1: + if (in_2->type > 100) { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %" PRIi64 " || %s;\n", out->name, in_1->data.i, in_2->name); + PfAppendLine(sc); + return; + case 2: + break; + case 3: + break; + } + } + else { + switch (in_2->type % 10) { + case 1: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %d;\n", out->name, in_1->data.i || in_2->data.i); + PfAppendLine(sc); + return; + case 2: + break; + case 3: + break; + } + } + break; + case 2: + break; case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x = %.17Le", out->data.s, in_1->data.c[0]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "%s.x = %.17Le", out->data.s, -in_1->data.c[1]); - PfAppendLine(sc); - PfAppendNumberLiteral(sc, out); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - return; + break; } } break; + case 2: + break; + case 3: + break; } } else { switch (out->type % 10) { - case 3: + case 1: if 
(in_1->type > 100) { } else { switch (in_1->type % 10) { - case 3: - out->data.c[0] = in_1->data.c[0]; - out->data.c[1] = -in_1->data.c[1]; - return; + case 1: + if (in_2->type > 100) { + } + else { + switch (in_2->type % 10) { + case 1: + out->data.i = in_1->data.i || in_2->data.i; + return; + } + } + break; } } + break; + case 2: + break; + case 3: + break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfShuffleComplex(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { + +static inline void PfSinCos(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { - if (strcmp(out->data.s, in_2->data.s)) { - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.x", temp->data.s); - } - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 3: - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] - in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - } - else { - PfAppendConversionStart(sc, out, in_1); - if (in_1->type > 100) { - switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - if (strcmp(out->data.s, in_2->data.s)) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.y", temp->data.s); - } - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 3: - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] + in_2->data.c[0]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - } - else { - PfAppendConversionStart(sc, out, in_1); + //in_1 has to be same type as out + switch (out->type % 10) { + case 3: if (in_1->type > 100) { switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - ; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 3: - sc->tempLen = 
sprintf(sc->tempStr, "%s.x", in_2->data.s); - PfAppendLine(sc); - break; + case 2: + switch ((out->type / 10) % 10) { + case 0: case 1: +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "\ +%s.x = cos(%s);\n", out->name, in_1->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s.y = sin(%s);\n", out->name, in_1->name); + PfAppendLine(sc); +#elif ((VKFFT_BACKEND == 1) || (VKFFT_BACKEND == 2)) + sc->tempLen = sprintf(sc->tempStr, "\ +__sincosf(%s, &%s.y, &%s.x);\n", in_1->name, out->name, out->name); + PfAppendLine(sc); +#elif ((VKFFT_BACKEND == 3) || (VKFFT_BACKEND == 4)) + sc->tempLen = sprintf(sc->tempStr, "\ +%s.x = native_cos(%s);\n", out->name, in_1->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s.y = native_sin(%s);\n", out->name, in_1->name); + PfAppendLine(sc); +#elif (VKFFT_BACKEND == 5) + sc->tempLen = sprintf(sc->tempStr, "\ +%s.x = cos(%s);\n", out->name, in_1->name); + PfAppendLine(sc); + sc->tempLen = sprintf(sc->tempStr, "\ +%s.y = sin(%s);\n", out->name, in_1->name); + PfAppendLine(sc); +#endif + return; + case 2: +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "\ +%s = sincos20(%s);\n", out->name, in_1->name); + PfAppendLine(sc); +#elif ((VKFFT_BACKEND == 1) || (VKFFT_BACKEND == 2)) + sc->tempLen = sprintf(sc->tempStr, "\ +sincos(%s, &%s.y, &%s.x);\n", in_1->name, out->name, out->name); + PfAppendLine(sc); +#elif ((VKFFT_BACKEND == 3) || (VKFFT_BACKEND == 4) || (VKFFT_BACKEND == 5)) + sc->tempLen = sprintf(sc->tempStr, "\ +%s.y = sincos(%s, &%s.x);\n", out->name, in_1->name, out->name); + PfAppendLine(sc); +#endif + return; + } } } else { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); + switch (in_1->type % 10) { + case 2: + sc->tempLen = sprintf(sc->tempStr, "\ +%s.x = %.17Le;\n", out->name, (long double)pfcos(in_1->data.d)); PfAppendLine(sc); - break; + sc->tempLen = sprintf(sc->tempStr, "\ +%s.y = %.17Le;\n", out->name, (long double)pfsin(in_1->data.d)); + PfAppendLine(sc); + return; } - PfAppendNumberLiteral(sc, out); } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - if (!strcmp(out->data.s, in_2->data.s)) { - PfMov(sc, out, temp); } - return; } else { switch (out->type % 10) { @@ -7735,200 +3261,142 @@ static inline void PfShuffleComplex(VkFFTSpecializationConstantsLayout* sc, PfCo } else { switch (in_1->type % 10) { - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 3: - out->data.c[0] = in_1->data.c[0] - in_2->data.c[1]; - out->data.c[1] = in_1->data.c[1] + in_2->data.c[0]; - return; - } - } - break; + case 2: + out->data.c[0].data.d = pfcos(in_1->data.d); + out->data.c[1].data.d = pfsin(in_1->data.d); + return; } } + break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } -static inline void PfShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { +static inline void PfNorm(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { if (sc->res != VKFFT_SUCCESS) return; if (out->type > 100) { - if (strcmp(out->data.s, in_2->data.s)) { - sc->tempLen = sprintf(sc->tempStr, "%s.x", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.x", temp->data.s); - } - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 3: - switch 
(in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0] + in_2->data.c[1]); - PfAppendLine(sc); - break; - } - break; - } - PfAppendNumberLiteral(sc, out); - } - else { - PfAppendConversionStart(sc, out, in_1); + //in_1 has to be same type as out + switch (out->type % 10) { + case 2: if (in_1->type > 100) { switch (in_1->type % 10) { case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_1->data.s); + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %s.x*%s.x + %s.y * %s.y;\n", out->name, in_1->name, in_1->name, in_1->name, in_1->name); PfAppendLine(sc); - break; } } else { switch (in_1->type % 10) { case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[0]); + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %.17Le;\n", out->name, (long double)(in_1->data.c[0].data.d * in_1->data.c[0].data.d + in_1->data.c[1].data.d * in_1->data.c[1].data.d)); PfAppendLine(sc); - break; + return; } - PfAppendNumberLiteral(sc, out); } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " + "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_2->data.s); - PfAppendLine(sc); - break; - } + } + } + else { + switch (out->type % 10) { + case 2: + if (in_1->type > 100) { } else { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[1]); - PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); - } - if (strcmp(out->data.s, in_2->data.s)) { - sc->tempLen = sprintf(sc->tempStr, "%s.y", out->data.s); - } - else { - sc->tempLen = sprintf(sc->tempStr, "%s.y", temp->data.s); - } - PfAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, " = "); - PfAppendLine(sc); - if ((in_1->type < 100) && (in_2->type < 100)) { - switch (in_1->type % 10) { - case 3: - switch (in_2->type % 10) { + switch (in_1->type % 10) { case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1] - in_2->data.c[0]); - PfAppendLine(sc); - break; + out->data.d = in_1->data.c[0].data.d * in_1->data.c[0].data.d + in_1->data.c[1].data.d * in_1->data.c[1].data.d; + return; } - break; } - PfAppendNumberLiteral(sc, out); + break; } - else { - PfAppendConversionStart(sc, out, in_1); + } + sc->res = VKFFT_ERROR_MATH_FAILED; + return; +} +static inline void PfRsqrt(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { + if (sc->res != VKFFT_SUCCESS) return; + if (out->type > 100) { + //in_1 has to be same type as out + switch (out->type % 10) { + case 2: if (in_1->type > 100) { switch (in_1->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.y", in_1->data.s); - PfAppendLine(sc); - break; - } - } - else { - switch (in_1->type % 10) { - ; - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_1->data.c[1]); + case 2: +#if(VKFFT_BACKEND==0) + sc->tempLen = sprintf(sc->tempStr, "\ +%s = inversesqrt(%s);\n", out->name, in_1->name); PfAppendLine(sc); - break; - } - PfAppendNumberLiteral(sc, out); - } - PfAppendConversionEnd(sc, out, in_1); - sc->tempLen = sprintf(sc->tempStr, " - "); - PfAppendLine(sc); - PfAppendConversionStart(sc, out, in_2); - if (in_2->type > 100) { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%s.x", in_2->data.s); +#else + sc->tempLen = sprintf(sc->tempStr, "\ +%s = rsqrt(%s);\n", out->name, in_1->name); 
PfAppendLine(sc); - break; +#endif } } else { - switch (in_2->type % 10) { - case 3: - sc->tempLen = sprintf(sc->tempStr, "%.17Le", in_2->data.c[0]); + switch (in_1->type % 10) { + case 2: + sc->tempLen = sprintf(sc->tempStr, "\ +%s = %.17Le;\n", out->name, (long double)(pfFPinit("1.0") / pfsqrt(in_1->data.d))); PfAppendLine(sc); - break; + return; } - PfAppendNumberLiteral(sc, out); } - PfAppendConversionEnd(sc, out, in_2); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - PfAppendLine(sc); } - if (!strcmp(out->data.s, in_2->data.s)) { - PfMov(sc, out, temp); - } - return; } else { switch (out->type % 10) { - case 3: + case 2: if (in_1->type > 100) { } else { switch (in_1->type % 10) { - case 3: - if (in_2->type > 100) { - } - else { - switch (in_2->type % 10) { - case 3: - out->data.c[0] = in_1->data.c[0] + in_2->data.c[1]; - out->data.c[1] = in_1->data.c[1] - in_2->data.c[0]; - return; - } - } - break; + case 2: + out->data.d = pfFPinit("1.0") / pfsqrt(in_1->data.d); + return; } } + break; } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } +static inline void PfConjugate(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1) { + if (sc->res != VKFFT_SUCCESS) return; + if (strcmp(out->name, in_1->name)) + PfMov(sc, &out->data.c[0], &in_1->data.c[0]); + PfMovNeg(sc, &out->data.c[1], &in_1->data.c[1]); + return; +} + +static inline void PfShuffleComplex(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { + if (sc->res != VKFFT_SUCCESS) return; + PfMovNeg(sc, &temp->data.c[0], &in_2->data.c[1]); + PfMov(sc, &temp->data.c[1], &in_2->data.c[0]); + PfAdd(sc, out, in_1, temp); + return; +} +static inline void PfShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, PfContainer* out, PfContainer* in_1, PfContainer* in_2, PfContainer* temp) { + if (sc->res != VKFFT_SUCCESS) return; + PfMov(sc, &temp->data.c[0], &in_2->data.c[1]); + PfMovNeg(sc, &temp->data.c[1], &in_2->data.c[0]); + PfAdd(sc, out, in_1, temp); + + return; +} + //logic functions: if, ge, gt, le, lt, etc. 
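/* Reviewer note, not part of the patch: the rewritten PfShuffleComplex / PfShuffleComplexInv
   above reproduce the previously hand-emitted codegen by composing PfMovNeg, PfMov and PfAdd
   through the scratch container temp (temp is overwritten from in_2 before the add). Assuming
   complex containers, the generated result is:
     PfShuffleComplex:    out = in_1 + i*in_2   i.e.  out.x = in_1.x - in_2.y;  out.y = in_1.y + in_2.x;
     PfShuffleComplexInv: out = in_1 - i*in_2   i.e.  out.x = in_1.x + in_2.y;  out.y = in_1.y - in_2.x;
   Worked example with hypothetical constants in_1 = (1, 2), in_2 = (3, 4):
     PfShuffleComplex    -> (1 - 4, 2 + 3) = (-3,  5)
     PfShuffleComplexInv -> (1 + 4, 2 - 3) = ( 5, -1) */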
static inline void PfIf_eq_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right) { if (sc->res != VKFFT_SUCCESS) return; if (left->type > 100) { if (right->type > 100) { sc->tempLen = sprintf(sc->tempStr, "\ -if (%s == %s) {\n", left->data.s, right->data.s); +if (%s == %s) {\n", left->name, right->name); PfAppendLine(sc); return; } @@ -7936,16 +3404,14 @@ if (%s == %s) {\n", left->data.s, right->data.s); switch (right->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s == %" PRIi64 ") {\n", left->data.s, right->data.i); +if (%s == %" PRIi64 ") {\n", left->name, right->data.i); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s == %.17Le) {\n", left->data.s, right->data.d); +if (%s == %.17Le) {\n", left->name, (long double)right->data.d); PfAppendLine(sc); return; - case 3: - break; } } } @@ -7954,16 +3420,14 @@ if (%s == %.17Le) {\n", left->data.s, right->data.d); switch (left->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%" PRIi64 " == %s) {\n", left->data.i, right->data.s); +if (%" PRIi64 " == %s) {\n", left->data.i, right->name); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%.17Le == %s) {\n", left->data.d, right->data.s); +if (%.17Le == %s) {\n", (long double)left->data.d, right->name); PfAppendLine(sc); return; - case 3: - break; } } else { @@ -7980,8 +3444,6 @@ if (%d) {\n", (left->data.i == right->data.i)); if (%d) {\n", (left->data.i == right->data.d)); PfAppendLine(sc); return; - case 3: - break; } break; case 2: @@ -7996,12 +3458,8 @@ if (%d) {\n", (left->data.d == right->data.i)); if (%d) {\n", (left->data.d == right->data.d)); PfAppendLine(sc); return; - case 3: - break; } return; - case 3: - break; } } } @@ -8013,7 +3471,7 @@ static inline void PfIf_lt_start(VkFFTSpecializationConstantsLayout* sc, PfConta if (left->type > 100) { if (right->type > 100) { sc->tempLen = sprintf(sc->tempStr, "\ -if (%s < %s) {\n", left->data.s, right->data.s); +if (%s < %s) {\n", left->name, right->name); PfAppendLine(sc); return; } @@ -8021,16 +3479,14 @@ if (%s < %s) {\n", left->data.s, right->data.s); switch (right->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s < %" PRIi64 ") {\n", left->data.s, right->data.i); +if (%s < %" PRIi64 ") {\n", left->name, right->data.i); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s < %.17Le) {\n", left->data.s, right->data.d); +if (%s < %.17Le) {\n", left->name, (long double)right->data.d); PfAppendLine(sc); return; - case 3: - break; } } } @@ -8039,16 +3495,14 @@ if (%s < %.17Le) {\n", left->data.s, right->data.d); switch (left->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%" PRIi64 " < %s) {\n", left->data.i, right->data.s); +if (%" PRIi64 " < %s) {\n", left->data.i, right->name); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%.17Le < %s) {\n", left->data.d, right->data.s); +if (%.17Le < %s) {\n", (long double)left->data.d, right->name); PfAppendLine(sc); return; - case 3: - break; } } else { @@ -8065,8 +3519,6 @@ if (%d) {\n", (left->data.i < right->data.i)); if (%d) {\n", (left->data.i < right->data.d)); PfAppendLine(sc); return; - case 3: - break; } break; case 2: @@ -8081,24 +3533,21 @@ if (%d) {\n", (left->data.d < right->data.i)); if (%d) {\n", (left->data.d < right->data.d)); PfAppendLine(sc); return; - case 3: - break; } return; - case 3: - break; } } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } + static inline void 
PfIf_le_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right) { if (sc->res != VKFFT_SUCCESS) return; if (left->type > 100) { if (right->type > 100) { sc->tempLen = sprintf(sc->tempStr, "\ -if (%s <= %s) {\n", left->data.s, right->data.s); +if (%s <= %s) {\n", left->name, right->name); PfAppendLine(sc); return; } @@ -8106,16 +3555,14 @@ if (%s <= %s) {\n", left->data.s, right->data.s); switch (right->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s <= %" PRIi64 ") {\n", left->data.s, right->data.i); +if (%s <= %" PRIi64 ") {\n", left->name, right->data.i); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s <= %.17Le) {\n", left->data.s, right->data.d); +if (%s <= %.17Le) {\n", left->name, (long double)right->data.d); PfAppendLine(sc); return; - case 3: - break; } } } @@ -8124,16 +3571,14 @@ if (%s <= %.17Le) {\n", left->data.s, right->data.d); switch (left->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%" PRIi64 " <= %s) {\n", left->data.i, right->data.s); +if (%" PRIi64 " <= %s) {\n", left->data.i, right->name); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%.17Le <= %s) {\n", left->data.d, right->data.s); +if (%.17Le <= %s) {\n", (long double)left->data.d, right->name); PfAppendLine(sc); return; - case 3: - break; } } else { @@ -8150,8 +3595,6 @@ if (%d) {\n", (left->data.i <= right->data.i)); if (%d) {\n", (left->data.i <= right->data.d)); PfAppendLine(sc); return; - case 3: - break; } break; case 2: @@ -8166,8 +3609,6 @@ if (%d) {\n", (left->data.d <= right->data.i)); if (%d) {\n", (left->data.d <= right->data.d)); PfAppendLine(sc); return; - case 3: - break; } return; case 3: @@ -8183,7 +3624,7 @@ static inline void PfIf_gt_start(VkFFTSpecializationConstantsLayout* sc, PfConta if (left->type > 100) { if (right->type > 100) { sc->tempLen = sprintf(sc->tempStr, "\ -if (%s > %s) {\n", left->data.s, right->data.s); +if (%s > %s) {\n", left->name, right->name); PfAppendLine(sc); return; } @@ -8191,16 +3632,14 @@ if (%s > %s) {\n", left->data.s, right->data.s); switch (right->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s > %" PRIi64 ") {\n", left->data.s, right->data.i); +if (%s > %" PRIi64 ") {\n", left->name, right->data.i); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s > %.17Le) {\n", left->data.s, right->data.d); +if (%s > %.17Le) {\n", left->name, (long double)right->data.d); PfAppendLine(sc); return; - case 3: - break; } } } @@ -8209,16 +3648,14 @@ if (%s > %.17Le) {\n", left->data.s, right->data.d); switch (left->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%" PRIi64 " > %s) {\n", left->data.i, right->data.s); +if (%" PRIi64 " > %s) {\n", left->data.i, right->name); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%.17Le > %s) {\n", left->data.d, right->data.s); +if (%.17Le > %s) {\n", (long double)left->data.d, right->name); PfAppendLine(sc); return; - case 3: - break; } } else { @@ -8235,8 +3672,6 @@ if (%d) {\n", (left->data.i > right->data.i)); if (%d) {\n", (left->data.i > right->data.d)); PfAppendLine(sc); return; - case 3: - break; } break; case 2: @@ -8251,24 +3686,21 @@ if (%d) {\n", (left->data.d > right->data.i)); if (%d) {\n", (left->data.d > right->data.d)); PfAppendLine(sc); return; - case 3: - break; } return; - case 3: - break; } } } sc->res = VKFFT_ERROR_MATH_FAILED; return; } + static inline void 
PfIf_ge_start(VkFFTSpecializationConstantsLayout* sc, PfContainer* left, PfContainer* right) { if (sc->res != VKFFT_SUCCESS) return; if (left->type > 100) { if (right->type > 100) { sc->tempLen = sprintf(sc->tempStr, "\ -if (%s >= %s) {\n", left->data.s, right->data.s); +if (%s >= %s) {\n", left->name, right->name); PfAppendLine(sc); return; } @@ -8276,16 +3708,14 @@ if (%s >= %s) {\n", left->data.s, right->data.s); switch (right->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s >= %" PRIi64 ") {\n", left->data.s, right->data.i); +if (%s >= %" PRIi64 ") {\n", left->name, right->data.i); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s >= %.17Le) {\n", left->data.s, right->data.d); +if (%s >= %.17Le) {\n", left->name, (long double)right->data.d); PfAppendLine(sc); return; - case 3: - break; } } } @@ -8294,16 +3724,14 @@ if (%s >= %.17Le) {\n", left->data.s, right->data.d); switch (left->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%" PRIi64 " >= %s) {\n", left->data.i, right->data.s); +if (%" PRIi64 " >= %s) {\n", left->data.i, right->name); PfAppendLine(sc); return; case 2: sc->tempLen = sprintf(sc->tempStr, "\ -if (%.17Le >= %s) {\n", left->data.d, right->data.s); +if (%.17Le >= %s) {\n", (long double)left->data.d, right->name); PfAppendLine(sc); return; - case 3: - break; } } else { @@ -8320,8 +3748,6 @@ if (%d) {\n", (left->data.i >= right->data.i)); if (%d) {\n", (left->data.i >= right->data.d)); PfAppendLine(sc); return; - case 3: - break; } break; case 2: @@ -8336,12 +3762,8 @@ if (%d) {\n", (left->data.d >= right->data.i)); if (%d) {\n", (left->data.d >= right->data.d)); PfAppendLine(sc); return; - case 3: - break; } return; - case 3: - break; } } } @@ -8362,13 +3784,11 @@ static inline void PfIfTrue(VkFFTSpecializationConstantsLayout* sc, PfContainer* switch (in->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (%s) {\n", in->data.s); +if (%s) {\n", in->name); PfAppendLine(sc); return; case 2: break; - case 3: - break; } } else { @@ -8380,8 +3800,6 @@ if (%" PRIi64 ") {\n", in->data.i); return; case 2: break; - case 3: - break; } } sc->res = VKFFT_ERROR_MATH_FAILED; @@ -8393,13 +3811,11 @@ static inline void PfIfFalse(VkFFTSpecializationConstantsLayout* sc, PfContainer switch (in->type % 10) { case 1: sc->tempLen = sprintf(sc->tempStr, "\ -if (!%s) {\n", in->data.s); +if (!%s) {\n", in->name); PfAppendLine(sc); return; case 2: break; - case 3: - break; } } else { @@ -8411,8 +3827,6 @@ if (!%" PRIi64 ") {\n", in->data.i); return; case 2: break; - case 3: - break; } } sc->res = VKFFT_ERROR_MATH_FAILED; @@ -8438,14 +3852,17 @@ static inline void PfPrintReg(VkFFTSpecializationConstantsLayout* sc, PfContaine if (in->type > 100) { switch (in->type % 10) { case 1: - sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%d\\n\", %s, %s);", inoutID->data.s, in->data.s); + sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%d\\n\", %s, %s);", inoutID->name, in->name); PfAppendLine(sc); return; case 2: - sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%f\\n\", %s, %s);", inoutID->data.s, in->data.s); + sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%.17e\\n\", %s, %s);", inoutID->name, in->name); PfAppendLine(sc); return; case 3: - sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%f %%f\\n\", %s, %s.x, %s.y);", inoutID->data.s, in->data.s, in->data.s); + if (((in->type/10) % 10) == 3) + sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%.17e %%.17e %%.17e %%.17e\\n\", %s, %s.x.x, %s.x.y, %s.y.x, %s.y.y);", 
inoutID->name, in->name, in->name, in->name, in->name); + else + sc->tempLen = sprintf(sc->tempStr, "printf(\"%%d %%f %%f\\n\", %s, %s.x, %s.y);", inoutID->name, in->name, in->name); PfAppendLine(sc); return; } @@ -8454,31 +3871,35 @@ static inline void PfPrintReg(VkFFTSpecializationConstantsLayout* sc, PfContaine return; } -static inline void PfPermute(VkFFTSpecializationConstantsLayout* sc, uint64_t* permute, uint64_t num_elem, uint64_t type, PfContainer* regIDs, PfContainer* temp) { +static inline void PfPermute(VkFFTSpecializationConstantsLayout* sc, pfUINT* permute, pfUINT num_elem, pfUINT type, PfContainer* regIDs, PfContainer* temp) { if (sc->res != VKFFT_SUCCESS) return; - char* temp_ID[33]; + PfContainer tempID[33] = VKFFT_ZERO_INIT; + for (int i = 0; i < num_elem; i++) { + tempID[i].type = 100 + sc->vecTypeCode; + PfAllocateContainerFlexible(sc, &tempID[i], 50); + } if (type == 0) { if (sc->locID[0].type > 100) { - for (uint64_t i = 0; i < num_elem; i++) - temp_ID[i] = sc->locID[i].data.s; - for (uint64_t i = 0; i < num_elem; i++) - sc->locID[i].data.s = temp_ID[permute[i]]; - return; + for (pfUINT i = 0; i < num_elem; i++) + PfCopyContainer(sc, &tempID[i], &sc->locID[i]); + for (pfUINT i = 0; i < num_elem; i++) + PfCopyContainer(sc, &sc->locID[i], &tempID[permute[i]]); } } if (type == 1) { if (regIDs[0].type > 100) { - for (uint64_t i = 0; i < num_elem; i++) - temp_ID[i] = regIDs[i].data.s; - for (uint64_t i = 0; i < num_elem; i++) - regIDs[i].data.s = temp_ID[permute[i]]; - return; + for (pfUINT i = 0; i < num_elem; i++) + PfCopyContainer(sc, &tempID[i], ®IDs[i]); + for (pfUINT i = 0; i < num_elem; i++) + PfCopyContainer(sc, ®IDs[i], &tempID[permute[i]]); } } - sc->res = VKFFT_ERROR_MATH_FAILED; + for (int i = 0; i < num_elem; i++) { + PfDeallocateContainer(sc, &tempID[i]); + } return; } -static inline void PfSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, PfContainer* in, PfContainer* out, uint64_t subWarpSplit) { +static inline void PfSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, PfContainer* in, PfContainer* out, pfUINT subWarpSplit) { if (sc->res != VKFFT_SUCCESS) return; #if (VKFFT_BACKEND==0) @@ -8511,4 +3932,4 @@ static inline void PfSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, PfConta return; } -#endif \ No newline at end of file +#endif diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_CompileKernel.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_CompileKernel.h index 736a933e..defc0d3b 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_CompileKernel.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_CompileKernel.h @@ -39,10 +39,10 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* char* code0 = axis->specializationConstants.code0; #if(VKFFT_BACKEND==0) uint32_t* code; - uint64_t codeSize; + pfUINT codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); @@ -50,8 +50,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + 
sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); } else { @@ -298,10 +298,10 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* } #elif(VKFFT_BACKEND==1) char* code; - uint64_t codeSize; + pfUINT codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); code = (char*)malloc(codeSize); if (!code) { free(code0); @@ -309,8 +309,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); } else { nvrtcProgram prog; @@ -329,6 +329,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } + int numOpts = 1; char* opts[5]; opts[0] = (char*)malloc(sizeof(char) * 50); if (!opts[0]) { @@ -342,12 +343,27 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* #else sprintf(opts[0], "--gpu-architecture=compute_%" PRIu64 "%" PRIu64 "", app->configuration.computeCapabilityMajor, app->configuration.computeCapabilityMinor); #endif + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory){ + opts[1] = (char*)malloc(sizeof(char) * 50); + if (!opts[1]) { + free(code0); + code0 = 0; + deleteVkFFT(app); + return VKFFT_ERROR_MALLOC_FAILED; + } + numOpts++; + sprintf(opts[1], "-fmad=false"); + } //result = nvrtcAddNameExpression(prog, "&consts"); //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result)); result = nvrtcCompileProgram(prog, // prog - 1, // numOptions + numOpts, // numOptions (const char* const*)opts); // options + free(opts[0]); + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) + free(opts[1]); + if (result != NVRTC_SUCCESS) { printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 4000000); @@ -444,7 +460,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } - if ((uint64_t)axis->specializationConstants.usedSharedMemory.data.i > app->configuration.sharedMemorySizeStatic) { + if ((pfUINT)axis->specializationConstants.usedSharedMemory.data.i > app->configuration.sharedMemorySizeStatic) { result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory.data.i); if (result2 != CUDA_SUCCESS) { printf("cuFuncSetAttribute error: %d\n", result2); @@ -475,10 +491,10 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* } #elif(VKFFT_BACKEND==2) uint32_t* code; - uint64_t codeSize; + pfUINT codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); code = 
(uint32_t*)malloc(codeSize); if (!code) { free(code0); @@ -486,8 +502,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); } else { @@ -515,10 +531,26 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* return VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION; } } + int numOpts = 0; + char* opts[5]; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory){ + opts[0] = (char*)malloc(sizeof(char) * 50); + if (!opts[0]) { + free(code0); + code0 = 0; + deleteVkFFT(app); + return VKFFT_ERROR_MALLOC_FAILED; + } + numOpts++; + sprintf(opts[0], "-ffp-contract=off"); + } result = hiprtcCompileProgram(prog, // prog - 0, // numOptions - 0); // options + numOpts, // numOptions + (const char**)opts); // options + + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) + free(opts[0]); if (result != HIPRTC_SUCCESS) { printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 100000); @@ -601,7 +633,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } - if ((uint64_t)axis->specializationConstants.usedSharedMemory.data.i > app->configuration.sharedMemorySizeStatic) { + if ((pfUINT)axis->specializationConstants.usedSharedMemory.data.i > app->configuration.sharedMemorySizeStatic) { result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory.data.i); //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); if (result2 != hipSuccess) { @@ -634,9 +666,9 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* #elif(VKFFT_BACKEND==3) if (app->configuration.loadApplicationFromString) { char* code; - uint64_t codeSize; + pfUINT codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { @@ -645,8 +677,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); const unsigned char* temp_code = (const unsigned char*)code; axis->program = clCreateProgramWithBinary(app->configuration.context[0], 1, app->configuration.device, &codeSize_size_t, (const unsigned char**)(&temp_code), 0, &res); if (res != CL_SUCCESS) { @@ -703,7 +735,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } - axis->binarySize = (uint64_t)codeSize; + axis->binarySize = (pfUINT)codeSize; axis->binary = 
(char*)malloc(axis->binarySize); if (!axis->binary) { free(code0); @@ -736,10 +768,10 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* } #elif(VKFFT_BACKEND==4) uint32_t* code; - uint64_t codeSize; + pfUINT codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); @@ -747,8 +779,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); const char* pBuildFlags = (app->configuration.useUint64) ? "-ze-opt-greater-than-4GB-buffer-required" : 0; ze_module_desc_t moduleDesc = { @@ -774,7 +806,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* } else { size_t codelen = strlen(code0); - uint64_t successOpen = 0; + pfUINT successOpen = 0; FILE* temp; char fname_cl[100]; char fname_bc[100]; @@ -803,7 +835,7 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* system(system_call); temp = fopen(fname_spv, "rb"); fseek(temp, 0L, SEEK_END); - uint64_t spv_size = ftell(temp); + pfUINT spv_size = ftell(temp); rewind(temp); uint8_t* spv_binary = (uint8_t*)malloc(spv_size); @@ -889,9 +921,9 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* NS::Error* error; if (app->configuration.loadApplicationFromString) { char* code; - uint64_t codeSize; + pfUINT codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); + memcpy(&codeSize, localStrPointer, sizeof(pfUINT)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { @@ -900,8 +932,8 @@ static inline VkFFTResult VkFFT_CompileKernel(VkFFTApplication* app, VkFFTAxis* deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); + memcpy(code, localStrPointer + sizeof(pfUINT), codeSize); + app->currentApplicationStringPos += codeSize + sizeof(pfUINT); dispatch_data_t data = dispatch_data_create(code, codeSize, 0, 0); axis->library = app->configuration.device->newLibrary(data, &error); free(code); diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DeletePlan.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DeletePlan.h index 9e5e3989..f37a0a5d 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DeletePlan.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DeletePlan.h @@ -23,8 +23,8 @@ #define VKFFT_DELETEPLAN_H #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" -static inline void deleteAxis(VkFFTApplication* app, VkFFTAxis* axis) { - if (axis->specializationConstants.numRaderPrimes) { +static inline void deleteAxis(VkFFTApplication* app, VkFFTAxis* axis, int isInverseBluesteinAxes) { + if (axis->specializationConstants.numRaderPrimes && (!isInverseBluesteinAxes)) { 
free(axis->specializationConstants.raderContainer); axis->specializationConstants.raderContainer = 0; axis->specializationConstants.numRaderPrimes = 0; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h index ecc9baf8..04e3f541 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_DispatchPlan.h @@ -23,22 +23,22 @@ #define VKFFT_DISPATCHPLAN_H #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" -static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* axis, uint64_t* dispatchBlock) { +static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* axis, pfUINT* dispatchBlock) { VkFFTResult resFFT = VKFFT_SUCCESS; if (axis->specializationConstants.swapComputeWorkGroupID == 1) { - uint64_t temp = dispatchBlock[0]; + pfUINT temp = dispatchBlock[0]; dispatchBlock[0] = dispatchBlock[1]; dispatchBlock[1] = temp; } if (axis->specializationConstants.swapComputeWorkGroupID == 2) { - uint64_t temp = dispatchBlock[0]; + pfUINT temp = dispatchBlock[0]; dispatchBlock[0] = dispatchBlock[2]; dispatchBlock[2] = temp; } - uint64_t blockNumber[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)app->configuration.maxComputeWorkGroupCount[0]),(uint64_t)ceil(dispatchBlock[1] / (double)app->configuration.maxComputeWorkGroupCount[1]),(uint64_t)ceil(dispatchBlock[2] / (double)app->configuration.maxComputeWorkGroupCount[2]) }; - uint64_t blockSize[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)blockNumber[0]), (uint64_t)ceil(dispatchBlock[1] / (double)blockNumber[1]), (uint64_t)ceil(dispatchBlock[2] / (double)blockNumber[2]) }; - uint64_t lastBlockSize[3] = { blockSize[0],blockSize[1],blockSize[2] }; - uint64_t dispatchSize[3] = { 1,1,1 }; + pfUINT blockNumber[3] = { (pfUINT)pfceil(dispatchBlock[0] / (double)app->configuration.maxComputeWorkGroupCount[0]),(pfUINT)pfceil(dispatchBlock[1] / (double)app->configuration.maxComputeWorkGroupCount[1]),(pfUINT)pfceil(dispatchBlock[2] / (double)app->configuration.maxComputeWorkGroupCount[2]) }; + pfUINT blockSize[3] = { (pfUINT)pfceil(dispatchBlock[0] / (double)blockNumber[0]), (pfUINT)pfceil(dispatchBlock[1] / (double)blockNumber[1]), (pfUINT)pfceil(dispatchBlock[2] / (double)blockNumber[2]) }; + pfUINT lastBlockSize[3] = { blockSize[0],blockSize[1],blockSize[2] }; + pfUINT dispatchSize[3] = { 1,1,1 }; if (blockNumber[0] == 0) blockNumber[0] = 1; if (blockNumber[1] == 0) blockNumber[1] = 1; if (blockNumber[2] == 0) blockNumber[2] = 1; @@ -56,11 +56,11 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a } //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", dispatchBlock[0], dispatchBlock[1], dispatchBlock[2]); //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", blockNumber[0], blockNumber[1], blockNumber[2]); - for (uint64_t i = 0; i < 3; i++) + for (pfUINT i = 0; i < 3; i++) if (blockNumber[i] == 1) blockSize[i] = dispatchBlock[i]; - for (uint64_t i = 0; i < blockNumber[0]; i++) { - for (uint64_t j = 0; j < blockNumber[1]; j++) { - for (uint64_t k = 0; k < blockNumber[2]; k++) { + for (pfUINT i = 0; i < blockNumber[0]; i++) { + for (pfUINT j = 0; j < blockNumber[1]; j++) { + for (pfUINT k = 0; k < blockNumber[2]; k++) { if (axis->pushConstants.workGroupShift[0] != i * blockSize[0]) { axis->pushConstants.workGroupShift[0] = i * blockSize[0]; axis->updatePushConstants = 1; @@ -75,41 +75,41 @@ static inline VkFFTResult 
VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a } if (axis->updatePushConstants) { if (app->configuration.useUint64) { - uint64_t offset = 0; - uint64_t temp = 0; + pfUINT offset = 0; + pfUINT temp = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { - memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[0], sizeof(uint64_t)); - offset+=sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[0], sizeof(pfUINT)); + offset+=sizeof(pfUINT); } if (axis->specializationConstants.performWorkGroupShift[1]) { - memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[1], sizeof(uint64_t)); - offset += sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[1], sizeof(pfUINT)); + offset += sizeof(pfUINT); } if (axis->specializationConstants.performWorkGroupShift[2]) { - memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[2], sizeof(uint64_t)); - offset += sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &axis->pushConstants.workGroupShift[2], sizeof(pfUINT)); + offset += sizeof(pfUINT); } if (axis->specializationConstants.performPostCompilationInputOffset) { temp = axis->specializationConstants.inputOffset.data.i / axis->specializationConstants.inputNumberByteSize; - memcpy(&axis->pushConstants.data[offset], &temp, sizeof(uint64_t)); - offset += sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &temp, sizeof(pfUINT)); + offset += sizeof(pfUINT); } if (axis->specializationConstants.performPostCompilationOutputOffset) { temp = axis->specializationConstants.outputOffset.data.i / axis->specializationConstants.outputNumberByteSize; - memcpy(&axis->pushConstants.data[offset], &temp, sizeof(uint64_t)); - offset += sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &temp, sizeof(pfUINT)); + offset += sizeof(pfUINT); } if (axis->specializationConstants.performPostCompilationKernelOffset) { if (axis->specializationConstants.kernelNumberByteSize != 0) temp = axis->specializationConstants.kernelOffset.data.i / axis->specializationConstants.kernelNumberByteSize; else temp = 0; - memcpy(&axis->pushConstants.data[offset], &temp, sizeof(uint64_t)); - offset += sizeof(uint64_t); + memcpy(&axis->pushConstants.data[offset], &temp, sizeof(pfUINT)); + offset += sizeof(pfUINT); } } else { - uint64_t offset = 0; + pfUINT offset = 0; uint32_t temp = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { temp = (uint32_t)axis->pushConstants.workGroupShift[0]; @@ -159,7 +159,7 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a CUresult result = CUDA_SUCCESS; args[0] = axis->inputBuffer; args[1] = axis->outputBuffer; - uint64_t args_id = 2; + pfUINT args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; args_id++; @@ -225,7 +225,7 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a void* args[10]; args[0] = axis->inputBuffer; args[1] = axis->outputBuffer; - uint64_t args_id = 2; + pfUINT args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; args_id++; @@ -300,7 +300,7 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } - uint64_t args_id = 2; + pfUINT args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = 
app->configuration.kernel; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); @@ -373,7 +373,7 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } - uint64_t args_id = 2; + pfUINT args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); @@ -439,9 +439,9 @@ static inline VkFFTResult VkFFT_DispatchPlan(VkFFTApplication* app, VkFFTAxis* a void* args[10]; app->configuration.commandEncoder->setBuffer(axis->inputBuffer[0], 0, 0); app->configuration.commandEncoder->setBuffer(axis->outputBuffer[0], 0, 1); - app->configuration.commandEncoder->setThreadgroupMemoryLength((uint64_t)ceil(axis->specializationConstants.usedSharedMemory.data.i / 16.0) * 16, 0); + app->configuration.commandEncoder->setThreadgroupMemoryLength((pfUINT)pfceil(axis->specializationConstants.usedSharedMemory.data.i / 16.0) * 16, 0); - uint64_t args_id = 2; + pfUINT args_id = 2; if (axis->specializationConstants.convolutionStep) { app->configuration.commandEncoder->setBuffer(app->configuration.kernel[0], 0, args_id); args_id++; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_InitAPIParameters.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_InitAPIParameters.h index 10978b08..da4fa295 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_InitAPIParameters.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_InitAPIParameters.h @@ -28,107 +28,121 @@ static inline VkFFTResult initMemoryParametersAPI(VkFFTApplication* app, VkFFTSpecializationConstantsLayout* sc) { VkFFTResult res = VKFFT_SUCCESS; + sc->halfLiteral.type = 300; PfAllocateContainerFlexible(sc, &sc->halfLiteral, 50); - sc->halfLiteral.type = 100; + sc->floatLiteral.type = 310; PfAllocateContainerFlexible(sc, &sc->floatLiteral, 50); - sc->floatLiteral.type = 110; + sc->doubleLiteral.type = 320; PfAllocateContainerFlexible(sc, &sc->doubleLiteral, 50); - sc->doubleLiteral.type = 120; + sc->halfDef.type = 302; PfAllocateContainerFlexible(sc, &sc->halfDef, 50); - sc->halfDef.type = 100; + sc->floatDef.type = 312; PfAllocateContainerFlexible(sc, &sc->floatDef, 50); - sc->floatDef.type = 110; + sc->doubleDef.type = 322; PfAllocateContainerFlexible(sc, &sc->doubleDef, 50); - sc->doubleDef.type = 120; + sc->quadDef.type = 322; + PfAllocateContainerFlexible(sc, &sc->quadDef, 50); + sc->half2Def.type = 303; PfAllocateContainerFlexible(sc, &sc->half2Def, 50); - sc->half2Def.type = 100; + sc->float2Def.type = 313; PfAllocateContainerFlexible(sc, &sc->float2Def, 50); - sc->float2Def.type = 110; + sc->double2Def.type = 323; PfAllocateContainerFlexible(sc, &sc->double2Def, 50); - sc->double2Def.type = 120; - + sc->quad2Def.type = 323; + PfAllocateContainerFlexible(sc, &sc->quad2Def, 50); + + sc->uintDef.type = 301; PfAllocateContainerFlexible(sc, &sc->uintDef, 50); - sc->uintDef.type = 100; + sc->intDef.type = 311; PfAllocateContainerFlexible(sc, &sc->intDef, 50); - sc->intDef.type = 110; + sc->uint64Def.type = 321; PfAllocateContainerFlexible(sc, &sc->uint64Def, 50); - sc->uint64Def.type = 120; + sc->int64Def.type = 331; PfAllocateContainerFlexible(sc, &sc->int64Def, 50); - sc->int64Def.type = 130; - + #if(VKFFT_BACKEND==0) - sprintf(sc->halfLiteral.data.s, "h"); - sprintf(sc->floatLiteral.data.s, "f"); - 
sprintf(sc->doubleLiteral.data.s, "LF"); - sprintf(sc->halfDef.data.s, "half"); - sprintf(sc->floatDef.data.s, "float"); - sprintf(sc->doubleDef.data.s, "double"); - sprintf(sc->half2Def.data.s, "f16vec2"); - sprintf(sc->float2Def.data.s, "vec2"); - sprintf(sc->double2Def.data.s, "dvec2"); - - sprintf(sc->intDef.data.s, "int"); - sprintf(sc->uintDef.data.s, "uint"); - sprintf(sc->int64Def.data.s, "int64_t"); - sprintf(sc->uint64Def.data.s, "uint64_t"); + sprintf(sc->halfLiteral.name, "h"); + sprintf(sc->floatLiteral.name, "f"); + sprintf(sc->doubleLiteral.name, "LF"); + sprintf(sc->halfDef.name, "float16_t"); + sprintf(sc->floatDef.name, "float"); + sprintf(sc->doubleDef.name, "double"); + sprintf(sc->quadDef.name, "dvec2"); + sprintf(sc->half2Def.name, "f16vec2"); + sprintf(sc->float2Def.name, "vec2"); + sprintf(sc->double2Def.name, "dvec2"); + sprintf(sc->quad2Def.name, "pf_quad2"); + + sprintf(sc->intDef.name, "int"); + sprintf(sc->uintDef.name, "uint"); + sprintf(sc->int64Def.name, "int64_t"); + sprintf(sc->uint64Def.name, "uint64_t"); #elif(VKFFT_BACKEND==1) - sprintf(sc->halfLiteral.data.s, "h"); - sprintf(sc->floatLiteral.data.s, "f"); - sprintf(sc->doubleLiteral.data.s, "l"); - sprintf(sc->halfDef.data.s, "half"); - sprintf(sc->floatDef.data.s, "float"); - sprintf(sc->doubleDef.data.s, "double"); - sprintf(sc->half2Def.data.s, "half2"); - sprintf(sc->float2Def.data.s, "float2"); - sprintf(sc->double2Def.data.s, "double2"); - - sprintf(sc->intDef.data.s, "int"); - sprintf(sc->uintDef.data.s, "unsigned int"); - sprintf(sc->int64Def.data.s, "long long"); - sprintf(sc->uint64Def.data.s, "unsigned long long"); + sprintf(sc->halfLiteral.name, "h"); + sprintf(sc->floatLiteral.name, "f"); + sprintf(sc->doubleLiteral.name, "l"); + sprintf(sc->halfDef.name, "half"); + sprintf(sc->floatDef.name, "float"); + sprintf(sc->doubleDef.name, "double"); + sprintf(sc->quadDef.name, "double2"); + sprintf(sc->half2Def.name, "half2"); + sprintf(sc->float2Def.name, "float2"); + sprintf(sc->double2Def.name, "double2"); + sprintf(sc->quad2Def.name, "pf_quad2"); + + sprintf(sc->intDef.name, "int"); + sprintf(sc->uintDef.name, "unsigned int"); + sprintf(sc->int64Def.name, "long long"); + sprintf(sc->uint64Def.name, "unsigned long long"); #elif(VKFFT_BACKEND==2) - sprintf(sc->halfLiteral.data.s, "h"); - sprintf(sc->floatLiteral.data.s, "f"); - sprintf(sc->doubleLiteral.data.s, "l"); - sprintf(sc->halfDef.data.s, "half"); - sprintf(sc->floatDef.data.s, "float"); - sprintf(sc->doubleDef.data.s, "double"); - sprintf(sc->half2Def.data.s, "half2"); - sprintf(sc->float2Def.data.s, "float2"); - sprintf(sc->double2Def.data.s, "double2"); - - sprintf(sc->intDef.data.s, "int"); - sprintf(sc->uintDef.data.s, "unsigned int"); - sprintf(sc->int64Def.data.s, "long long"); - sprintf(sc->uint64Def.data.s, "unsigned long long"); + sprintf(sc->halfLiteral.name, "h"); + sprintf(sc->floatLiteral.name, "f"); + sprintf(sc->doubleLiteral.name, "l"); + sprintf(sc->halfDef.name, "half"); + sprintf(sc->floatDef.name, "float"); + sprintf(sc->doubleDef.name, "double"); + sprintf(sc->quadDef.name, "double2"); + sprintf(sc->half2Def.name, "half2"); + sprintf(sc->float2Def.name, "float2"); + sprintf(sc->double2Def.name, "double2"); + sprintf(sc->quad2Def.name, "pf_quad2"); + + sprintf(sc->intDef.name, "int"); + sprintf(sc->uintDef.name, "unsigned int"); + sprintf(sc->int64Def.name, "long long"); + sprintf(sc->uint64Def.name, "unsigned long long"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(sc->halfLiteral.data.s, "h"); - 
sprintf(sc->floatLiteral.data.s, "f"); - sprintf(sc->halfDef.data.s, "half"); - sprintf(sc->floatDef.data.s, "float"); - sprintf(sc->doubleDef.data.s, "double"); - sprintf(sc->half2Def.data.s, "half2"); - sprintf(sc->float2Def.data.s, "float2"); - sprintf(sc->double2Def.data.s, "double2"); - - sprintf(sc->intDef.data.s, "int"); - sprintf(sc->uintDef.data.s, "unsigned int"); - sprintf(sc->int64Def.data.s, "long long"); - sprintf(sc->uint64Def.data.s, "unsigned long long"); + sprintf(sc->halfLiteral.name, "h"); + sprintf(sc->floatLiteral.name, "f"); + sprintf(sc->halfDef.name, "half"); + sprintf(sc->floatDef.name, "float"); + sprintf(sc->doubleDef.name, "double"); + sprintf(sc->quadDef.name, "double2"); + sprintf(sc->half2Def.name, "half2"); + sprintf(sc->float2Def.name, "float2"); + sprintf(sc->double2Def.name, "double2"); + sprintf(sc->quad2Def.name, "struct pf_quad2"); + + sprintf(sc->intDef.name, "int"); + sprintf(sc->uintDef.name, "unsigned int"); + sprintf(sc->int64Def.name, "long long"); + sprintf(sc->uint64Def.name, "unsigned long long"); #elif(VKFFT_BACKEND==5) - sprintf(sc->halfLiteral.data.s, "h"); - sprintf(sc->floatLiteral.data.s, "f"); - sprintf(sc->halfDef.data.s, "half"); - sprintf(sc->floatDef.data.s, "float"); - sprintf(sc->doubleDef.data.s, "double"); - sprintf(sc->half2Def.data.s, "half2"); - sprintf(sc->float2Def.data.s, "float2"); - sprintf(sc->double2Def.data.s, "double2"); - - sprintf(sc->intDef.data.s, "int"); - sprintf(sc->uintDef.data.s, "uint"); - sprintf(sc->int64Def.data.s, "long"); - sprintf(sc->uint64Def.data.s, "ulong"); + sprintf(sc->halfLiteral.name, "h"); + sprintf(sc->floatLiteral.name, "f"); + sprintf(sc->halfDef.name, "half"); + sprintf(sc->floatDef.name, "float"); + sprintf(sc->doubleDef.name, "double"); + sprintf(sc->quadDef.name, "double2"); + sprintf(sc->half2Def.name, "half2"); + sprintf(sc->float2Def.name, "float2"); + sprintf(sc->double2Def.name, "double2"); + sprintf(sc->quad2Def.name, "pf_quad2"); + + sprintf(sc->intDef.name, "int"); + sprintf(sc->uintDef.name, "uint"); + sprintf(sc->int64Def.name, "long"); + sprintf(sc->uint64Def.name, "ulong"); #endif @@ -179,6 +193,30 @@ static inline VkFFTResult initMemoryParametersAPI(VkFFTApplication* app, VkFFTSp sc->vecTypeInputMemoryCode = 23; sc->vecTypeOutputMemoryCode = 23; } + else if (app->configuration.quadDoubleDoublePrecision) { + sc->floatTypeCode = 32; + sc->vecTypeCode = 33; + + sc->floatTypeKernelMemoryCode = 32; + sc->floatTypeInputMemoryCode = 32; + sc->floatTypeOutputMemoryCode = 32; + + sc->vecTypeKernelMemoryCode = 33; + sc->vecTypeInputMemoryCode = 33; + sc->vecTypeOutputMemoryCode = 33; + } + else if (app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + sc->floatTypeCode = 32; + sc->vecTypeCode = 33; + + sc->floatTypeKernelMemoryCode = 22; + sc->floatTypeInputMemoryCode = 22; + sc->floatTypeOutputMemoryCode = 22; + + sc->vecTypeKernelMemoryCode = 23; + sc->vecTypeInputMemoryCode = 23; + sc->vecTypeOutputMemoryCode = 23; + } else { if (app->configuration.doublePrecisionFloatMemory) { sc->floatTypeCode = 22; @@ -224,276 +262,287 @@ static inline VkFFTResult initParametersAPI(VkFFTApplication* app, VkFFTSpeciali VkFFTResult res = VKFFT_SUCCESS; sc->tempStr = (char*)calloc(sc->maxTempLength, sizeof(char)); if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED; + char name[50]; sc->tempLen = 0; sc->currentLen = 0; - PfAllocateContainerFlexible(sc, &sc->inputsStruct, 50); sc->inputsStruct.type = 200 + sc->inputMemoryCode; - PfAllocateContainerFlexible(sc, &sc->outputsStruct, 50); + 
PfAllocateContainerFlexible(sc, &sc->inputsStruct, 50); + sc->outputsStruct.type = 200 + sc->outputMemoryCode; + PfAllocateContainerFlexible(sc, &sc->outputsStruct, 50); - PfAllocateContainerFlexible(sc, &sc->sdataStruct, 50); sc->sdataStruct.type = 200 + sc->vecTypeCode; - sprintf(sc->sdataStruct.data.s, "sdata"); + PfAllocateContainerFlexible(sc, &sc->sdataStruct, 50); + sprintf(name, "sdata"); + PfSetContainerName(sc, &sc->sdataStruct, name); - PfAllocateContainerFlexible(sc, &sc->LUTStruct, 50); sc->LUTStruct.type = 200 + sc->vecTypeCode; - sprintf(sc->LUTStruct.data.s, "twiddleLUT"); + PfAllocateContainerFlexible(sc, &sc->LUTStruct, 50); + sprintf(name, "twiddleLUT"); + PfSetContainerName(sc, &sc->LUTStruct, name); - PfAllocateContainerFlexible(sc, &sc->BluesteinStruct, 50); sc->BluesteinStruct.type = 200 + sc->vecTypeCode; - sprintf(sc->BluesteinStruct.data.s, "BluesteinMultiplication"); + PfAllocateContainerFlexible(sc, &sc->BluesteinStruct, 50); + sprintf(name, "BluesteinMultiplication"); + PfSetContainerName(sc, &sc->BluesteinStruct, name); - PfAllocateContainerFlexible(sc, &sc->BluesteinConvolutionKernelStruct, 50); sc->BluesteinConvolutionKernelStruct.type = 200 + sc->vecTypeCode; - sprintf(sc->BluesteinConvolutionKernelStruct.data.s, "BluesteinConvolutionKernel"); - - PfAllocateContainerFlexible(sc, &sc->kernelStruct, 50); + PfAllocateContainerFlexible(sc, &sc->BluesteinConvolutionKernelStruct, 50); + sprintf(name, "BluesteinConvolutionKernel"); + PfSetContainerName(sc, &sc->BluesteinConvolutionKernelStruct, name); + sc->kernelStruct.type = 200 + sc->vecTypeCode; - sprintf(sc->kernelStruct.data.s, "kernel_obj"); + PfAllocateContainerFlexible(sc, &sc->kernelStruct, 50); + sprintf(name, "kernel_obj"); + PfSetContainerName(sc, &sc->kernelStruct, name); for (int i = 0; i < sc->numRaderPrimes; i++) { if (sc->raderContainer[i].prime > 0) { if (sc->inline_rader_g_pow == 1) { + sc->raderContainer[i].g_powConstantStruct.type = 200 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->raderContainer[i].g_powConstantStruct, 50); - sc->BluesteinConvolutionKernelStruct.type = 201; - sprintf(sc->raderContainer[i].g_powConstantStruct.data.s, "g_pow_%d", sc->raderContainer[i].prime); + sprintf(name, "g_pow_%d", sc->raderContainer[i].prime); + PfSetContainerName(sc, &sc->raderContainer[i].g_powConstantStruct, name); } if (sc->inline_rader_kernel) { + sc->raderContainer[i].r_rader_kernelConstantStruct.type = 200 + sc->floatTypeCode; PfAllocateContainerFlexible(sc, &sc->raderContainer[i].r_rader_kernelConstantStruct, 50); - sc->BluesteinConvolutionKernelStruct.type = 200 + sc->floatTypeCode; - sprintf(sc->raderContainer[i].r_rader_kernelConstantStruct.data.s, "r_rader_kernel_%d", sc->raderContainer[i].prime); + sprintf(name, "r_rader_kernel_%d", sc->raderContainer[i].prime); + PfSetContainerName(sc, &sc->raderContainer[i].r_rader_kernelConstantStruct, name); + sc->raderContainer[i].i_rader_kernelConstantStruct.type = 200 + sc->floatTypeCode; PfAllocateContainerFlexible(sc, &sc->raderContainer[i].i_rader_kernelConstantStruct, 50); - sc->BluesteinConvolutionKernelStruct.type = 200 + sc->floatTypeCode; - sprintf(sc->raderContainer[i].i_rader_kernelConstantStruct.data.s, "i_rader_kernel_%d", sc->raderContainer[i].prime); + sprintf(name, "i_rader_kernel_%d", sc->raderContainer[i].prime); + PfSetContainerName(sc, &sc->raderContainer[i].i_rader_kernelConstantStruct, name); } } } if (sc->inline_rader_g_pow == 2) { + sc->g_powStruct.type = 200 + sc->uintType32Code; PfAllocateContainerFlexible(sc, 
&sc->g_powStruct, 50); - sc->g_powStruct.type = 201; - sprintf(sc->g_powStruct.data.s, "g_pow"); + sprintf(name, "g_pow"); + PfSetContainerName(sc, &sc->g_powStruct, name); } + sc->gl_LocalInvocationID_x.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_LocalInvocationID_x, 50); - sc->gl_LocalInvocationID_x.type = 101; + sc->gl_LocalInvocationID_y.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_LocalInvocationID_y, 50); - sc->gl_LocalInvocationID_y.type = 101; + sc->gl_LocalInvocationID_z.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_LocalInvocationID_z, 50); - sc->gl_LocalInvocationID_z.type = 101; + sc->gl_GlobalInvocationID_x.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_GlobalInvocationID_x, 50); - sc->gl_GlobalInvocationID_x.type = 101; + sc->gl_GlobalInvocationID_y.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_GlobalInvocationID_y, 50); - sc->gl_GlobalInvocationID_y.type = 101; + sc->gl_GlobalInvocationID_z.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_GlobalInvocationID_z, 50); - sc->gl_GlobalInvocationID_z.type = 101; + sc->gl_WorkGroupSize_x.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupSize_x, 50); - sc->gl_WorkGroupSize_x.type = 101; + sc->gl_WorkGroupSize_y.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupSize_y, 50); - sc->gl_WorkGroupSize_y.type = 101; + sc->gl_WorkGroupSize_z.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupSize_z, 50); - sc->gl_WorkGroupSize_z.type = 101; + sc->gl_WorkGroupID_x.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupID_x, 50); - sc->gl_WorkGroupID_x.type = 101; + sc->gl_WorkGroupID_y.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupID_y, 50); - sc->gl_WorkGroupID_y.type = 101; + sc->gl_WorkGroupID_z.type = 100 + sc->uintType32Code; PfAllocateContainerFlexible(sc, &sc->gl_WorkGroupID_z, 50); - sc->gl_WorkGroupID_z.type = 101; - + //PfAllocateContainerFlexible(sc, &sc->cosDef, 50); //sc->cosDef.type = 100; //PfAllocateContainerFlexible(sc, &sc->sinDef, 50); //sc->sinDef.type = 100; + sc->constDef.type = 300; PfAllocateContainerFlexible(sc, &sc->constDef, 50); - sc->constDef.type = 100; - + + sc->functionDef.type = 300; PfAllocateContainerFlexible(sc, &sc->functionDef, 50); - sc->functionDef.type = 100; - + if (sc->performWorkGroupShift[0]) { - PfAllocateContainerFlexible(sc, &sc->workGroupShiftX, 50); sc->workGroupShiftX.type = 100 + sc->uintTypeCode; - sprintf(sc->workGroupShiftX.data.s, "workGroupShiftX"); + PfAllocateContainerFlexible(sc, &sc->workGroupShiftX, 50); + sprintf(sc->workGroupShiftX.name, "workGroupShiftX"); } if (sc->performWorkGroupShift[1]) { - PfAllocateContainerFlexible(sc, &sc->workGroupShiftY, 50); sc->workGroupShiftY.type = 100 + sc->uintTypeCode; - sprintf(sc->workGroupShiftY.data.s, "workGroupShiftY"); + PfAllocateContainerFlexible(sc, &sc->workGroupShiftY, 50); + sprintf(sc->workGroupShiftY.name, "workGroupShiftY"); } if (sc->performWorkGroupShift[2]) { - PfAllocateContainerFlexible(sc, &sc->workGroupShiftZ, 50); sc->workGroupShiftZ.type = 100 + sc->uintTypeCode; - sprintf(sc->workGroupShiftZ.data.s, "workGroupShiftZ"); + PfAllocateContainerFlexible(sc, &sc->workGroupShiftZ, 50); + sprintf(sc->workGroupShiftZ.name, "workGroupShiftZ"); } if (sc->performPostCompilationInputOffset) { - 
PfAllocateContainerFlexible(sc, &sc->inputOffset, 50); sc->inputOffset.type = 100 + sc->uintTypeCode; - sprintf(sc->inputOffset.data.s, "inputOffset"); + PfAllocateContainerFlexible(sc, &sc->inputOffset, 50); + sprintf(sc->inputOffset.name, "inputOffset"); } if (sc->performPostCompilationOutputOffset) { - PfAllocateContainerFlexible(sc, &sc->outputOffset, 50); sc->outputOffset.type = 100 + sc->uintTypeCode; - sprintf(sc->outputOffset.data.s, "outputOffset"); + PfAllocateContainerFlexible(sc, &sc->outputOffset, 50); + sprintf(sc->outputOffset.name, "outputOffset"); } if (sc->performPostCompilationKernelOffset) { - PfAllocateContainerFlexible(sc, &sc->kernelOffset, 50); sc->kernelOffset.type = 100 + sc->uintTypeCode; - sprintf(sc->kernelOffset.data.s, "kernelOffset"); + PfAllocateContainerFlexible(sc, &sc->kernelOffset, 50); + sprintf(sc->kernelOffset.name, "kernelOffset"); } #if(VKFFT_BACKEND==0) - sprintf(sc->inputsStruct.data.s, "inputs"); - sprintf(sc->outputsStruct.data.s, "outputs"); - sprintf(sc->gl_LocalInvocationID_x.data.s, "gl_LocalInvocationID.x"); - sprintf(sc->gl_LocalInvocationID_y.data.s, "gl_LocalInvocationID.y"); - sprintf(sc->gl_LocalInvocationID_z.data.s, "gl_LocalInvocationID.z"); + sprintf(sc->inputsStruct.name, "inputs"); + sprintf(sc->outputsStruct.name, "outputs"); + sprintf(sc->gl_LocalInvocationID_x.name, "gl_LocalInvocationID.x"); + sprintf(sc->gl_LocalInvocationID_y.name, "gl_LocalInvocationID.y"); + sprintf(sc->gl_LocalInvocationID_z.name, "gl_LocalInvocationID.z"); switch (sc->swapComputeWorkGroupID) { case 0: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "gl_GlobalInvocationID.x"); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x.data.s, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_y.data.s, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "gl_WorkGroupID.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "gl_GlobalInvocationID.x"); + sprintf(sc->gl_GlobalInvocationID_y.name, "gl_GlobalInvocationID.y"); + sprintf(sc->gl_GlobalInvocationID_z.name, "gl_GlobalInvocationID.z"); + sprintf(sc->gl_WorkGroupID_x.name, "gl_WorkGroupID.x"); + sprintf(sc->gl_WorkGroupID_y.name, "gl_WorkGroupID.y"); + sprintf(sc->gl_WorkGroupID_z.name, "gl_WorkGroupID.z"); break; case 1: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * %" PRIi64 ")", sc->localSize[1].data.i); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x.data.s, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_y.data.s, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_z.data.s, "gl_WorkGroupID.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "gl_GlobalInvocationID.z"); + sprintf(sc->gl_WorkGroupID_x.name, "gl_WorkGroupID.y"); + sprintf(sc->gl_WorkGroupID_y.name, "gl_WorkGroupID.x"); + sprintf(sc->gl_WorkGroupID_z.name, "gl_WorkGroupID.z"); break; case 2: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * %" PRIi64 ")", sc->localSize[0].data.i); - 
sprintf(sc->gl_GlobalInvocationID_y.data.s, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "gl_WorkGroupID.z"); - sprintf(sc->gl_WorkGroupID_y.data.s, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "gl_WorkGroupID.x"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "gl_GlobalInvocationID.y"); + sprintf(sc->gl_GlobalInvocationID_z.name, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "gl_WorkGroupID.z"); + sprintf(sc->gl_WorkGroupID_y.name, "gl_WorkGroupID.y"); + sprintf(sc->gl_WorkGroupID_z.name, "gl_WorkGroupID.x"); break; } - sprintf(sc->gl_WorkGroupSize_x.data.s, "%" PRIi64 "", sc->localSize[0].data.i); - sprintf(sc->gl_WorkGroupSize_y.data.s, "%" PRIi64 "", sc->localSize[1].data.i); - sprintf(sc->gl_WorkGroupSize_z.data.s, "%" PRIi64 "", sc->localSize[2].data.i); - //sprintf(sc->cosDef.data.s, "cos"); - //sprintf(sc->sinDef.data.s, "sin"); - sprintf(sc->constDef.data.s, "const"); + sprintf(sc->gl_WorkGroupSize_x.name, "%" PRIi64 "", sc->localSize[0].data.i); + sprintf(sc->gl_WorkGroupSize_y.name, "%" PRIi64 "", sc->localSize[1].data.i); + sprintf(sc->gl_WorkGroupSize_z.name, "%" PRIi64 "", sc->localSize[2].data.i); + //sprintf(sc->cosDef.name, "cos"); + //sprintf(sc->sinDef.name, "sin"); + sprintf(sc->constDef.name, "const"); #elif((VKFFT_BACKEND==1) ||(VKFFT_BACKEND==2)) - sprintf(sc->inputsStruct.data.s, "inputs"); - sprintf(sc->outputsStruct.data.s, "outputs"); - sprintf(sc->gl_LocalInvocationID_x.data.s, "threadIdx.x"); - sprintf(sc->gl_LocalInvocationID_y.data.s, "threadIdx.y"); - sprintf(sc->gl_LocalInvocationID_z.data.s, "threadIdx.z"); + sprintf(sc->inputsStruct.name, "inputs"); + sprintf(sc->outputsStruct.name, "outputs"); + sprintf(sc->gl_LocalInvocationID_x.name, "threadIdx.x"); + sprintf(sc->gl_LocalInvocationID_y.name, "threadIdx.y"); + sprintf(sc->gl_LocalInvocationID_z.name, "threadIdx.z"); switch (sc->swapComputeWorkGroupID) { case 0: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(threadIdx.x + blockIdx.x * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(threadIdx.y + blockIdx.y * %" PRIi64 ")", sc->localSize[1].data.i); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "(threadIdx.z + blockIdx.z * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_y.data.s, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "blockIdx.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(threadIdx.x + blockIdx.x * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(threadIdx.y + blockIdx.y * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "(threadIdx.z + blockIdx.z * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "blockIdx.x"); + sprintf(sc->gl_WorkGroupID_y.name, "blockIdx.y"); + sprintf(sc->gl_WorkGroupID_z.name, "blockIdx.z"); break; case 1: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(threadIdx.x + blockIdx.y * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(threadIdx.y + blockIdx.x * %" PRIi64 ")", sc->localSize[1].data.i); - 
sprintf(sc->gl_GlobalInvocationID_z.data.s, "(threadIdx.z + blockIdx.z * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_y.data.s, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_z.data.s, "blockIdx.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(threadIdx.x + blockIdx.y * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(threadIdx.y + blockIdx.x * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "(threadIdx.z + blockIdx.z * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "blockIdx.y"); + sprintf(sc->gl_WorkGroupID_y.name, "blockIdx.x"); + sprintf(sc->gl_WorkGroupID_z.name, "blockIdx.z"); break; case 2: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(threadIdx.x + blockIdx.z * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(threadIdx.y + blockIdx.y * %" PRIi64 ")", sc->localSize[1].data.i); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "(threadIdx.z + blockIdx.x * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "blockIdx.z"); - sprintf(sc->gl_WorkGroupID_y.data.s, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "blockIdx.x"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(threadIdx.x + blockIdx.z * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(threadIdx.y + blockIdx.y * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "(threadIdx.z + blockIdx.x * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "blockIdx.z"); + sprintf(sc->gl_WorkGroupID_y.name, "blockIdx.y"); + sprintf(sc->gl_WorkGroupID_z.name, "blockIdx.x"); break; } - sprintf(sc->gl_WorkGroupSize_x.data.s, "%" PRIi64 "", sc->localSize[0].data.i); - sprintf(sc->gl_WorkGroupSize_y.data.s, "%" PRIi64 "", sc->localSize[1].data.i); - sprintf(sc->gl_WorkGroupSize_z.data.s, "%" PRIi64 "", sc->localSize[2].data.i); - //sprintf(sc->cosDef.data.s, "__cosf"); - //sprintf(sc->sinDef.data.s, "__sinf"); - sprintf(sc->constDef.data.s, "const"); - sprintf(sc->functionDef.data.s, "__device__ static __inline__ "); + sprintf(sc->gl_WorkGroupSize_x.name, "%" PRIi64 "", sc->localSize[0].data.i); + sprintf(sc->gl_WorkGroupSize_y.name, "%" PRIi64 "", sc->localSize[1].data.i); + sprintf(sc->gl_WorkGroupSize_z.name, "%" PRIi64 "", sc->localSize[2].data.i); + //sprintf(sc->cosDef.name, "__cosf"); + //sprintf(sc->sinDef.name, "__sinf"); + sprintf(sc->constDef.name, "const"); + sprintf(sc->functionDef.name, "__device__ static __inline__ "); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(sc->inputsStruct.data.s, "inputs"); - sprintf(sc->outputsStruct.data.s, "outputs"); - sprintf(sc->gl_LocalInvocationID_x.data.s, "get_local_id(0)"); - sprintf(sc->gl_LocalInvocationID_y.data.s, "get_local_id(1)"); - sprintf(sc->gl_LocalInvocationID_z.data.s, "get_local_id(2)"); + sprintf(sc->inputsStruct.name, "inputs"); + sprintf(sc->outputsStruct.name, "outputs"); + sprintf(sc->gl_LocalInvocationID_x.name, "get_local_id(0)"); + sprintf(sc->gl_LocalInvocationID_y.name, "get_local_id(1)"); + sprintf(sc->gl_LocalInvocationID_z.name, "get_local_id(2)"); switch (sc->swapComputeWorkGroupID) { case 0: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "get_global_id(0)"); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "get_global_id(2)"); - 
sprintf(sc->gl_WorkGroupID_x.data.s, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_y.data.s, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z.data.s, "get_group_id(2)"); + sprintf(sc->gl_GlobalInvocationID_x.name, "get_global_id(0)"); + sprintf(sc->gl_GlobalInvocationID_y.name, "get_global_id(1)"); + sprintf(sc->gl_GlobalInvocationID_z.name, "get_global_id(2)"); + sprintf(sc->gl_WorkGroupID_x.name, "get_group_id(0)"); + sprintf(sc->gl_WorkGroupID_y.name, "get_group_id(1)"); + sprintf(sc->gl_WorkGroupID_z.name, "get_group_id(2)"); break; case 1: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(get_local_id(0) + get_group_id(1) * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(get_local_id(1) + get_group_id(0) * %" PRIi64 ")", sc->localSize[1].data.i); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "get_global_id(2)"); - sprintf(sc->gl_WorkGroupID_x.data.s, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_y.data.s, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_z.data.s, "get_group_id(2)"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(get_local_id(0) + get_group_id(1) * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(get_local_id(1) + get_group_id(0) * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "get_global_id(2)"); + sprintf(sc->gl_WorkGroupID_x.name, "get_group_id(1)"); + sprintf(sc->gl_WorkGroupID_y.name, "get_group_id(0)"); + sprintf(sc->gl_WorkGroupID_z.name, "get_group_id(2)"); break; case 2: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(get_local_id(0) + get_group_id(2) * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "(get_local_id(2) + get_group_id(0) * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "get_group_id(2)"); - sprintf(sc->gl_WorkGroupID_y.data.s, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z.data.s, "get_group_id(0)"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(get_local_id(0) + get_group_id(2) * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "get_global_id(1)"); + sprintf(sc->gl_GlobalInvocationID_z.name, "(get_local_id(2) + get_group_id(0) * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "get_group_id(2)"); + sprintf(sc->gl_WorkGroupID_y.name, "get_group_id(1)"); + sprintf(sc->gl_WorkGroupID_z.name, "get_group_id(0)"); break; } - sprintf(sc->gl_WorkGroupSize_x.data.s, "%" PRIi64 "", sc->localSize[0].data.i); - sprintf(sc->gl_WorkGroupSize_y.data.s, "%" PRIi64 "", sc->localSize[1].data.i); - sprintf(sc->gl_WorkGroupSize_z.data.s, "%" PRIi64 "", sc->localSize[2].data.i); - //sprintf(sc->cosDef.data.s, "native_cos"); - //sprintf(sc->sinDef.data.s, "native_sin"); - sprintf(sc->constDef.data.s, "__constant"); - sprintf(sc->functionDef.data.s, "static __inline__ "); + sprintf(sc->gl_WorkGroupSize_x.name, "%" PRIi64 "", sc->localSize[0].data.i); + sprintf(sc->gl_WorkGroupSize_y.name, "%" PRIi64 "", sc->localSize[1].data.i); + sprintf(sc->gl_WorkGroupSize_z.name, "%" PRIi64 "", sc->localSize[2].data.i); + //sprintf(sc->cosDef.name, "native_cos"); + //sprintf(sc->sinDef.name, "native_sin"); + sprintf(sc->constDef.name, "__constant"); + sprintf(sc->functionDef.name, "static __inline__ "); #elif(VKFFT_BACKEND==5) - sprintf(sc->inputsStruct.data.s, "inputs"); - sprintf(sc->outputsStruct.data.s, "outputs"); - 
sprintf(sc->gl_LocalInvocationID_x.data.s, "thread_position_in_threadgroup.x"); - sprintf(sc->gl_LocalInvocationID_y.data.s, "thread_position_in_threadgroup.y"); - sprintf(sc->gl_LocalInvocationID_z.data.s, "thread_position_in_threadgroup.z"); + sprintf(sc->inputsStruct.name, "inputs"); + sprintf(sc->outputsStruct.name, "outputs"); + sprintf(sc->gl_LocalInvocationID_x.name, "thread_position_in_threadgroup.x"); + sprintf(sc->gl_LocalInvocationID_y.name, "thread_position_in_threadgroup.y"); + sprintf(sc->gl_LocalInvocationID_z.name, "thread_position_in_threadgroup.z"); switch (sc->swapComputeWorkGroupID) { case 0: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "thread_position_in_grid.x"); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "thread_position_in_grid.y"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "thread_position_in_grid.z"); - sprintf(sc->gl_WorkGroupID_x.data.s, "threadgroup_position_in_grid.x"); - sprintf(sc->gl_WorkGroupID_y.data.s, "threadgroup_position_in_grid.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "threadgroup_position_in_grid.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "thread_position_in_grid.x"); + sprintf(sc->gl_GlobalInvocationID_y.name, "thread_position_in_grid.y"); + sprintf(sc->gl_GlobalInvocationID_z.name, "thread_position_in_grid.z"); + sprintf(sc->gl_WorkGroupID_x.name, "threadgroup_position_in_grid.x"); + sprintf(sc->gl_WorkGroupID_y.name, "threadgroup_position_in_grid.y"); + sprintf(sc->gl_WorkGroupID_z.name, "threadgroup_position_in_grid.z"); break; case 1: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.y * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "(thread_position_in_threadgroup.y + threadgroup_position_in_grid.x * %" PRIi64 ")", sc->localSize[1].data.i); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "thread_position_in_threadgroup.z"); - sprintf(sc->gl_WorkGroupID_x.data.s, "threadgroup_position_in_grid.y"); - sprintf(sc->gl_WorkGroupID_y.data.s, "threadgroup_position_in_grid.x"); - sprintf(sc->gl_WorkGroupID_z.data.s, "threadgroup_position_in_grid.z"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.y * %" PRIi64 ")", sc->localSize[0].data.i); + sprintf(sc->gl_GlobalInvocationID_y.name, "(thread_position_in_threadgroup.y + threadgroup_position_in_grid.x * %" PRIi64 ")", sc->localSize[1].data.i); + sprintf(sc->gl_GlobalInvocationID_z.name, "thread_position_in_threadgroup.z"); + sprintf(sc->gl_WorkGroupID_x.name, "threadgroup_position_in_grid.y"); + sprintf(sc->gl_WorkGroupID_y.name, "threadgroup_position_in_grid.x"); + sprintf(sc->gl_WorkGroupID_z.name, "threadgroup_position_in_grid.z"); break; case 2: - sprintf(sc->gl_GlobalInvocationID_x.data.s, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.z * %" PRIi64 ")", sc->localSize[0].data.i); - sprintf(sc->gl_GlobalInvocationID_y.data.s, "thread_position_in_threadgroup.y"); - sprintf(sc->gl_GlobalInvocationID_z.data.s, "(thread_position_in_threadgroup.z + threadgroup_position_in_grid.x * %" PRIi64 ")", sc->localSize[2].data.i); - sprintf(sc->gl_WorkGroupID_x.data.s, "threadgroup_position_in_grid.z"); - sprintf(sc->gl_WorkGroupID_y.data.s, "threadgroup_position_in_grid.y"); - sprintf(sc->gl_WorkGroupID_z.data.s, "threadgroup_position_in_grid.x"); + sprintf(sc->gl_GlobalInvocationID_x.name, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.z * %" PRIi64 ")", sc->localSize[0].data.i); + 
sprintf(sc->gl_GlobalInvocationID_y.name, "thread_position_in_threadgroup.y"); + sprintf(sc->gl_GlobalInvocationID_z.name, "(thread_position_in_threadgroup.z + threadgroup_position_in_grid.x * %" PRIi64 ")", sc->localSize[2].data.i); + sprintf(sc->gl_WorkGroupID_x.name, "threadgroup_position_in_grid.z"); + sprintf(sc->gl_WorkGroupID_y.name, "threadgroup_position_in_grid.y"); + sprintf(sc->gl_WorkGroupID_z.name, "threadgroup_position_in_grid.x"); break; } - sprintf(sc->gl_WorkGroupSize_x.data.s, "%" PRIi64 "", sc->localSize[0].data.i); - sprintf(sc->gl_WorkGroupSize_y.data.s, "%" PRIi64 "", sc->localSize[1].data.i); - sprintf(sc->gl_WorkGroupSize_z.data.s, "%" PRIi64 "", sc->localSize[2].data.i); - //sprintf(sc->cosDef.data.s, "native_cos"); - //sprintf(sc->sinDef.data.s, "native_sin"); - sprintf(sc->constDef.data.s, "constant"); + sprintf(sc->gl_WorkGroupSize_x.name, "%" PRIi64 "", sc->localSize[0].data.i); + sprintf(sc->gl_WorkGroupSize_y.name, "%" PRIi64 "", sc->localSize[1].data.i); + sprintf(sc->gl_WorkGroupSize_z.name, "%" PRIi64 "", sc->localSize[2].data.i); + //sprintf(sc->cosDef.name, "native_cos"); + //sprintf(sc->sinDef.name, "native_sin"); + sprintf(sc->constDef.name, "constant"); #endif return res; } @@ -507,9 +556,11 @@ static inline VkFFTResult freeMemoryParametersAPI(VkFFTApplication* app, VkFFTSp PfDeallocateContainer(sc, &sc->halfDef); PfDeallocateContainer(sc, &sc->floatDef); PfDeallocateContainer(sc, &sc->doubleDef); + PfDeallocateContainer(sc, &sc->quadDef); PfDeallocateContainer(sc, &sc->half2Def); PfDeallocateContainer(sc, &sc->float2Def); PfDeallocateContainer(sc, &sc->double2Def); + PfDeallocateContainer(sc, &sc->quad2Def); PfDeallocateContainer(sc, &sc->intDef); PfDeallocateContainer(sc, &sc->uintDef); PfDeallocateContainer(sc, &sc->int64Def); diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h index 3669bee3..b60076cc 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h @@ -25,13 +25,13 @@ #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" #if(VKFFT_BACKEND==0) -static inline VkFFTResult findMemoryType(VkFFTApplication* app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex) { +static inline VkFFTResult findMemoryType(VkFFTApplication* app, pfUINT memoryTypeBits, pfUINT memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex) { VkPhysicalDeviceMemoryProperties memoryProperties = { 0 }; vkGetPhysicalDeviceMemoryProperties(app->configuration.physicalDevice[0], &memoryProperties); - for (uint64_t i = 0; i < memoryProperties.memoryTypeCount; ++i) { - if ((memoryTypeBits & ((uint64_t)1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties) && (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size >= memorySize)) + for (pfUINT i = 0; i < memoryProperties.memoryTypeCount; ++i) { + if ((memoryTypeBits & ((pfUINT)1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties) && (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size >= memorySize)) { memoryTypeIndex[0] = (uint32_t)i; return VKFFT_SUCCESS; @@ -65,22 +65,29 @@ static inline VkFFTResult allocateBufferVulkan(VkFFTApplication* app, VkBuffer* } #endif -static inline VkFFTResult VkFFT_TransferDataFromCPU(VkFFTApplication* 
app, void* cpu_arr, void* input_buffer, uint64_t transferSize) { +static inline VkFFTResult VkFFT_TransferDataFromCPU(VkFFTApplication* app, void* cpu_arr, void* input_buffer, pfUINT transferSize) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkBuffer* buffer = (VkBuffer*)input_buffer; VkDeviceSize bufferSize = transferSize; VkResult res = VK_SUCCESS; VkDeviceSize stagingBufferSize = bufferSize; - VkBuffer stagingBuffer = { 0 }; - VkDeviceMemory stagingBufferMemory = { 0 }; - resFFT = allocateBufferVulkan(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; + VkBuffer* stagingBuffer = VKFFT_ZERO_INIT; + VkDeviceMemory* stagingBufferMemory = VKFFT_ZERO_INIT; + if (!app->configuration.stagingBuffer){ + stagingBuffer = (VkBuffer*)calloc(1, sizeof(VkBuffer)); + stagingBufferMemory = (VkDeviceMemory*)calloc(1, sizeof(VkDeviceMemory)); + resFFT = allocateBufferVulkan(app, stagingBuffer, stagingBufferMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + }else{ + stagingBuffer = app->configuration.stagingBuffer; + stagingBufferMemory = app->configuration.stagingBufferMemory; + } void* data; - res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data); + res = vkMapMemory(app->configuration.device[0], stagingBufferMemory[0], 0, stagingBufferSize, 0, &data); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_MAP_MEMORY; memcpy(data, cpu_arr, stagingBufferSize); - vkUnmapMemory(app->configuration.device[0], stagingBufferMemory); + vkUnmapMemory(app->configuration.device[0], stagingBufferMemory[0]); VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; @@ -96,7 +103,7 @@ static inline VkFFTResult VkFFT_TransferDataFromCPU(VkFFTApplication* app, void* copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; - vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, &copyRegion); + vkCmdCopyBuffer(commandBuffer, stagingBuffer[0], buffer[0], 1, &copyRegion); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; @@ -109,9 +116,12 @@ static inline VkFFTResult VkFFT_TransferDataFromCPU(VkFFTApplication* app, void* res = vkResetFences(app->configuration.device[0], 1, app->configuration.fence); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer); - vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0); - vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0); - return resFFT; + if (!app->configuration.stagingBuffer){ + vkDestroyBuffer(app->configuration.device[0], stagingBuffer[0], 0); + vkFreeMemory(app->configuration.device[0], stagingBufferMemory[0], 0); + free(stagingBuffer); + free(stagingBufferMemory); + } #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; void* buffer = ((void**)input_buffer)[0]; @@ 
-179,17 +189,24 @@ static inline VkFFTResult VkFFT_TransferDataFromCPU(VkFFTApplication* app, void* #endif return resFFT; } -static inline VkFFTResult VkFFT_TransferDataToCPU(VkFFTApplication* app, void* cpu_arr, void* output_buffer, uint64_t transferSize) { +static inline VkFFTResult VkFFT_TransferDataToCPU(VkFFTApplication* app, void* cpu_arr, void* output_buffer, pfUINT transferSize) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkBuffer* buffer = (VkBuffer*)output_buffer; VkDeviceSize bufferSize = transferSize; VkResult res = VK_SUCCESS; - uint64_t stagingBufferSize = bufferSize; - VkBuffer stagingBuffer = { 0 }; - VkDeviceMemory stagingBufferMemory = { 0 }; - resFFT = allocateBufferVulkan(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; + pfUINT stagingBufferSize = bufferSize; + VkBuffer* stagingBuffer = VKFFT_ZERO_INIT; + VkDeviceMemory* stagingBufferMemory = VKFFT_ZERO_INIT; + if (!app->configuration.stagingBuffer){ + stagingBuffer = (VkBuffer*)calloc(1, sizeof(VkBuffer)); + stagingBufferMemory = (VkDeviceMemory*)calloc(1, sizeof(VkDeviceMemory)); + resFFT = allocateBufferVulkan(app, stagingBuffer, stagingBufferMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + }else{ + stagingBuffer = app->configuration.stagingBuffer; + stagingBufferMemory = app->configuration.stagingBufferMemory; + } VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; @@ -205,7 +222,7 @@ static inline VkFFTResult VkFFT_TransferDataToCPU(VkFFTApplication* app, void* c copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; - vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer, 1, &copyRegion); + vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer[0], 1, &copyRegion); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; @@ -219,12 +236,16 @@ static inline VkFFTResult VkFFT_TransferDataToCPU(VkFFTApplication* app, void* c if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer); void* data; - res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data); + res = vkMapMemory(app->configuration.device[0], stagingBufferMemory[0], 0, stagingBufferSize, 0, &data); if (resFFT != VKFFT_SUCCESS) return resFFT; memcpy(cpu_arr, data, stagingBufferSize); - vkUnmapMemory(app->configuration.device[0], stagingBufferMemory); - vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0); - vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0); + vkUnmapMemory(app->configuration.device[0], stagingBufferMemory[0]); + if (!app->configuration.stagingBuffer){ + vkDestroyBuffer(app->configuration.device[0], stagingBuffer[0], 0); + vkFreeMemory(app->configuration.device[0], stagingBufferMemory[0], 0); + free(stagingBuffer); + 
free(stagingBufferMemory); + } #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; void* buffer = ((void**)output_buffer)[0]; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h index 3cea0503..26e00aac 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_UpdateBuffers.h @@ -24,55 +24,55 @@ #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" -static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { - uint64_t initPageSize = -1; - uint64_t locBufferNum = 1; - uint64_t locBufferSize = -1; +static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) { + pfUINT initPageSize = -1; + pfUINT locBufferNum = 1; + pfUINT locBufferSize = -1; if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.inputBufferNum; if (app->configuration.inputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { + locBufferSize = app->configuration.inputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { totalSize += app->configuration.inputBufferSize[i]; if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) { - locBufferNum = app->configuration.bufferNum; + if ((((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload) && (axis->specializationConstants.reorderFourStep == 1) && (inverse == 1)))) { + locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; @@ -82,8 +82,8 @@ static inline VkFFTResult 
VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT else { locBufferNum = app->configuration.tempBufferNum; if (app->configuration.tempBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { + locBufferSize = app->configuration.tempBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.tempBufferNum; i++) { totalSize += app->configuration.tempBufferSize[i]; if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; @@ -94,8 +94,8 @@ static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT else { locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; @@ -103,8 +103,8 @@ static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } @@ -125,76 +125,104 @@ static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT (inverse) || (axis_id == app->lastAxis))) ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { - locBufferNum = app->configuration.tempBufferNum; - if (app->configuration.tempBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { - totalSize += app->configuration.tempBufferSize[i]; - if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; - } - } - } - else { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - } + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; + if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { + if ((inverse) && (axis_id == app->firstAxis) && ( + ((axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer) && (!app->useBluesteinFFT[axis_id])) + || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (axis->specializationConstants.actualInverse) && (app->configuration.inverseReturnToInputBuffer) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)))) + ) { + locBufferNum = app->configuration.inputBufferNum; + if (app->configuration.inputBufferSize) { + locBufferSize = app->configuration.inputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { + totalSize += app->configuration.inputBufferSize[i]; + if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; + } + } + } + else{ + if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { + + locBufferNum = app->configuration.tempBufferNum; + if (app->configuration.tempBufferSize) { + locBufferSize = app->configuration.tempBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.tempBufferNum; i++) { + totalSize += app->configuration.tempBufferSize[i]; + if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; + 
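/* Illustrative sketch (not part of the patch): the block-size bookkeeping this
   diff rewrites throughout VkFFTConfigureDescriptors*. Before the change the
   block size was stored in complex elements (buffer sizes divided by
   complexSize); after it, sizes stay in bytes and pfceil(total / blockSize)
   gives the block count directly. pfUINT/pfceil are VkFFT's portable aliases;
   plain uint64_t/ceil are used below only so the sketch compiles on its own.
   bufferSize/bufferNum stand in for whichever buffer array a given branch
   (input, output, temp, buffer, kernel) reads. */
#include <math.h>
#include <stdint.h>
typedef uint64_t pfUINT;
static void computeBlockLayout(const pfUINT* bufferSize, pfUINT bufferNum,
                               pfUINT* blockSize, pfUINT* blockNum) {
    pfUINT totalSize = 0;
    pfUINT locPageSize = (pfUINT)-1;           /* initPageSize sentinel */
    for (pfUINT i = 0; i < bufferNum; i++) {
        totalSize += bufferSize[i];
        if (bufferSize[i] < locPageSize) locPageSize = bufferSize[i];
    }
    /* one buffer: the whole buffer is a single block; otherwise the smallest
       buffer ("page") sets the block size, all values in bytes */
    *blockSize = (bufferNum == 1) ? bufferSize[0] : locPageSize;
    *blockNum  = (bufferNum == 1) ? 1 : (pfUINT)ceil(totalSize / (double)(*blockSize));
    if (*blockNum == 0) *blockNum = 1;
}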
} + } + } + else { + locBufferNum = app->configuration.bufferNum; + if (app->configuration.bufferSize) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { + totalSize += app->configuration.bufferSize[i]; + if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; + } + } + } + } } else { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + if ((inverse) && (axis_id == app->firstAxis) && (axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer)) { + locBufferNum = app->configuration.inputBufferNum; + if (app->configuration.inputBufferSize) { + locBufferSize = app->configuration.inputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { + totalSize += app->configuration.inputBufferSize[i]; + if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; + } + } + } + else { + locBufferNum = app->configuration.bufferNum; + if (app->configuration.bufferSize) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { + totalSize += app->configuration.bufferSize[i]; + if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; + } + } + } + } + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1; if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; if (app->configuration.performConvolution) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.kernelNum; if (app->configuration.kernelSize) { - locBufferSize = (uint64_t)ceil(app->configuration.kernelSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { + locBufferSize = app->configuration.kernelSize[0]; + for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { totalSize += app->configuration.kernelSize[i]; if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; } } - axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize)); //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; } @@ -275,7 +303,7 @@ static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < axis->numBindings; ++i) { + for (pfUINT i = 0; i < axis->numBindings; ++i) { descriptorSetLayoutBindings[i].binding = (uint32_t)i; descriptorSetLayoutBindings[i].descriptorType = descriptorType; descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; @@ -305,10 +333,10 @@ static inline VkFFTResult VkFFTConfigureDescriptors(VkFFTApplication* app, VkFFT #endif return VKFFT_SUCCESS; } -static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { - uint64_t initPageSize = -1; - uint64_t locBufferNum = 1; - uint64_t locBufferSize = 0; +static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) { + pfUINT initPageSize = -1; + pfUINT locBufferNum = 1; + pfUINT locBufferSize = 0; { if (inverse) { @@ -316,52 +344,52 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) 
&& (!app->configuration.inverseReturnToInputBuffer))) ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.inputBufferNum; if (app->configuration.inputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { + locBufferSize = app->configuration.inputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { totalSize += app->configuration.inputBufferSize[i]; if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } @@ -381,35 +409,35 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V (inverse) || (axis_id == app->lastAxis))) ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } @@ -421,35 +449,35 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V { if (inverse) { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } @@ -468,35 +496,35 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V (inverse) || (axis_id == app->lastAxis))) ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { + locBufferSize = app->configuration.outputBufferSize[0]; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)axis->specializationConstants.complexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { + locBufferSize = app->configuration.bufferSize[0]; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : locPageSize; + axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; } @@ -507,16 +535,16 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; if (app->configuration.performConvolution) { //need fixing (not used now) - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; + pfUINT totalSize = 0; + pfUINT locPageSize = initPageSize; if (app->configuration.kernelSize) { - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { + for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { totalSize += app->configuration.kernelSize[i]; if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; } } - axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (double)axis->specializationConstants.complexSize); - axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize)); + axis->specializationConstants.kernelBlockSize = locPageSize; + axis->specializationConstants.kernelBlockNum = (pfUINT)pfceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize)); //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / axis->specializationConstants.complexSize; if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; } @@ -567,7 +595,7 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < axis->numBindings; ++i) { + for (pfUINT i = 0; i < axis->numBindings; ++i) { 
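/* Illustrative sketch (not part of the patch): what the descriptor-set layout
   loop around this hunk fills in on the Vulkan backend. Each of the axis'
   numBindings bindings is a storage buffer (descriptorType is set to
   VK_DESCRIPTOR_TYPE_STORAGE_BUFFER earlier in these functions) holding
   numBuffersBound[i] descriptors. stageFlags and pImmutableSamplers are
   standard VkDescriptorSetLayoutBinding members assumed here, not shown in
   this hunk. */
#include <vulkan/vulkan.h>
#include <stdlib.h>
static VkDescriptorSetLayoutBinding* makeStorageBindings(uint32_t numBindings,
                                                         const uint32_t* numBuffersBound) {
    VkDescriptorSetLayoutBinding* b =
        (VkDescriptorSetLayoutBinding*)calloc(numBindings, sizeof(*b));
    if (!b) return NULL; /* mirrors the VKFFT_ERROR_MALLOC_FAILED path above */
    for (uint32_t i = 0; i < numBindings; ++i) {
        b[i].binding = i;
        b[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        b[i].descriptorCount = numBuffersBound[i];
        b[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; /* assumption: compute-only use */
        b[i].pImmutableSamplers = NULL;
    }
    return b;
}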
descriptorSetLayoutBindings[i].binding = (uint32_t)i; descriptorSetLayoutBindings[i].descriptorType = descriptorType; descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; @@ -597,9 +625,9 @@ static inline VkFFTResult VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(V #endif return VKFFT_SUCCESS; } -static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFTAxis* axis, uint64_t planStage, VkFFTLaunchParams* launchParams) { - uint64_t performBufferSetUpdate = planStage; - uint64_t performOffsetUpdate = planStage; +static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFTAxis* axis, pfUINT planStage, VkFFTLaunchParams* launchParams) { + pfUINT performBufferSetUpdate = planStage; + pfUINT performOffsetUpdate = planStage; if (!planStage) { if (launchParams != 0) { if ((launchParams->buffer != 0) && (app->configuration.buffer != launchParams->buffer)) { @@ -685,11 +713,11 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT if (planStage) axis->specializationConstants.performBufferSetUpdate = 1; else { if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->axes[i][j].specializationConstants.performBufferSetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; } } @@ -698,11 +726,11 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT } } if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->axes[i][j].specializationConstants.performBufferSetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; } } @@ -716,11 +744,11 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT if (planStage) axis->specializationConstants.performOffsetUpdate = 1; else { if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->axes[i][j].specializationConstants.performOffsetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) + for (pfUINT j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) 
app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; } } @@ -729,11 +757,11 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT } } if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) + for (pfUINT i = 0; i < app->configuration.FFTdim; i++) { + for (pfUINT j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->axes[i][j].specializationConstants.performOffsetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) + for (pfUINT j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; } } @@ -745,7 +773,7 @@ static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFT } return VKFFT_SUCCESS; } -static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { +static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) { if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { axis->specializationConstants.inputOffset.type = 31; axis->specializationConstants.outputOffset.type = 31; @@ -753,8 +781,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* #if(VKFFT_BACKEND==0) const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; #endif - for (uint64_t i = 0; i < axis->numBindings; ++i) { - for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { + for (pfUINT i = 0; i < axis->numBindings; ++i) { + for (pfUINT j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { #if(VKFFT_BACKEND==0) VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; #endif @@ -764,14 +792,14 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.inputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.inputBufferNum; @@ -782,8 
+810,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* axis->inputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -793,14 +821,14 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -811,8 +839,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -820,17 +848,17 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } } else { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if ((((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload) && (axis->specializationConstants.reorderFourStep == 1) && (inverse == 1)))) 
{ if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -850,10 +878,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.tempBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.tempBufferNum; @@ -874,10 +902,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -896,8 +924,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); } #endif } @@ -919,13 +947,13 @@ 
static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -936,8 +964,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* axis->outputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -945,8 +973,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } } else { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if ((inverse) && (axis_id == app->firstAxis) && ( @@ -955,10 +983,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* ) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.inputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.inputBufferNum; @@ -979,10 +1007,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && 
(!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.tempBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.tempBufferNum; @@ -1002,10 +1030,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1028,10 +1056,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* if ((inverse) && (axis_id == app->firstAxis) && (axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer)) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.inputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.inputBufferNum; @@ -1051,10 +1079,10 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= 
(uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1074,8 +1102,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); } #endif } @@ -1083,13 +1111,13 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } if ((i == axis->specializationConstants.convolutionBindingID) && (app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.kernelSize) { - for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.kernelNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.kernelNum; @@ -1099,8 +1127,8 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.kernel[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1169,13 +1197,13 @@ static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* } return VKFFT_SUCCESS; } -static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t 
inverse) { +static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse) { if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { #if(VKFFT_BACKEND==0) const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; #endif - for (uint64_t i = 0; i < axis->numBindings; ++i) { - for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { + for (pfUINT i = 0; i < axis->numBindings; ++i) { + for (pfUINT j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { #if(VKFFT_BACKEND==0) VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; #endif @@ -1186,13 +1214,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.inputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.inputBufferNum; @@ -1203,8 +1231,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->inputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1214,13 +1242,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if 
(offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -1231,8 +1259,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1241,13 +1269,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } else { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1258,8 +1286,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->inputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1283,13 +1311,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= 
(uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -1300,8 +1328,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1310,13 +1338,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } else { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1327,8 +1355,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->inputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1341,13 +1369,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA if (inverse) { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && 
(inverse) && (!app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -1358,8 +1386,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->outputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1367,15 +1395,15 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } } else { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (axis->specializationConstants.reorderFourStep == 1) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.tempBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.tempBufferNum; @@ -1386,8 +1414,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->outputBuffer = app->configuration.tempBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + 
descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1397,10 +1425,10 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1411,8 +1439,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1436,13 +1464,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.outputBufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.outputBufferNum; @@ -1453,8 +1481,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->outputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - 
descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1463,13 +1491,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } else { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.bufferNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.bufferNum; @@ -1480,8 +1508,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize); + descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize); + descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { @@ -1492,13 +1520,13 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } if ((i == 2) && (app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; + pfUINT bufferId = 0; + pfUINT offset = j; if (app->configuration.kernelSize) { - for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize))) { + for (pfUINT l = 0; l < app->configuration.kernelNum; ++l) { + if (offset >= (pfUINT)pfceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize))) { bufferId++; - offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * axis->specializationConstants.complexSize)); + offset -= (pfUINT)pfceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize)); } else { l = app->configuration.kernelNum; @@ -1508,8 +1536,8 @@ static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTA } #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = 
app->configuration.kernel[bufferId];
-						descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize);
-						descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * axis->specializationConstants.complexSize);
+						descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize);
+						descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize);
 #endif
 					}
 					if (axis->specializationConstants.performOffsetUpdate) {
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h
index d48c9756..5a80f161 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h
@@ -23,14 +23,14 @@
 #define VKFFT_AXISBLOCKSPLITTER_H
 #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h"
-static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t allowedSharedMemory, uint64_t allowedSharedMemoryPow2) {
-	uint64_t maxBatchCoalesced = app->configuration.coalescedMemory / axis->specializationConstants.complexSize;
+static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT axis_id, pfUINT axis_upload_id, pfUINT allowedSharedMemory, pfUINT allowedSharedMemoryPow2) {
+	pfUINT maxBatchCoalesced = app->configuration.coalescedMemory / axis->specializationConstants.complexSize;
 	axis->groupedBatch = maxBatchCoalesced;
-	uint64_t maxSequenceLengthSharedMemory = allowedSharedMemory / axis->specializationConstants.complexSize;
-	uint64_t maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / axis->specializationConstants.complexSize;
-	uint64_t maxSingleSizeStrided = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / axis->specializationConstants.complexSize;
-	uint64_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / axis->specializationConstants.complexSize;
+	pfUINT maxSequenceLengthSharedMemory = allowedSharedMemory / axis->specializationConstants.complexSize;
+	pfUINT maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / axis->specializationConstants.complexSize;
+	pfUINT maxSingleSizeStrided = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / axis->specializationConstants.complexSize;
+	pfUINT maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / axis->specializationConstants.complexSize;
 	if (((FFTPlan->numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) {
 		axis->groupedBatch = (maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim.data.i > axis->groupedBatch) ?
maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim.data.i : axis->groupedBatch; } @@ -40,25 +40,25 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (app->configuration.groupedBatch[axis_id]) { - uint64_t maxThreadNum = app->configuration.maxThreadsNum; + pfUINT maxThreadNum = app->configuration.maxThreadsNum; axis->specializationConstants.axisSwapped = 0; - uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; + pfUINT r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; if (axis_id == 0) { if (axis_upload_id == 0) { - axis->axisBlock[0] = (((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[0] = (((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 1 : maxBatchCoalesced; - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 
1 : maxBatchCoalesced; + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -81,13 +81,13 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (((axis->specializationConstants.fftDim.data.i % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * 
axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { /*#if (VKFFT_BACKEND==0) if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) { - uint64_t temp = axis->axisBlock[1]; + pfUINT temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; } #else*/ - uint64_t temp = axis->axisBlock[1]; + pfUINT temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; @@ -97,19 +97,19 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* axis->axisBlock[3] = axis->specializationConstants.fftDim.data.i; } else { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? (uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[1] = ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? (pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / 
axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -121,10 +121,10 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; } - uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; + pfUINT scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; if ((scale > 1) && ((axis->specializationConstants.fftDim.data.i * axis->groupedBatch * scale <= maxSequenceLengthSharedMemory))) axis->groupedBatch *= scale; - axis->axisBlock[0] = ((uint64_t)axis->specializationConstants.stageStartSize.data.i > axis->groupedBatch) ? axis->groupedBatch : (uint64_t)axis->specializationConstants.stageStartSize.data.i; + axis->axisBlock[0] = ((pfUINT)axis->specializationConstants.stageStartSize.data.i > axis->groupedBatch) ? axis->groupedBatch : (pfUINT)axis->specializationConstants.stageStartSize.data.i; if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[0] > maxBatchCoalesced)) { axis->axisBlock[0] /= 2; @@ -133,7 +133,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { + for (pfUINT i = 1; i <= axis->axisBlock[0]; i++) { if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) { axis->axisBlock[0] /= i; @@ -150,19 +150,19 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if (axis_id >= 1) { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[1] = ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * 
((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -204,7 +204,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* else { axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; }*/ - //if (axis->groupedBatch * (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0) < app->configuration.warpSize) axis->groupedBatch = app->configuration.warpSize / (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0); + //if (axis->groupedBatch * (pfUINT)pfceil(axis->specializationConstants.fftDim / 8.0) < app->configuration.warpSize) axis->groupedBatch = app->configuration.warpSize / (pfUINT)pfceil(axis->specializationConstants.fftDim / 8.0); //axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; //axis->groupedBatch = 8; //shared memory bank conflict resolve @@ -212,11 +212,11 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (app->configuration.vendorID == 0x10DE) { if (FFTPlan->numAxisUploads[axis_id] == 2) { if ((axis_upload_id > 0) || (axis->specializationConstants.fftDim.data.i <= 512)) { - if ((uint64_t)(axis->specializationConstants.fftDim.data.i * (64 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { + if ((pfUINT)(axis->specializationConstants.fftDim.data.i * (64 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 64 / axis->specializationConstants.complexSize; maxBatchCoalesced = 64 / axis->specializationConstants.complexSize; } - if ((uint64_t)(axis->specializationConstants.fftDim.data.i * (128 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { + if ((pfUINT)(axis->specializationConstants.fftDim.data.i * (128 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 128 / axis->specializationConstants.complexSize; maxBatchCoalesced = 128 / axis->specializationConstants.complexSize; } @@ -224,11 +224,11 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } //#endif if (FFTPlan->numAxisUploads[axis_id] == 3) { - if ((uint64_t)(axis->specializationConstants.fftDim.data.i * (64 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { + if ((pfUINT)(axis->specializationConstants.fftDim.data.i * (64 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 64 / axis->specializationConstants.complexSize; maxBatchCoalesced = 64 / axis->specializationConstants.complexSize; } - if ((uint64_t)(axis->specializationConstants.fftDim.data.i * (128 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { + if ((pfUINT)(axis->specializationConstants.fftDim.data.i * (128 / axis->specializationConstants.complexSize)) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 128 / axis->specializationConstants.complexSize; maxBatchCoalesced = 128 / 
axis->specializationConstants.complexSize; } @@ -236,47 +236,47 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } else { if ((FFTPlan->numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim.data.i * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) { - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); + axis->groupedBatch = (pfUINT)pfceil(axis->groupedBatch / 2.0); } //#endif - if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && ((uint64_t)axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory / (2 * axis->specializationConstants.complexSize))) { - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); + if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && ((pfUINT)axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory / (2 * axis->specializationConstants.complexSize))) { + axis->groupedBatch = (pfUINT)pfceil(axis->groupedBatch / 2.0); } } if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced; axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced; //half bandiwdth technique - if (!((axis_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && ((uint64_t)axis->specializationConstants.fftDim.data.i > maxSingleSizeStrided)) { + if (!((axis_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && ((pfUINT)axis->specializationConstants.fftDim.data.i > maxSingleSizeStrided)) { axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim.data.i; if (axis->groupedBatch == 0) axis->groupedBatch = 1; } if ((app->configuration.halfThreads) && (axis->groupedBatch * axis->specializationConstants.fftDim.data.i * axis->specializationConstants.complexSize >= app->configuration.sharedMemorySize)) - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); + axis->groupedBatch = (pfUINT)pfceil(axis->groupedBatch / 2.0); if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize; if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced); if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (4 * maxBatchCoalesced); - //uint64_t maxThreadNum = (axis_id) ? (maxSingleSizeStrided * app->configuration.coalescedMemory / axis->specializationConstants.complexSize) / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost) : maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost); + //pfUINT maxThreadNum = (axis_id) ? 
(maxSingleSizeStrided * app->configuration.coalescedMemory / axis->specializationConstants.complexSize) / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost) : maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost); //if (maxThreadNum > app->configuration.maxThreadsNum) maxThreadNum = app->configuration.maxThreadsNum; - uint64_t maxThreadNum = app->configuration.maxThreadsNum; + pfUINT maxThreadNum = app->configuration.maxThreadsNum; axis->specializationConstants.axisSwapped = 0; - uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; + pfUINT r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; if (axis_id == 0) { if (axis_upload_id == 0) { - axis->axisBlock[0] = (((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[0] = (((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 1 : maxBatchCoalesced; - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 
1 : maxBatchCoalesced; + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -293,13 +293,13 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* axis->axisBlock[1] = axis->groupedBatch; else { //axis->axisBlock[1] = (axis->axisBlock[0] < app->configuration.warpSize) ? 
app->configuration.warpSize / axis->axisBlock[0] : 1; - uint64_t estimate_batch = (((axis->axisBlock[0] / app->configuration.warpSize) == 1) && ((axis->axisBlock[0] / (double)app->configuration.warpSize) < 1.5)) ? app->configuration.aimThreads / app->configuration.warpSize : app->configuration.aimThreads / axis->axisBlock[0]; + pfUINT estimate_batch = (((axis->axisBlock[0] / app->configuration.warpSize) == 1) && ((axis->axisBlock[0] / (double)app->configuration.warpSize) < 1.5)) ? app->configuration.aimThreads / app->configuration.warpSize : app->configuration.aimThreads / axis->axisBlock[0]; if (estimate_batch == 0) estimate_batch = 1; axis->axisBlock[1] = ((axis->axisBlock[0] < app->configuration.aimThreads) && ((axis->axisBlock[0] < app->configuration.warpSize) || (axis->specializationConstants.useRader))) ? estimate_batch : 1; } - uint64_t currentAxisBlock1 = axis->axisBlock[1]; - for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) { + pfUINT currentAxisBlock1 = axis->axisBlock[1]; + for (pfUINT i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) { if (((FFTPlan->numAxisUploads[0] > 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i) % axis->axisBlock[1]) == 0))) || ((FFTPlan->numAxisUploads[0] == 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][1] / r2cmult) % axis->axisBlock[1]) == 0)))) { if (i * axis->specializationConstants.fftDim.data.i * axis->specializationConstants.complexSize <= allowedSharedMemory) axis->axisBlock[1] = i; i = 2 * currentAxisBlock1; @@ -307,9 +307,9 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if (((axis->specializationConstants.fftDim.data.i % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemoryPow2) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { //we plan to swap - this reduces bank conflicts - axis->axisBlock[1] = (uint64_t)pow(2, (uint64_t)ceil(log2((double)axis->axisBlock[1]))); + axis->axisBlock[1] = (pfUINT)pow(2, (pfUINT)pfceil(log2((double)axis->axisBlock[1]))); } - if ((FFTPlan->numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i); + if ((FFTPlan->numAxisUploads[0] > 1) && ((pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i) < axis->axisBlock[1])) axis->axisBlock[1] = (pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i); if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim.data.i * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) { axis->specializationConstants.mergeSequencesR2C = 0; /*if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!(app->configuration.isInputFormatted))) { @@ -326,7 +326,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* }*/ r2cmult = 1; } - if ((FFTPlan->numAxisUploads[0] == 1) && 
((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult); + if ((FFTPlan->numAxisUploads[0] == 1) && ((pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult) < axis->axisBlock[1])) axis->axisBlock[1] = (pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult); if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[1] > maxBatchCoalesced)) { axis->axisBlock[1] /= 2; @@ -336,7 +336,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (axis->axisBlock[1] > app->configuration.maxComputeWorkGroupSize[1]) axis->axisBlock[1] = app->configuration.maxComputeWorkGroupSize[1]; //if (axis->axisBlock[0] * axis->axisBlock[1] > app->configuration.maxThreadsNum) axis->axisBlock[1] /= 2; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) { + for (pfUINT i = 1; i <= axis->axisBlock[1]; i++) { if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum) { axis->axisBlock[1] /= i; @@ -350,13 +350,13 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if (((axis->specializationConstants.fftDim.data.i % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { /*#if (VKFFT_BACKEND==0) if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) { - uint64_t temp = axis->axisBlock[1]; + pfUINT temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; } #else*/ - uint64_t temp = axis->axisBlock[1]; + pfUINT temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; @@ -366,19 +366,19 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* axis->axisBlock[3] = axis->specializationConstants.fftDim.data.i; } else { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? (uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[1] = ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
(pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -390,10 +390,10 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* if 
(axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; } - uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; + pfUINT scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; if ((scale > 1) && ((axis->specializationConstants.fftDim.data.i * axis->groupedBatch * scale <= maxSequenceLengthSharedMemory))) axis->groupedBatch *= scale; - axis->axisBlock[0] = ((uint64_t)axis->specializationConstants.stageStartSize.data.i > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize.data.i; + axis->axisBlock[0] = ((pfUINT)axis->specializationConstants.stageStartSize.data.i > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize.data.i; if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[0] > maxBatchCoalesced)) { axis->axisBlock[0] /= 2; @@ -402,7 +402,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { + for (pfUINT i = 1; i <= axis->axisBlock[0]; i++) { if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) { axis->axisBlock[0] /= i; @@ -419,19 +419,19 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if (axis_id >= 1) { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? ((uint64_t)ceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; + axis->axisBlock[1] = ((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((pfUINT)pfceil(axis->specializationConstants.fftDim.data.i / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT final_rader_thread_count = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); + pfUINT temp_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); + pfUINT active_rader = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT local_estimate_rader_threadnum = (pfUINT)pfceil((axis->specializationConstants.fftDim.data.i / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); + pfUINT temp_rader_thread_count = ((pfUINT)pfceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } @@ -452,7 +452,7 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } if 
(axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
 	if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
-		for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
+		for (pfUINT i = 1; i <= axis->axisBlock[0]; i++) {
 			if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) {
 				axis->axisBlock[0] /= i;
diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_ManageLUT.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_ManageLUT.h
index e3074a38..83911c24 100644
--- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_ManageLUT.h
+++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_ManageLUT.h
@@ -23,8 +23,9 @@
 #define VKFFT_MANAGELUT_H
 #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h"
 #include "vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h"
+#include "vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h"
-static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t inverse){
+static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT inverse){
 	VkFFTResult resFFT = VKFFT_SUCCESS;
 #if(VKFFT_BACKEND==0)
 	VkResult res = VK_SUCCESS;
@@ -40,9 +41,9 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 #endif
 	//allocate LUT
 	if (app->configuration.useLUT == 1) {
-		uint64_t dimMult = 1;
-		uint64_t maxStageSum = 0;
-		for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
+		pfUINT dimMult = 1;
+		pfUINT maxStageSum = 0;
+		for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) {
 			if (i > 0) {
 				switch (axis->specializationConstants.stageRadix[i]) {
 				case 2:
@@ -61,7 +62,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 5;
 					break;
 				case 7:
-					maxStageSum += dimMult * 6;
+					maxStageSum += dimMult * 6;
 					break;
 				case 8:
 					maxStageSum += dimMult * 3;
@@ -73,13 +74,19 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 9;
 					break;
 				case 11:
-					maxStageSum += dimMult * 10;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 11;
+					else
+						maxStageSum += dimMult * 10;
 					break;
 				case 12:
 					maxStageSum += dimMult * 11;
 					break;
 				case 13:
-					maxStageSum += dimMult * 12;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 13;
+					else
+						maxStageSum += dimMult * 12;
 					break;
 				case 14:
 					maxStageSum += dimMult * 13;
@@ -103,10 +110,10 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 		axis->specializationConstants.maxStageSumLUT = (int)maxStageSum;
 		dimMult = 1;
-		for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) {
+		for (pfUINT k = 0; k < axis->specializationConstants.numRaderPrimes; k++) {
 			if (axis->specializationConstants.raderContainer[k].type == 0) {
 				axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUT = maxStageSum;
-				for (uint64_t i = 0; i < axis->specializationConstants.raderContainer[k].numStages; i++) {
+				for (pfUINT i = 0; i < axis->specializationConstants.raderContainer[k].numStages; i++) {
 					if (i > 0) {
 						switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) {
 						case 2:
@@ -125,7 +132,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 5;
 					break;
 				case 7:
-					maxStageSum += dimMult * 6;
+					maxStageSum += dimMult * 6;
 					break;
 				case 8:
 					maxStageSum += dimMult * 3;
@@ -137,13 +144,19 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 9;
 					break;
 				case 11:
-					maxStageSum += dimMult * 10;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 11;
+					else
+						maxStageSum += dimMult * 10;
 					break;
 				case 12:
 					maxStageSum += dimMult * 11;
 					break;
 				case 13:
-					maxStageSum += dimMult * 12;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 13;
+					else
+						maxStageSum += dimMult * 12;
 					break;
 				case 14:
 					maxStageSum += dimMult * 13;
@@ -170,11 +183,11 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 		}
 		//iFFT LUT
 		dimMult = 1;
-		for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) {
+		for (pfUINT k = 0; k < axis->specializationConstants.numRaderPrimes; k++) {
 			if (axis->specializationConstants.raderContainer[k].type == 0) {
 				axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUTiFFT = maxStageSum;
-				for (int64_t i = axis->specializationConstants.raderContainer[k].numStages - 1; i >= 0; i--) {
-					if (i < (int64_t)axis->specializationConstants.raderContainer[k].numStages - 1) {
+				for (pfINT i = axis->specializationConstants.raderContainer[k].numStages - 1; i >= 0; i--) {
+					if (i < (pfINT)axis->specializationConstants.raderContainer[k].numStages - 1) {
 						switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) {
 						case 2:
 							maxStageSum += dimMult;
@@ -192,7 +205,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 5;
 					break;
 				case 7:
-					maxStageSum += dimMult * 6;
+					maxStageSum += dimMult * 6;
 					break;
 				case 8:
 					maxStageSum += dimMult * 3;
@@ -204,13 +217,19 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 					maxStageSum += dimMult * 9;
 					break;
 				case 11:
-					maxStageSum += dimMult * 10;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 11;
+					else
+						maxStageSum += dimMult * 10;
 					break;
 				case 12:
 					maxStageSum += dimMult * 11;
 					break;
 				case 13:
-					maxStageSum += dimMult * 12;
+					if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory)
+						maxStageSum += dimMult * 13;
+					else
+						maxStageSum += dimMult * 12;
 					break;
 				case 14:
 					maxStageSum += dimMult * 13;
@@ -235,18 +254,446 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF
 			dimMult = 1;
 		}
 	}
+	if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) {
+		pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510");
+		if (axis->specializationConstants.axis_upload_id > 0) {
+			if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) {
+				axis->specializationConstants.startDCT3LUT.type = 31;
+				axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum);
+				if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i +=
axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; + axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 4 * sizeof(double); + } + else { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + axis->specializationConstants.startDCT3LUT.type = 31; + axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); + if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; + axis->specializationConstants.startDCT4LUT.type = 31; + axis->specializationConstants.startDCT4LUT.data.i = (axis->specializationConstants.startDCT3LUT.data.i + (app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2)); + axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2) + app->configuration.size[axis->specializationConstants.axis_id] / 2) * 4 * sizeof(double); + } + else + axis->bufferLUTSize = (maxStageSum) * 4 * sizeof(double); + } + if (app->configuration.useLUT_4step == 1) axis->bufferLUTSize += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i * 4 * sizeof(double); + } + else { + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { + axis->specializationConstants.startDCT3LUT.type = 31; + axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); + axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 4 * sizeof(double); + } + else { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + axis->specializationConstants.startDCT3LUT.type = 31; + axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); + axis->specializationConstants.startDCT4LUT.type = 31; + axis->specializationConstants.startDCT4LUT.data.i = (axis->specializationConstants.startDCT3LUT.data.i + (app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2)); + axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2) + app->configuration.size[axis->specializationConstants.axis_id] / 2) * 4 * sizeof(double); + + } + else + axis->bufferLUTSize = (maxStageSum) * 4 * sizeof(double); + } + } + if (axis->specializationConstants.useRader) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + if (!axis->specializationConstants.inline_rader_kernel) { + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (4 * sizeof(double)); + axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 4 * sizeof(double); + } + } + } + if (axis->bufferLUTSize == 0) axis->bufferLUTSize = 2 * sizeof(double); + double* tempLUT = (double*)malloc(axis->bufferLUTSize); + if (!tempLUT) { + deleteVkFFT(app); + return VKFFT_ERROR_MALLOC_FAILED; + } + pfUINT localStageSize = axis->specializationConstants.stageRadix[0]; + pfUINT localStageSum = 0; + + PfContainer in = 
VKFFT_ZERO_INIT; + PfContainer temp1 = VKFFT_ZERO_INIT; + in.type = 22; + + for (pfUINT i = 1; i < axis->specializationConstants.numStages; i++) { + if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { + for (pfUINT k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum) + 1] = (double)temp1.data.dd[1].data.d; + + in.data.d = pfsin(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + else if (axis->specializationConstants.rader_generator[i] > 0) { + for (pfUINT j = 0; j < localStageSize; j++) { + for (pfINT k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { + in.data.d = pfcos(j * pfFPinit("2.0") * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4* (k + localStageSum)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (k + localStageSum) + 1] = (double)temp1.data.dd[1].data.d; + + in.data.d = pfsin(j * pfFPinit("2.0") * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (k + localStageSum) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (k + localStageSum) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += (axis->specializationConstants.stageRadix[i]); + } + } + else { + for (pfUINT k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * pfFPinit("2.0") * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(j * pfFPinit("2.0") * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + localStageSize *= axis->specializationConstants.stageRadix[i]; + } + + + if (axis->specializationConstants.useRader) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + if (axis->specializationConstants.raderContainer[i].type) { + if (!axis->specializationConstants.inline_rader_kernel) { + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; + } + in.data.d = pfcos(pfFPinit("2.0") * g_pow * double_PI / 
axis->specializationConstants.raderContainer[i].prime); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = (-pfsin(pfFPinit("2.0") * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 3] = (double)temp1.data.dd[1].data.d; + } + } + } + else { + localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; + localStageSum = 0; + for (pfUINT l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { + if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + else { + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * pfFPinit("2.0") * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(j * pfFPinit("2.0") * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; + } + + localStageSize = 
axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; + localStageSum = 0; + for (pfINT l = (pfINT)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { + if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(j * double_PI / localStageSize / pow(2, k)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + else { + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + in.data.d = pfcos(j * pfFPinit("2.0") * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(j * pfFPinit("2.0") * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 3] = (double)temp1.data.dd[1].data.d; + } + localStageSum += localStageSize; + } + } + localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; + } - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + if (!axis->specializationConstants.inline_rader_kernel) { + double* raderFFTkernel = (double*)axis->specializationConstants.raderContainer[i].raderFFTkernel; + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + in.data.d = (((pfLD)raderFFTkernel[4 * j] + (pfLD)raderFFTkernel[4 * j + 1])/ (pfLD)(axis->specializationConstants.raderContainer[i].prime - 1)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + 
axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = (((pfLD)raderFFTkernel[4 * j + 2] + (pfLD)raderFFTkernel[4 * j + 3])/ (pfLD)(axis->specializationConstants.raderContainer[i].prime - 1)); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 3] = (double)temp1.data.dd[1].data.d; + } + } + } + } + } + if ((axis->specializationConstants.axis_upload_id > 0) && (app->configuration.useLUT_4step == 1)) { + for (pfUINT i = 0; i < (pfUINT)axis->specializationConstants.stageStartSize.data.i; i++) { + for (pfUINT j = 0; j < (pfUINT)axis->specializationConstants.fftDim.data.i; j++) { + pfLD angle = pfFPinit("2.0") * double_PI * ((i * j) / (pfLD)(axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i)); + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[maxStageSum * 4 + 4 * (i + j * axis->specializationConstants.stageStartSize.data.i)] = (double)temp1.data.dd[0].data.d; + tempLUT[maxStageSum * 4 + 4 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[maxStageSum * 4 + 4 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[maxStageSum * 4 + 4 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 3] = (double)temp1.data.dd[1].data.d; + } + } + } + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2; j++) { + pfLD angle = (double_PI / pfFPinit("2.0") / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id])) * j; + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 3] = (double)temp1.data.dd[1].data.d; + } + } + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2; j++) { + pfLD angle = (double_PI / pfFPinit("2.0") / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * j; + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * 
axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * axis->specializationConstants.startDCT3LUT.data.i + 4 * j + 3] = (double)temp1.data.dd[1].data.d; + } + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2; j++) { + pfLD angle = (-double_PI / pfFPinit("8.0") / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * (2 * j + 1); + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT4LUT.data.i + 4 * j] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * axis->specializationConstants.startDCT4LUT.data.i + 4 * j + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * axis->specializationConstants.startDCT4LUT.data.i + 4 * j + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * axis->specializationConstants.startDCT4LUT.data.i + 4 * j + 3] = (double)temp1.data.dd[1].data.d; + } + } + PfDeallocateContainer(&axis->specializationConstants, &temp1); + axis->referenceLUT = 0; + if (axis->specializationConstants.reverseBluesteinMultiUpload == 1) { + axis->bufferLUT = FFTPlan->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUT; +#if(VKFFT_BACKEND==0) + axis->bufferLUTDeviceMemory = FFTPlan->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; +#endif + axis->bufferLUTSize = FFTPlan->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUTSize; + axis->referenceLUT = 1; + } + else { + if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { + axis->bufferLUT = app->localFFTPlan_inverse->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUT; +#if(VKFFT_BACKEND==0) + axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; +#endif + axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id].bufferLUTSize; + axis->referenceLUT = 1; + } + else { + pfUINT checkRadixOrder = 1; + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) + if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; + if (checkRadixOrder) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + if (axis->specializationConstants.raderContainer[i].type == 0) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + if (FFTPlan->axes[0][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; + } + } + } + } + if (checkRadixOrder && (axis->specializationConstants.axis_id >= 1) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full.data.i == 
FFTPlan->axes[0][0].specializationConstants.fft_dim_full.data.i) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] == 1) && (axis->specializationConstants.fft_dim_full.data.i < axis->specializationConstants.maxSingleSizeStrided.data.i / axis->specializationConstants.registerBoost)) && (((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[0]))) { + axis->bufferLUT = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUT; +#if(VKFFT_BACKEND==0) + axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; +#endif + axis->bufferLUTSize = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUTSize; + axis->referenceLUT = 1; + } + else { + for (int p = 1; p < axis->specializationConstants.axis_id; p++){ + if(axis->referenceLUT == 0){ + checkRadixOrder = 1; + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) + if (FFTPlan->axes[p][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; + if (checkRadixOrder) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + if (axis->specializationConstants.raderContainer[i].type == 0) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + if (FFTPlan->axes[p][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; + } + } + } + } + if (checkRadixOrder && (axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[p][0].specializationConstants.fft_dim_full.data.i) && (((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[p]))) { + axis->bufferLUT = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUT; +#if(VKFFT_BACKEND==0) + axis->bufferLUTDeviceMemory = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; +#endif + axis->bufferLUTSize = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUTSize; + axis->referenceLUT = 1; + } + } + } + if(axis->referenceLUT == 0){ +#if(VKFFT_BACKEND==0) + resFFT = allocateBufferVulkan(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==1) + res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); + if (res != cudaSuccess) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==2) + res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); + if (res != hipSuccess) { + deleteVkFFT(app); + free(tempLUT); + 
tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==3) + axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); + if (res != CL_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } +#elif(VKFFT_BACKEND==4) + ze_device_mem_alloc_desc_t device_desc = VKFFT_ZERO_INIT; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); + if (res != ZE_RESULT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==5) + axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#endif + } + } + } + } + free(tempLUT); + tempLUT = 0; + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); if (axis->specializationConstants.axis_upload_id > 0) { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 2 * sizeof(double); } else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; @@ -260,13 +707,13 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF if (app->configuration.useLUT_4step == 1) axis->bufferLUTSize += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i * 2 * sizeof(double); } else { - if ((app->configuration.performDCT == 2) || 
(app->configuration.performDCT == 3)) { + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 2 * sizeof(double); } else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); axis->specializationConstants.startDCT4LUT.type = 31; @@ -279,7 +726,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } } if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (!axis->specializationConstants.inline_rader_kernel) { axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(double)); axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(double); @@ -292,32 +739,32 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; - uint64_t localStageSum = 0; - for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { + pfUINT localStageSize = axis->specializationConstants.stageRadix[0]; + pfUINT localStageSum = 0; + for (pfUINT i = 1; i < axis->specializationConstants.numStages; i++) { if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum)] = (double)pfcos(j * double_PI / localStageSize / pow(2, k)); + tempLUT[2 * (j + localStageSum) + 1] = (double)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else if (axis->specializationConstants.rader_generator[i] > 0) { - for (uint64_t j = 0; j < localStageSize; j++) { - for (int64_t k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { - tempLUT[2 * (k + localStageSum)] = (double)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (k + localStageSum) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + for (pfUINT j = 0; j < localStageSize; j++) { + for (pfINT k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { + tempLUT[2 * (k + localStageSum)] = 
(double)pfcos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + tempLUT[2 * (k + localStageSum) + 1] = (double)pfsin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += (axis->specializationConstants.stageRadix[i]); } } else { - for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (double)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum)] = (double)pfcos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum) + 1] = (double)pfsin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -327,37 +774,37 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type) { if (!axis->specializationConstants.inline_rader_kernel) { - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)pfcos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(-pfsin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } } else { localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; localStageSum = 0; - for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { + for (pfUINT l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - 
tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)pfcos(j * double_PI / localStageSize / pow(2, k)); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)pfcos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)pfsin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -367,21 +814,21 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; localStageSum = 0; - for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { + for (pfINT l = (pfINT)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)pfcos(j * double_PI / localStageSize / pow(2, k)); + 
tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)pfcos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)pfsin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -391,40 +838,40 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF if (!axis->specializationConstants.inline_rader_kernel) { double* raderFFTkernel = (double*)axis->specializationConstants.raderContainer[i].raderFFTkernel; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)(raderFFTkernel[2 * j] / (long double)(axis->specializationConstants.raderContainer[i].prime - 1)); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(raderFFTkernel[2 * j + 1] / (long double)(axis->specializationConstants.raderContainer[i].prime - 1)); + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)(raderFFTkernel[2 * j] / (pfLD)(axis->specializationConstants.raderContainer[i].prime - 1)); + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(raderFFTkernel[2 * j + 1] / (pfLD)(axis->specializationConstants.raderContainer[i].prime - 1)); } } } } } if ((axis->specializationConstants.axis_upload_id > 0) && (app->configuration.useLUT_4step == 1)) { - for (uint64_t i = 0; i < (uint64_t)axis->specializationConstants.stageStartSize.data.i; i++) { - for (uint64_t j = 0; j < (uint64_t)axis->specializationConstants.fftDim.data.i; j++) { - long double angle = 2 * double_PI * ((i * j) / (long double)(axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i)); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i)] = (double)cos(angle); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 1] = (double)sin(angle); + for (pfUINT i = 0; i < (pfUINT)axis->specializationConstants.stageStartSize.data.i; i++) { + 
for (pfUINT j = 0; j < (pfUINT)axis->specializationConstants.fftDim.data.i; j++) { + pfLD angle = 2 * double_PI * ((i * j) / (pfLD)(axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i)); + tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i)] = (double)pfcos(angle); + tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 1] = (double)pfsin(angle); } } } - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2; j++) { - long double angle = (double_PI / 2.0 / (long double)(app->configuration.size[axis->specializationConstants.axis_id])) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (double)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (double)sin(angle); + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2; j++) { + pfLD angle = (double_PI / 2.0 / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id])) * j; + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (double)pfcos(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (double)pfsin(angle); } } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2; j++) { - long double angle = (double_PI / 2.0 / (long double)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (double)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (double)sin(angle); + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2; j++) { + pfLD angle = (double_PI / 2.0 / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * j; + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (double)pfcos(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (double)pfsin(angle); } - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2; j++) { - long double angle = (-double_PI / 8.0 / (long double)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * (2 * j + 1); - tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j] = (double)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j + 1] = (double)sin(angle); + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2; j++) { + pfLD angle = (-double_PI / 8.0 / (pfLD)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * (2 * j + 1); + tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j] = (double)pfcos(angle); + tempLUT[2 * 
axis->specializationConstants.startDCT4LUT.data.i + 2 * j + 1] = (double)pfsin(angle); } } axis->referenceLUT = 0; @@ -446,19 +893,19 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF axis->referenceLUT = 1; } else { - uint64_t checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) + pfUINT checkRadixOrder = 1; + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[0][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } - if (checkRadixOrder && (axis->specializationConstants.axis_id >= 1) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[0][0].specializationConstants.fft_dim_full.data.i) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] == 1) && (axis->specializationConstants.fft_dim_full.data.i < axis->specializationConstants.maxSingleSizeStrided.data.i / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[0]))) { + if (checkRadixOrder && (axis->specializationConstants.axis_id >= 1) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[0][0].specializationConstants.fft_dim_full.data.i) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] == 1) && (axis->specializationConstants.fft_dim_full.data.i < axis->specializationConstants.maxSingleSizeStrided.data.i / axis->specializationConstants.registerBoost)) && ((((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[0]))) { axis->bufferLUT = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; @@ -470,18 +917,18 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF for (int p = 1; p < axis->specializationConstants.axis_id; p++){ if(axis->referenceLUT == 0){ checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[p][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < 
axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[p][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } - if (checkRadixOrder && (axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[p][0].specializationConstants.fft_dim_full.data.i) && ((!app->configuration.performDCT) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[p]))) { + if (checkRadixOrder && (axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[p][0].specializationConstants.fft_dim_full.data.i) && (((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[p]))) { axis->bufferLUT = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; @@ -582,14 +1029,14 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF else { double double_PI = 3.14159265358979323846264338327950288419716939937510; if (axis->specializationConstants.axis_upload_id > 0) { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 2 * sizeof(float); } else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT.data.i += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i; @@ -603,13 +1050,13 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF if (app->configuration.useLUT_4step == 1) axis->bufferLUTSize += axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i * 2 * sizeof(float); } else { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { 
axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2)) * 2 * sizeof(float); } else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT.type = 31; axis->specializationConstants.startDCT3LUT.data.i = (maxStageSum); axis->specializationConstants.startDCT4LUT.type = 31; @@ -621,7 +1068,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } } if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (!axis->specializationConstants.inline_rader_kernel) { axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(float)); axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(float); @@ -634,32 +1081,32 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; - uint64_t localStageSum = 0; - for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { + pfUINT localStageSize = axis->specializationConstants.stageRadix[0]; + pfUINT localStageSum = 0; + for (pfUINT i = 1; i < axis->specializationConstants.numStages; i++) { if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum)] = (float)pfcos(j * double_PI / localStageSize / pow(2, k)); + tempLUT[2 * (j + localStageSum) + 1] = (float)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else if (axis->specializationConstants.rader_generator[i] > 0) { - for (uint64_t j = 0; j < localStageSize; j++) { - for (int64_t k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { - tempLUT[2 * (k + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (k + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + for (pfUINT j = 0; j < localStageSize; j++) { + for (pfINT k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { + tempLUT[2 * (k + localStageSum)] = (float)pfcos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + tempLUT[2 * (k + localStageSum) + 1] = (float)pfsin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += 
(axis->specializationConstants.stageRadix[i]); } } else { - for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum)] = (float)pfcos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum) + 1] = (float)pfsin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -668,37 +1115,37 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type) { if (!axis->specializationConstants.inline_rader_kernel) { - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(pfcos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); + tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(-pfsin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } } else { localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; localStageSum = 0; - for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { + for (pfUINT l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 
1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)pfcos(j * double_PI / localStageSize / pow(2, k)); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)pfcos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)pfsin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -707,21 +1154,21 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; localStageSum = 0; - for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { + for (pfINT l = (pfINT)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); + for (pfUINT k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)pfcos(j * double_PI / localStageSize / pow(2, k)); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)pfsin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { - for (uint64_t k = 
(axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + for (pfUINT k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { + for (pfUINT j = 0; j < localStageSize; j++) { + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)pfcos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); + tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)pfsin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } @@ -730,7 +1177,7 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } if (!axis->specializationConstants.inline_rader_kernel) { float* raderFFTkernel = (float*)axis->specializationConstants.raderContainer[i].raderFFTkernel; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(raderFFTkernel[2 * j] / (axis->specializationConstants.raderContainer[i].prime - 1)); tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(raderFFTkernel[2 * j + 1] / (axis->specializationConstants.raderContainer[i].prime - 1)); } @@ -740,31 +1187,31 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF } if ((axis->specializationConstants.axis_upload_id > 0) && (app->configuration.useLUT_4step == 1)) { - for (uint64_t i = 0; i < (uint64_t)axis->specializationConstants.stageStartSize.data.i; i++) { - for (uint64_t j = 0; j < (uint64_t)axis->specializationConstants.fftDim.data.i; j++) { + for (pfUINT i = 0; i < (pfUINT)axis->specializationConstants.stageStartSize.data.i; i++) { + for (pfUINT j = 0; j < (pfUINT)axis->specializationConstants.fftDim.data.i; j++) { double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize.data.i * axis->specializationConstants.fftDim.data.i)); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i)] = (float)cos(angle); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 1] = (float)sin(angle); + tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i)] = (float)pfcos(angle); + tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize.data.i) + 1] = (float)pfsin(angle); } } } - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2; j++) { + if ((axis->specializationConstants.performDCT == 2) || 
(axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2 + 2; j++) { double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis->specializationConstants.axis_id])) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (float)sin(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (float)pfcos(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (float)pfsin(angle); } } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2; j++) { + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis->specializationConstants.axis_id] % 2 == 0)) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 4 + 2; j++) { double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (float)sin(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j] = (float)pfcos(angle); + tempLUT[2 * axis->specializationConstants.startDCT3LUT.data.i + 2 * j + 1] = (float)pfsin(angle); } - for (uint64_t j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2; j++) { + for (pfUINT j = 0; j < app->configuration.size[axis->specializationConstants.axis_id] / 2; j++) { double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis->specializationConstants.axis_id] / 2)) * (2 * j + 1); - tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j + 1] = (float)sin(angle); + tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j] = (float)pfcos(angle); + tempLUT[2 * axis->specializationConstants.startDCT4LUT.data.i + 2 * j + 1] = (float)pfsin(angle); } } axis->referenceLUT = 0; @@ -787,19 +1234,19 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF axis->referenceLUT = 1; } else { - uint64_t checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) + pfUINT checkRadixOrder = 1; + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[0][0].specializationConstants.raderContainer[i].stageRadix[k] != 
axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } - if (checkRadixOrder && (axis->specializationConstants.axis_id >= 1) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[0][0].specializationConstants.fft_dim_full.data.i) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] == 1) && (axis->specializationConstants.fft_dim_full.data.i < axis->specializationConstants.maxSingleSizeStrided.data.i / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[0]))) { + if (checkRadixOrder && (axis->specializationConstants.axis_id >= 1) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[0][0].specializationConstants.fft_dim_full.data.i) && (FFTPlan->numAxisUploads[axis->specializationConstants.axis_id] == 1) && (axis->specializationConstants.fft_dim_full.data.i < axis->specializationConstants.maxSingleSizeStrided.data.i / axis->specializationConstants.registerBoost)) && ((((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[0]))) { axis->bufferLUT = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; @@ -811,18 +1258,18 @@ static inline VkFFTResult VkFFT_AllocateLUT(VkFFTApplication* app, VkFFTPlan* FF for (int p = 1; p < axis->specializationConstants.axis_id; p++){ if(axis->referenceLUT == 0){ checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) + for (pfUINT i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[p][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { + for (pfUINT k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[p][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } - if (checkRadixOrder && (axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[p][0].specializationConstants.fft_dim_full.data.i) && ((!app->configuration.performDCT) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[p]))) { + if (checkRadixOrder && (axis->specializationConstants.fft_dim_full.data.i == FFTPlan->axes[p][0].specializationConstants.fft_dim_full.data.i) && (((!axis->specializationConstants.performDCT) && (!axis->specializationConstants.performDST)) || (app->configuration.size[axis->specializationConstants.axis_id] == app->configuration.size[p]))) { 
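The branch opened here is the LUT-sharing fast path of VkFFT_AllocateLUT: when the candidate axis has the same full FFT length as a previously planned axis, the same stage-radix decomposition (including the radix order inside every multiplicative Rader container), and no DCT/DST size mismatch, its twiddle table would come out identical, so the plan aliases the existing bufferLUT and marks the axis with referenceLUT = 1 so the shared allocation is released only once. A minimal standalone sketch of that decision, using simplified stand-in types and a hypothetical canShareLUT helper rather than VkFFT's real structs:

#include <stdio.h>
#include <string.h>

/* Simplified illustration only - not VkFFT's actual types or API. */
typedef struct {
    int fftLength;        /* total transform length of the axis              */
    int numStages;
    int stageRadix[16];   /* radix decomposition, e.g. {16, 16}              */
    const void *lut;      /* aliases the owning axis' LUT when shared        */
    int referenceLUT;     /* 1 = borrowed buffer, do not free it twice       */
} AxisSketch;

static int canShareLUT(const AxisSketch *a, const AxisSketch *ref) {
    if (a->fftLength != ref->fftLength) return 0;
    if (a->numStages != ref->numStages) return 0;
    return memcmp(a->stageRadix, ref->stageRadix,
                  (size_t)a->numStages * sizeof(int)) == 0;
}

int main(void) {
    AxisSketch axis0 = { 256, 2, {16, 16}, "lut-of-axis0", 0 };
    AxisSketch axis1 = { 256, 2, {16, 16}, NULL, 0 };
    if (canShareLUT(&axis1, &axis0)) {
        axis1.lut = axis0.lut;   /* alias, like bufferLUT in the hunk above */
        axis1.referenceLUT = 1;  /* mark as borrowed                        */
    }
    printf("axis1 shares LUT: %d\n", axis1.referenceLUT);
    return 0;
}

The real check in the patch is stricter (it also compares fft_dim_full against the reference upload and skips sharing for DCT/DST plans whose sizes differ), but the aliasing idea is the same.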
axis->bufferLUT = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[p][axis->specializationConstants.axis_upload_id].bufferLUTDeviceMemory; @@ -942,7 +1389,7 @@ static inline VkFFTResult VkFFT_AllocateRaderUintLUT(VkFFTApplication* app, VkFF if (axis->specializationConstants.raderUintLUT) { if (app->bufferRaderUintLUT[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id] == 0) { app->bufferRaderUintLUTSize[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id] = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { app->bufferRaderUintLUTSize[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id] += axis->specializationConstants.raderContainer[i].prime * sizeof(uint32_t); } uint32_t* tempRaderUintLUT = (uint32_t*)malloc(app->bufferRaderUintLUTSize[axis->specializationConstants.axis_id][axis->specializationConstants.axis_upload_id]); @@ -950,14 +1397,14 @@ static inline VkFFTResult VkFFT_AllocateRaderUintLUT(VkFFTApplication* app, VkFF deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - uint64_t current_offset = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT current_offset = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].prime > 0) { axis->specializationConstants.raderContainer[i].raderUintLUToffset = (int)current_offset; - uint64_t g_pow = 1; + pfUINT g_pow = 1; tempRaderUintLUT[current_offset] = 1; current_offset++; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1; t++) { + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; tempRaderUintLUT[current_offset] = (uint32_t)g_pow; current_offset++; @@ -1049,11 +1496,11 @@ static inline VkFFTResult VkFFT_AllocateRaderUintLUT(VkFFTApplication* app, VkFF tempRaderUintLUT = 0; } else { - uint64_t current_offset = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + pfUINT current_offset = 0; + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].prime > 0) { axis->specializationConstants.raderContainer[i].raderUintLUToffset = (int)current_offset; - uint64_t g_pow = 1; + pfUINT g_pow = 1; current_offset += axis->specializationConstants.raderContainer[i].prime; } } @@ -1068,7 +1515,7 @@ static inline VkFFTResult VkFFT_AllocateRaderUintLUT(VkFFTApplication* app, VkFF return resFFT; } -static inline VkFFTResult VkFFT_AllocateLUT_R2C(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t inverse) { +static inline VkFFTResult VkFFT_AllocateLUT_R2C(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, pfUINT inverse) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; @@ -1083,18 +1530,138 @@ static inline VkFFTResult VkFFT_AllocateLUT_R2C(VkFFTApplication* app, VkFFTPlan #elif(VKFFT_BACKEND==5) #endif if (app->configuration.useLUT == 1) { - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 
3.14159265358979323846264338327950288419716939937510L; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + PfContainer in = VKFFT_ZERO_INIT; + PfContainer temp1 = VKFFT_ZERO_INIT; + in.type = 22; + + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + axis->bufferLUTSize = (app->configuration.size[0] / 2) * 4 * sizeof(double); + double* tempLUT = (double*)malloc(axis->bufferLUTSize); + if (!tempLUT) { + deleteVkFFT(app); + return VKFFT_ERROR_MALLOC_FAILED; + } + for (pfUINT i = 0; i < app->configuration.size[0] / 2; i++) { + pfLD angle = double_PI * i / (app->configuration.size[0] / 2); + + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * i] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * i + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + tempLUT[4 * i + 2] = (double)temp1.data.dd[0].data.d; + tempLUT[4 * i + 3] = (double)temp1.data.dd[1].data.d; + } + axis->referenceLUT = 0; + PfDeallocateContainer(&axis->specializationConstants, &temp1); + if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { + axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT; +#if(VKFFT_BACKEND==0) + axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory; +#endif + axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize; + axis->referenceLUT = 1; + } + else { +#if(VKFFT_BACKEND==0) + resFFT = allocateBufferVulkan(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==1) + res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); + if (res != cudaSuccess) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==2) + res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); + if (res != hipSuccess) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==3) + axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); + if (res != CL_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } +#elif(VKFFT_BACKEND==4) + ze_device_mem_alloc_desc_t device_desc = VKFFT_ZERO_INIT; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), 
app->configuration.device[0], &axis->bufferLUT); + if (res != ZE_RESULT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return VKFFT_ERROR_FAILED_TO_ALLOCATE; + } + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#elif(VKFFT_BACKEND==5) + axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); + + resFFT = VkFFT_TransferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); + if (resFFT != VKFFT_SUCCESS) { + deleteVkFFT(app); + free(tempLUT); + tempLUT = 0; + return resFFT; + } +#endif + free(tempLUT); + tempLUT = 0; + } + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(double); double* tempLUT = (double*)malloc(axis->bufferLUTSize); if (!tempLUT) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { - long double angle = double_PI * i / (app->configuration.size[0] / 2); - tempLUT[2 * i] = (double)cos(angle); - tempLUT[2 * i + 1] = (double)sin(angle); + for (pfUINT i = 0; i < app->configuration.size[0] / 2; i++) { + pfLD angle = double_PI * i / (app->configuration.size[0] / 2); + tempLUT[2 * i] = (double)pfcos(angle); + tempLUT[2 * i + 1] = (double)pfsin(angle); } axis->referenceLUT = 0; if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { @@ -1199,10 +1766,10 @@ static inline VkFFTResult VkFFT_AllocateLUT_R2C(VkFFTApplication* app, VkFFTPlan deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } - for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { + for (pfUINT i = 0; i < app->configuration.size[0] / 2; i++) { double angle = double_PI * i / (app->configuration.size[0] / 2); - tempLUT[2 * i] = (float)cos(angle); - tempLUT[2 * i + 1] = (float)sin(angle); + tempLUT[2 * i] = (float)pfcos(angle); + tempLUT[2 * i + 1] = (float)pfsin(angle); } axis->referenceLUT = 0; if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_RecursiveFFTGenerators.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_RecursiveFFTGenerators.h index acb681db..c524e0db 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_RecursiveFFTGenerators.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_RecursiveFFTGenerators.h @@ -25,16 +25,19 @@ #include "vkFFT/vkFFT_PlanManagement/vkFFT_API_handles/vkFFT_ManageMemory.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h" +#include "vkFFT/vkFFT_CodeGen/vkFFT_MathUtils/vkFFT_MathUtils.h" #ifdef VkFFT_use_FP128_Bluestein_RaderFFT #include "fftw3.h" #endif + static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration); -static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id) { +static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFTPlan* FFTPlan, pfUINT axis_id) { //generate two arrays used for Blueestein convolution and post-convolution multiplication VkFFTResult resFFT = VKFFT_SUCCESS; - uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; + pfUINT bufferSize 
= (pfUINT)sizeof(float) * 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) bufferSize *= sizeof(double) / sizeof(float); + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) bufferSize *= 4; app->bufferBluesteinSize[axis_id] = bufferSize; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; @@ -118,27 +121,28 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT if (!phaseVectors_fp64) { return VKFFT_ERROR_MALLOC_FAILED; } - long double* phaseVectors_fp128 = (long double*)malloc(2 * bufferSize); + pfLD* phaseVectors_fp128 = (pfLD*)malloc(2 * bufferSize); if (!phaseVectors_fp128) { free(phaseVectors_fp64); return VKFFT_ERROR_MALLOC_FAILED; } - long double* phaseVectors_fp128_out = (long double*)malloc(2 * bufferSize); + pfLD* phaseVectors_fp128_out = (pfLD*)malloc(2 * bufferSize); if (!phaseVectors_fp128) { free(phaseVectors_fp64); free(phaseVectors_fp128); return VKFFT_ERROR_MALLOC_FAILED; } - uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; + pfUINT phaseVectorsNonZeroSize = ((((app->configuration.performDCT == 4) || (app->configuration.performDST == 4)) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2; - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - long double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_fp128[2 * i] = (i < phaseVectorsNonZeroSize) ? cos(angle) : 0; - phaseVectors_fp128[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? -sin(angle) : 0; + if (app->configuration.performDST == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] + 2; + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); + pfLD angle = double_PI * rm / phaseVectorsNonZeroSize; + phaseVectors_fp128[2 * i] = (i < phaseVectorsNonZeroSize) ? pfcos(angle) : 0; + phaseVectors_fp128[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
-pfsin(angle) : 0; } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_fp128[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_fp128[2 * i]; phaseVectors_fp128[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_fp128[2 * i + 1]; } @@ -147,8 +151,8 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT p = fftwl_plan_dft_1d((int)(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]), (fftwl_complex*)phaseVectors_fp128, (fftwl_complex*)phaseVectors_fp128_out, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t out = 0; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT out = 0; if (FFTPlan->numAxisUploads[axis_id] == 1) { out = i; } @@ -169,10 +173,10 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT return resFFT; } } - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp128[2 * i + 1] = -phaseVectors_fp128[2 * i + 1]; } - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp64[2 * i] = (double)phaseVectors_fp128[2 * i]; phaseVectors_fp64[2 * i + 1] = (double)phaseVectors_fp128[2 * i + 1]; } @@ -188,8 +192,8 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT p = fftwl_plan_dft_1d((int)(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]), (fftwl_complex*)phaseVectors_fp128, (fftwl_complex*)phaseVectors_fp128_out, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t out = 0; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT out = 0; if (FFTPlan->numAxisUploads[axis_id] == 1) { out = i; } @@ -216,7 +220,7 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT fftwl_execute(p); fftwl_destroy_plan(p); - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp64[2 * i] = (double)phaseVectors_fp128_out[2 * i]; phaseVectors_fp64[2 * i + 1] = (double)phaseVectors_fp128_out[2 * i + 1]; } @@ -241,6 +245,7 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT kernelPreparationConfiguration.size[1] = 1; kernelPreparationConfiguration.size[2] = 1; kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); + kernelPreparationConfiguration.quadDoubleDoublePrecision = (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory); kernelPreparationConfiguration.useLUT = 1; kernelPreparationConfiguration.useLUT_4step = 1; kernelPreparationConfiguration.registerBoost = 1; @@ -271,6 +276,8 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can be initialized before VkFFT plan creation. 
if not, VkFFT will create and destroy one after initialization kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; + if (app->configuration.stagingBuffer != 0) kernelPreparationConfiguration.stagingBuffer = app->configuration.stagingBuffer; + if (app->configuration.stagingBufferMemory != 0) kernelPreparationConfiguration.stagingBufferMemory = app->configuration.stagingBufferMemory; #elif(VKFFT_BACKEND==3) kernelPreparationConfiguration.context = app->configuration.context; #elif(VKFFT_BACKEND==4) @@ -295,20 +302,46 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_MALLOC_FAILED; } - uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; + pfUINT phaseVectorsNonZeroSize = ((((app->configuration.performDCT == 4) || (app->configuration.performDST == 4)) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2; - + if (app->configuration.performDST == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] + 2; if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + PfContainer in = VKFFT_ZERO_INIT; + PfContainer temp1 = VKFFT_ZERO_INIT; + in.type = 22; + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + double* phaseVectors_cast = (double*)phaseVectors; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); + pfLD angle = double_PI * rm / phaseVectorsNonZeroSize; + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1, &in); + phaseVectors_cast[4 * i] = (i < phaseVectorsNonZeroSize) ? (double)temp1.data.dd[0].data.d : 0; + phaseVectors_cast[4 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)temp1.data.dd[1].data.d : 0; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1, &in); + phaseVectors_cast[4 * i + 2] = (i < phaseVectorsNonZeroSize) ? (double)-temp1.data.dd[0].data.d : 0; + phaseVectors_cast[4 * i + 3] = (i < phaseVectorsNonZeroSize) ? 
(double)-temp1.data.dd[1].data.d : 0; + } + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[4 * i]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[4 * i + 1]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 2] = phaseVectors_cast[4 * i + 2]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 3] = phaseVectors_cast[4 * i + 3]; + } + PfDeallocateContainer(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1); + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); double* phaseVectors_cast = (double*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - long double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)-sin(angle) : 0; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); + pfLD angle = double_PI * rm / phaseVectorsNonZeroSize; + phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)pfcos(angle) : 0; + phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)-pfsin(angle) : 0; } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } @@ -316,13 +349,13 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT else { double double_PI = 3.14159265358979323846264338327950288419716939937510; float* phaseVectors_cast = (float*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (float)-sin(angle) : 0; + phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)pfcos(angle) : 0; + phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)-pfsin(angle) : 0; } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } @@ -508,31 +541,64 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT #endif } if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { double* phaseVectors_cast = (double*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + phaseVectors_cast[4 * i + 2] = -phaseVectors_cast[4 * i + 2]; + phaseVectors_cast[4 * i + 3] = -phaseVectors_cast[4 * i + 3]; + } + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + double* phaseVectors_cast = (double*)phaseVectors; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_cast[2 * i + 1] = -phaseVectors_cast[2 * i + 1]; } } else { float* phaseVectors_cast = (float*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_cast[2 * i + 1] = -phaseVectors_cast[2 * i + 1]; } } } else { - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + PfContainer in = VKFFT_ZERO_INIT; + PfContainer temp1 = VKFFT_ZERO_INIT; + in.type = 22; + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); double* phaseVectors_cast = (double*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - long double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)sin(angle) : 0; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); + pfLD angle = double_PI * rm / phaseVectorsNonZeroSize; + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1, &in); + phaseVectors_cast[4 * i] = (i < phaseVectorsNonZeroSize) ? (double)temp1.data.dd[0].data.d : 0; + phaseVectors_cast[4 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)temp1.data.dd[1].data.d : 0; + in.data.d = pfsin(angle); + PfConvToDoubleDouble(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1, &in); + phaseVectors_cast[4 * i + 2] = (i < phaseVectorsNonZeroSize) ? (double)temp1.data.dd[0].data.d : 0; + phaseVectors_cast[4 * i + 3] = (i < phaseVectorsNonZeroSize) ? 
(double)temp1.data.dd[1].data.d : 0; } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[4 * i]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[4 * i + 1]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 2] = phaseVectors_cast[4 * i + 2]; + phaseVectors_cast[4 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 3] = phaseVectors_cast[4 * i + 3]; + } + PfDeallocateContainer(&FFTPlan->axes[axis_id][0].specializationConstants, &temp1); + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + double* phaseVectors_cast = (double*)phaseVectors; + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); + pfLD angle = double_PI * rm / phaseVectorsNonZeroSize; + phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)pfcos(angle) : 0; + phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)pfsin(angle) : 0; + } + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } @@ -540,13 +606,13 @@ static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFT else { double double_PI = 3.14159265358979323846264338327950288419716939937510; float* phaseVectors_cast = (float*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); + for (pfUINT i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { + pfUINT rm = (i * i) % (2 * phaseVectorsNonZeroSize); double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (float)sin(angle) : 0; + phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)pfcos(angle) : 0; + phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)pfsin(angle) : 0; } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { + for (pfUINT i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } @@ -928,16 +994,16 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF //generate Rader FFTKernel VkFFTResult resFFT = VKFFT_SUCCESS; if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { + for (pfUINT i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 0; j < app->numRaderFFTPrimes; j++) { + for (pfUINT j = 0; j < app->numRaderFFTPrimes; j++) { if (app->rader_primes[j] == axis->specializationConstants.raderContainer[i].prime) { axis->specializationConstants.raderContainer[i].raderFFTkernel = app->raderFFTkernel[j]; } } if (axis->specializationConstants.raderContainer[i].raderFFTkernel) continue; - uint64_t write_id = app->numRaderFFTPrimes; + pfUINT write_id = app->numRaderFFTPrimes; app->rader_primes[write_id] = axis->specializationConstants.raderContainer[i].prime; app->numRaderFFTPrimes++; @@ -945,28 +1011,28 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF #ifdef VkFFT_use_FP128_Bluestein_RaderFFT if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); double* raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2); if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; - long double* raderFFTkernel_temp = (long double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(long double) * 2); + pfLD* raderFFTkernel_temp = (pfLD*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(pfLD) * 2); if (!raderFFTkernel_temp) return VKFFT_ERROR_MALLOC_FAILED; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } - raderFFTkernel_temp[2 * j] = cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - raderFFTkernel_temp[2 * j + 1] = -sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + raderFFTkernel_temp[2 * j] = pfcos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + raderFFTkernel_temp[2 * j + 1] = -pfsin(2.0 * g_pow * double_PI / 
axis->specializationConstants.raderContainer[i].prime); } fftwl_plan p; p = fftwl_plan_dft_1d((int)(axis->specializationConstants.raderContainer[i].prime - 1), (fftwl_complex*)raderFFTkernel_temp, (fftwl_complex*)raderFFTkernel_temp, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later raderFFTkernel[2 * j] = (double)raderFFTkernel_temp[2 * j]; raderFFTkernel[2 * j + 1] = (double)raderFFTkernel_temp[2 * j + 1]; } @@ -974,20 +1040,47 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF continue; } #endif - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - long double double_PI = 3.14159265358979323846264338327950288419716939937510L; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + PfContainer in = VKFFT_ZERO_INIT; + PfContainer temp1 = VKFFT_ZERO_INIT; + in.type = 22; + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + double* raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 4); + if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; + axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; + app->raderFFTkernel[write_id] = (void*)raderFFTkernel; + app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 4; + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; + } + pfLD angle = g_pow * double_PI * pfFPinit("2.0") / axis->specializationConstants.raderContainer[i].prime; + in.data.d = pfcos(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + raderFFTkernel[4 * j] = (double)temp1.data.dd[0].data.d; + raderFFTkernel[4 * j + 1] = (double)temp1.data.dd[1].data.d; + in.data.d = -pfsin(angle); + PfConvToDoubleDouble(&axis->specializationConstants, &temp1, &in); + raderFFTkernel[4 * j + 2] = (double)temp1.data.dd[0].data.d; + raderFFTkernel[4 * j + 3] = (double)temp1.data.dd[1].data.d; + } + PfDeallocateContainer(&axis->specializationConstants, &temp1); + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfLD double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); double* raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2); if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + for (pfUINT j = 0; j < 
(axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } - raderFFTkernel[2 * j] = (double)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - raderFFTkernel[2 * j + 1] = (double)-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + raderFFTkernel[2 * j] = (double)pfcos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + raderFFTkernel[2 * j + 1] = (double)-pfsin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); } } else { @@ -997,13 +1090,13 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { + for (pfUINT j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later + pfUINT g_pow = 1; + for (pfUINT t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } - raderFFTkernel[2 * j] = (float)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - raderFFTkernel[2 * j + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); + raderFFTkernel[2 * j] = (float)pfcos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); + raderFFTkernel[2 * j + 1] = (float)(-pfsin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } @@ -1015,6 +1108,7 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF kernelPreparationConfiguration.size[1] = 1; kernelPreparationConfiguration.size[2] = 1; kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); + kernelPreparationConfiguration.quadDoubleDoublePrecision = (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory); kernelPreparationConfiguration.useLUT = 1; kernelPreparationConfiguration.fixMinRaderPrimeFFT = 17; kernelPreparationConfiguration.fixMinRaderPrimeMult = 17; @@ -1029,6 +1123,8 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can be initialized before VkFFT plan creation. 
if not, VkFFT will create and destroy one after initialization kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; + if (app->configuration.stagingBuffer != 0) kernelPreparationConfiguration.stagingBuffer = app->configuration.stagingBuffer; + if (app->configuration.stagingBufferMemory != 0) kernelPreparationConfiguration.stagingBufferMemory = app->configuration.stagingBufferMemory; #elif(VKFFT_BACKEND==3) kernelPreparationConfiguration.context = app->configuration.context; #elif(VKFFT_BACKEND==4) @@ -1040,8 +1136,9 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF kernelPreparationConfiguration.queue = app->configuration.queue; #endif - uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2]; + pfUINT bufferSize = (pfUINT)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2]; if (kernelPreparationConfiguration.doublePrecision) bufferSize *= sizeof(double) / sizeof(float); + if (kernelPreparationConfiguration.quadDoubleDoublePrecision) bufferSize *= 2 * sizeof(double) / sizeof(float); kernelPreparationConfiguration.bufferSize = &bufferSize; resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration); @@ -1293,10 +1390,13 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF } } if (app->configuration.loadApplicationFromString) { - uint64_t offset = 0; - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { - uint64_t current_size = 0; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + pfUINT offset = 0; + for (pfUINT i = 0; i < app->numRaderFFTPrimes; i++) { + pfUINT current_size = 0; + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + current_size = (app->rader_primes[i] - 1) * sizeof(double) * 4; + } + else if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { current_size = (app->rader_primes[i] - 1) * sizeof(double) * 2; } else { @@ -1307,7 +1407,7 @@ static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkF if (!app->raderFFTkernel[i]) return VKFFT_ERROR_MALLOC_FAILED; memcpy(app->raderFFTkernel[i], (char*)app->configuration.loadApplicationString + app->applicationStringOffsetRader + offset, current_size); } - for (uint64_t j = 0; j < axis->specializationConstants.numRaderPrimes; j++) { + for (pfUINT j = 0; j < axis->specializationConstants.numRaderPrimes; j++) { if ((app->rader_primes[i] == axis->specializationConstants.raderContainer[j].prime) && (axis->specializationConstants.raderContainer[j].type == 0)) axis->specializationConstants.raderContainer[j].raderFFTkernel = app->raderFFTkernel[i]; } diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h index 71a2208c..2fb6eb27 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h @@ -22,8 +22,294 @@ #ifndef VKFFT_SCHEDULER_H #define VKFFT_SCHEDULER_H #include "vkFFT/vkFFT_Structs/vkFFT_Structs.h" +static inline VkFFTResult VkFFTGetRegistersPerThreadQuad(VkFFTApplication* app, int fft_length, int extraSharedMemoryForPow2, pfUINT 
max_rhs, int useRader, int* loc_multipliers, int* registers_per_thread_per_radix, int* registers_per_thread, int* min_registers_per_thread, int* isGoodSequence) { + for (int i = 0; i < 33; i++) { + registers_per_thread_per_radix[i] = 0; + } + registers_per_thread[0] = 0; + min_registers_per_thread[0] = 10000000; + + if (loc_multipliers[2] > 0) { + if (loc_multipliers[3] > 0) { + if (loc_multipliers[5] > 0) { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else + { + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + else + { + if (loc_multipliers[7] > 0) { + switch (loc_multipliers[2]) { + case 1: + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + case 2: + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + default: + registers_per_thread_per_radix[2] = 8; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + } + } + else + { + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + } + else { + if (loc_multipliers[5] > 0) { + if (loc_multipliers[7] > 0) { + switch (loc_multipliers[2]) { + case 1: + registers_per_thread_per_radix[2] = 6; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + case 2: + registers_per_thread_per_radix[2] = 8; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + default: + registers_per_thread_per_radix[2] = 8; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + break; + } + } + else + { + registers_per_thread_per_radix[2] = 4; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + else + { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] = 8; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + 
registers_per_thread_per_radix[13] = 0; + } + else + { + int max_loc_multipliers_pow2 = 0; + pfUINT active_threads_y = max_rhs / 64; //estimate workbalance across CU (assume we have 64 CU) + if (active_threads_y == 0) active_threads_y = 1; + int testMinStages = 10000000; + int maxRadixMinStages = 1; + int fixMaxCheckRadix2 = 3; + + for (int i = 1; i <= fixMaxCheckRadix2; i++) { + int numStages = (int)pfceil(log2(fft_length) / ((double)i)); + if (numStages < testMinStages) { + testMinStages = numStages; + maxRadixMinStages = i; + } + } + for (int i = maxRadixMinStages; i >= 1; i--) { + pfUINT active_threads_x = (active_threads_y * fft_length) / ((int)pow(2, i)); + if (active_threads_x >= 128) { + max_loc_multipliers_pow2 = i; + i = 1; + } -static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int fft_length, int extraSharedMemoryForPow2, uint64_t max_rhs, int useRader, int* loc_multipliers, int* registers_per_thread_per_radix, int* registers_per_thread, int* min_registers_per_thread, int* isGoodSequence) { + } + if (max_loc_multipliers_pow2 < 3) max_loc_multipliers_pow2 = 3; + + int final_loc_multipliers_pow2 = 1; + int num_stages_min = (int)log2(fft_length); + for (int i = 2; i <= max_loc_multipliers_pow2; i++) { + int num_stages = (int)pfceil(((int)log2(fft_length)) / (double)i); + if (num_stages < num_stages_min) { + final_loc_multipliers_pow2 = i; + num_stages_min = num_stages; + } + + } + registers_per_thread_per_radix[2] = (loc_multipliers[2] > final_loc_multipliers_pow2) ? (int)pow(2, final_loc_multipliers_pow2) : (int)pow(2, loc_multipliers[2]); + registers_per_thread_per_radix[2] = (loc_multipliers[2] < 3) ? (int)pow(2, loc_multipliers[2]) : registers_per_thread_per_radix[2]; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + } + } + else { + if (loc_multipliers[3] > 0) { + if (loc_multipliers[5] > 0) { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else + { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 3; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + else + { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 6; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else + { + if (loc_multipliers[3] == 1) { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 3; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 9; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + } + } + else { + if (loc_multipliers[5] > 0) { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] 
= 0; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else + { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 5; + registers_per_thread_per_radix[7] = 0; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + } + else + { + if (loc_multipliers[7] > 0) { + registers_per_thread_per_radix[2] = 0; + registers_per_thread_per_radix[3] = 0; + registers_per_thread_per_radix[5] = 0; + registers_per_thread_per_radix[7] = 7; + registers_per_thread_per_radix[11] = 0; + registers_per_thread_per_radix[13] = 0; + } + else + { + min_registers_per_thread[0] = 2; + registers_per_thread[0] = 2; + //Rader-only sequence + //return VKFFT_ERROR_UNSUPPORTED_RADIX; + } + } + } + + } + + registers_per_thread_per_radix[32] = ((registers_per_thread_per_radix[2] % 32) == 0) ? registers_per_thread_per_radix[2] : 0; + registers_per_thread_per_radix[16] = ((registers_per_thread_per_radix[2] % 16) == 0) ? registers_per_thread_per_radix[2] : 0; + registers_per_thread_per_radix[8] = ((registers_per_thread_per_radix[2] % 8) == 0) ? registers_per_thread_per_radix[2] : 0; + registers_per_thread_per_radix[4] = ((registers_per_thread_per_radix[2] % 4) == 0) ? registers_per_thread_per_radix[2] : 0; + if ((registers_per_thread_per_radix[2] >= 12) && (registers_per_thread_per_radix[3] >= 12)) { + registers_per_thread_per_radix[12] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; + if ((registers_per_thread_per_radix[12] % 12) != 0) registers_per_thread_per_radix[12] = 0; + } + registers_per_thread_per_radix[6] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; + registers_per_thread_per_radix[9] = ((registers_per_thread_per_radix[3] % 9) == 0) ? registers_per_thread_per_radix[3] : 0; + registers_per_thread_per_radix[10] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[5]) ? registers_per_thread_per_radix[5] : registers_per_thread_per_radix[2]; + registers_per_thread_per_radix[14] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[7]) ? registers_per_thread_per_radix[7] : registers_per_thread_per_radix[2]; + registers_per_thread_per_radix[15] = (registers_per_thread_per_radix[3] > registers_per_thread_per_radix[5]) ? 
registers_per_thread_per_radix[5] : registers_per_thread_per_radix[3]; + + for (int i = 0; i < 33; i++) { + if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i]; + if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; + } + if ((registers_per_thread[0] > 16) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0; + else isGoodSequence[0] = 1; + return VKFFT_SUCCESS; +} + +static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int fft_length, int extraSharedMemoryForPow2, pfUINT max_rhs, int useRader, int* loc_multipliers, int* registers_per_thread_per_radix, int* registers_per_thread, int* min_registers_per_thread, int* isGoodSequence) { + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + VkFFTGetRegistersPerThreadQuad(app, fft_length, extraSharedMemoryForPow2, max_rhs, useRader, loc_multipliers, registers_per_thread_per_radix, registers_per_thread, min_registers_per_thread, isGoodSequence); + return VKFFT_SUCCESS; + } for (int i = 0; i < 33; i++) { registers_per_thread_per_radix[i] = 0; } @@ -906,7 +1192,7 @@ static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int } else { int max_loc_multipliers_pow2 = 0; - uint64_t active_threads_y = max_rhs / 64; //estimate workbalance across CU (assume we have 64 CU) + pfUINT active_threads_y = max_rhs / 64; //estimate workbalance across CU (assume we have 64 CU) if (active_threads_y == 0) active_threads_y = 1; int testMinStages = 10000000; int maxRadixMinStages = 1; @@ -915,14 +1201,14 @@ static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int fixMaxCheckRadix2 = (((fft_length >= 1024) || (fft_length == 256)) && (extraSharedMemoryForPow2) && (!useRader)) ? 
5 : 3; #endif for (int i = 1; i <= fixMaxCheckRadix2; i++) { - int numStages = (int)ceil(log2(fft_length) / ((double)i)); + int numStages = (int)pfceil(log2(fft_length) / ((double)i)); if (numStages < testMinStages) { testMinStages = numStages; maxRadixMinStages = i; } } for (int i = maxRadixMinStages; i >= 1; i--) { - uint64_t active_threads_x = (active_threads_y * fft_length) / ((int)pow(2, i)); + pfUINT active_threads_x = (active_threads_y * fft_length) / ((int)pow(2, i)); if (active_threads_x >= 128) { max_loc_multipliers_pow2 = i; i = 1; @@ -934,7 +1220,7 @@ static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int int final_loc_multipliers_pow2 = 1; int num_stages_min = (int)log2(fft_length); for (int i = 2; i <= max_loc_multipliers_pow2; i++) { - int num_stages = (int)ceil(((int)log2(fft_length)) / (double)i); + int num_stages = (int)pfceil(((int)log2(fft_length)) / (double)i); if (num_stages < num_stages_min) { final_loc_multipliers_pow2 = i; num_stages_min = num_stages; @@ -1376,6 +1662,7 @@ static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, int else isGoodSequence[0] = 1; return VKFFT_SUCCESS; } + static inline VkFFTResult VkFFTGetRegistersPerThreadOptimizeShared(int fft_length, int* registers_per_thread_per_radix, int* registers_per_thread, int* min_registers_per_thread) { //try to split sequence in supported radix to optimize sm usage int numStages = 20; @@ -1426,7 +1713,7 @@ static inline VkFFTResult VkFFTGetRegistersPerThreadOptimizeShared(int fft_lengt for (int i = 0; i < 33; i++) { if (registers_per_thread_per_radix[i] != 0) { double ratio = (registers_per_thread[0] / (double)registers_per_thread_per_radix[i]); - int ratio_ceil = (int)ceil(ratio); + int ratio_ceil = (int)pfceil(ratio); int ratio_floor = (int)floor(ratio); double ratio2 = ((registers_per_thread_per_radix[i] * ratio_ceil) / (double)registers_per_thread[0]); double ratio3 = (registers_per_thread[0] / (double)(registers_per_thread_per_radix[i] * ratio_floor)); @@ -1443,11 +1730,11 @@ static inline VkFFTResult VkFFTGetRegistersPerThreadOptimizeShared(int fft_lengt } return VKFFT_SUCCESS; } -static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRaderContainer** raderContainer_input, uint64_t* tempSequence, int* numRaderPrimes, int fft_radix_part) { +static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRaderContainer** raderContainer_input, pfUINT* tempSequence, int* numRaderPrimes, int fft_radix_part) { VkFFTResult res = VKFFT_SUCCESS; - uint64_t locTempSequence = tempSequence[0]; - uint64_t tempSequence_copy = tempSequence[0]; - uint64_t limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); + pfUINT locTempSequence = tempSequence[0]; + pfUINT tempSequence_copy = tempSequence[0]; + pfUINT limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? 
app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); for (int i = (int)app->configuration.fixMinRaderPrimeMult; i < limit; i++) { if (locTempSequence % i == 0) { numRaderPrimes[0]++; @@ -1464,7 +1751,7 @@ static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRa raderContainer_input[0] = (VkFFTRaderContainer*)calloc(sizeof(VkFFTRaderContainer), numRaderPrimes[0]); if (raderContainer_input[0] == 0) return VKFFT_ERROR_MALLOC_FAILED; VkFFTRaderContainer* raderContainer = raderContainer_input[0]; - uint64_t tempSequence_temp = 1; + pfUINT tempSequence_temp = 1; limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); for (int i = (int)app->configuration.fixMinRaderPrimeMult; i < limit; i++) { if (tempSequence[0] % i == 0) { @@ -1475,7 +1762,7 @@ static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRa continue; } //Sophie Germain safe prime check - uint64_t tempSequence2 = i - 1; + pfUINT tempSequence2 = i - 1; for (int j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { if (tempSequence2 % j == 0) { tempSequence2 /= j; @@ -1586,12 +1873,12 @@ static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRa } static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* raderContainer, int numRaderPrimes, int fftDim, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix) { VkFFTResult res = VKFFT_SUCCESS; - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { if (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0] >= 2) { min_registers_per_thread[0] *= (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0]); for (int j = 0; j < 33; j++) { - if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread[0])) registers_per_thread_per_radix[j] *= (int)ceil(min_registers_per_thread[0] / (double)registers_per_thread_per_radix[j]); + if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread[0])) registers_per_thread_per_radix[j] *= (int)pfceil(min_registers_per_thread[0] / (double)registers_per_thread_per_radix[j]); } for (int j = 0; j < 33; j++) { if (registers_per_thread_per_radix[j] > registers_per_thread[0]) registers_per_thread[0] = registers_per_thread_per_radix[j]; @@ -1600,7 +1887,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra else if (min_registers_per_thread[0] / raderContainer[i].min_registers_per_thread >= 2) { raderContainer[i].min_registers_per_thread *= (min_registers_per_thread[0] / raderContainer[i].min_registers_per_thread); for (int j = 0; j < 33; j++) { - if ((raderContainer[i].registers_per_thread_per_radix[j] > 0) && (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread)) raderContainer[i].registers_per_thread_per_radix[j] *= (int)ceil(raderContainer[i].min_registers_per_thread / (double)raderContainer[i].registers_per_thread_per_radix[j]); + if ((raderContainer[i].registers_per_thread_per_radix[j] > 0) && (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread)) raderContainer[i].registers_per_thread_per_radix[j] *= (int)pfceil(raderContainer[i].min_registers_per_thread / 
(double)raderContainer[i].registers_per_thread_per_radix[j]); } for (int j = 0; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; @@ -1618,12 +1905,12 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra } } - for (int64_t j = 2; j < 33; j++) { + for (pfINT j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] != 0) { - double scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); - while (((int)ceil(fftDim / (double)min_registers_per_thread[0])) < (raderContainer[i].containerFFTNum * scaling)) { + double scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); + while (((int)pfceil(fftDim / (double)min_registers_per_thread[0])) < (raderContainer[i].containerFFTNum * scaling)) { raderContainer[i].registers_per_thread_per_radix[j] += (int)j; - scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); + scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? 
pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); } if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } @@ -1632,9 +1919,9 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra } } //try to increase registers usage closer to registers_per_thread across all primes - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { - for (int64_t j = 2; j < 33; j++) { + for (pfINT j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { while ((raderContainer[i].registers_per_thread_per_radix[j] + j) <= registers_per_thread[0] + 1) {// fix raderContainer[i].registers_per_thread_per_radix[j] += (int)j; @@ -1643,7 +1930,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra } raderContainer[i].registers_per_thread = 0; raderContainer[i].min_registers_per_thread = 10000000; - for (int64_t j = 2; j < 33; j++) { + for (pfINT j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { if (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread) { raderContainer[i].min_registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; @@ -1656,13 +1943,13 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra } } //subprimes optimization - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].numSubPrimes) { res = VkFFTOptimizeRaderFFTRegisters(raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } } - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (min_registers_per_thread[0] > raderContainer[i].min_registers_per_thread) min_registers_per_thread[0] = raderContainer[i].min_registers_per_thread; if (registers_per_thread[0] < raderContainer[i].registers_per_thread) registers_per_thread[0] = raderContainer[i].registers_per_thread; } @@ -1817,7 +2104,7 @@ static inline VkFFTResult VkFFTOptimizeRadixKernels(int* registers_per_thread_pe } static inline VkFFTResult VkFFTGetRaderFFTStages(VkFFTRaderContainer* raderContainer, int numRaderPrimes, int* stageid, int* stageRadix, int* stage_rader_generator) { VkFFTResult res = VKFFT_SUCCESS; - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].multiplier > 0) { stageRadix[stageid[0]] = raderContainer[i].prime; stage_rader_generator[stageid[0]] = raderContainer[i].generator; @@ -1828,7 +2115,7 @@ static inline VkFFTResult VkFFTGetRaderFFTStages(VkFFTRaderContainer* raderConta //find primitive root } } - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { if (raderContainer[i].numSubPrimes > 0) { res = VkFFTGetRaderFFTStages(raderContainer[i].container, raderContainer[i].numSubPrimes, &raderContainer[i].numStages, raderContainer[i].stageRadix, 
raderContainer[i].stage_rader_generator); @@ -1876,7 +2163,7 @@ static inline VkFFTResult VkFFTGetRaderFFTStages(VkFFTRaderContainer* raderConta } static inline VkFFTResult VkFFTMinMaxRegisterCheck(int numStages, int* stageRadix, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix, VkFFTRaderContainer* raderContainer, int numRaderPrimes, int* stage_rader_generator) { VkFFTResult res = VKFFT_SUCCESS; - for (int64_t j = 0; j < (int64_t)numStages; j++) { + for (pfINT j = 0; j < (pfINT)numStages; j++) { if (stage_rader_generator[j] == 0) { if (registers_per_thread_per_radix[stageRadix[j]] > 0) { if (registers_per_thread_per_radix[stageRadix[j]] < min_registers_per_thread[0]) { @@ -1888,10 +2175,10 @@ static inline VkFFTResult VkFFTMinMaxRegisterCheck(int numStages, int* stageRadi } } else { - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].prime == stageRadix[j]) { if (raderContainer[i].type == 0) { - for (int64_t j2 = 0; j2 < (int64_t)raderContainer[i].numStages; j2++) { + for (pfINT j2 = 0; j2 < (pfINT)raderContainer[i].numStages; j2++) { if (raderContainer[i].stage_rader_generator[j] == 0) { if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] > 0) { if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] < min_registers_per_thread[0]) { @@ -1917,15 +2204,15 @@ static inline VkFFTResult VkFFTMinMaxRegisterCheck(int numStages, int* stageRadi static inline VkFFTResult VkFFTGetRaderFFTThreadsNum(VkFFTRaderContainer* raderContainer, int numRaderPrimes, int* numThreads) { VkFFTResult res = VKFFT_SUCCESS; - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { if (raderContainer[i].numSubPrimes > 0) { res = VkFFTGetRaderFFTThreadsNum(raderContainer[i].container, raderContainer[i].numSubPrimes, numThreads); if (res != VKFFT_SUCCESS) return res; } - for (int64_t j = 0; j < (int64_t)raderContainer[i].numStages; j++) { + for (pfINT j = 0; j < (pfINT)raderContainer[i].numStages; j++) { if (raderContainer[i].stage_rader_generator[j] == 0) { - if (raderContainer[i].containerFFTNum * (int)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]) > numThreads[0]) numThreads[0] = raderContainer[i].containerFFTNum * (int)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]); + if (raderContainer[i].containerFFTNum * (int)pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]) > numThreads[0]) numThreads[0] = raderContainer[i].containerFFTNum * (int)pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]); } } } @@ -1938,20 +2225,24 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl VkFFTAxis* axes = FFTPlan->axes[axis_id]; int complexSize; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) - complexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - complexSize = (2 * sizeof(float)); + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) + complexSize = (4 * sizeof(double)); + 
else + { + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) + complexSize = (2 * sizeof(double)); else - complexSize = (2 * sizeof(float)); - + if (app->configuration.halfPrecision) + complexSize = (2 * sizeof(float)); + else + complexSize = (2 * sizeof(float)); + } int usedSharedMemory = ((app->configuration.size[axis_id] & (app->configuration.size[axis_id] - 1)) == 0) ? (int)app->configuration.sharedMemorySizePow2 : (int)app->configuration.sharedMemorySize; int maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; int maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; int nonStridedAxisId = (app->configuration.considerAllAxesStrided) ? -1 : 0; - uint64_t max_rhs = 1; + pfUINT max_rhs = 1; for (int i = 0; i < app->configuration.FFTdim; i++) { FFTPlan->actualFFTSizePerAxis[axis_id][i] = app->configuration.size[i]; if ((FFTPlan->actualFFTSizePerAxis[axis_id][i] > 0)) max_rhs *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; @@ -1975,7 +2266,10 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl if (app->configuration.performDCT == 1) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = 2 * app->configuration.size[axis_id] - 2; // now in actualFFTSize - modified dimension size for R2C/DCT } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { + if (app->configuration.performDST == 1) { + FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = 2 * app->configuration.size[axis_id] + 2; // now in actualFFTSize - modified dimension size for R2C/DCT + } + if (((app->configuration.performDCT == 4) || (app->configuration.performDST == 4)) && (app->configuration.size[axis_id] % 2 == 0)) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT //FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] * 8; // now in actualFFTSize - modified dimension size for R2C/DCT } @@ -1992,7 +2286,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl multipliers[i] = 0; } - uint64_t tempSequence = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; + pfUINT tempSequence = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; for (int i = 2; i < app->configuration.fixMinRaderPrimeMult; i++) { if (tempSequence % i == 0) { tempSequence /= i; @@ -2010,7 +2304,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl rader_multipliers[i] = 0; rader_primes[i] = 0; } - uint64_t tempSequence_temp = 1; + pfUINT tempSequence_temp = 1; int maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / ((int)app->configuration.coalescedMemory) : usedSharedMemory / complexSize; int limit_max_rader_prime = ((axis_id == nonStridedAxisId) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] <= maxSequenceLengthSharedMemory)) ? 
maxSequenceLengthSharedMemory : maxSequenceLengthSharedMemoryStrided_temp; if (limit_max_rader_prime > app->configuration.fixMaxRaderPrimeFFT) limit_max_rader_prime = (int)app->configuration.fixMaxRaderPrimeFFT; @@ -2023,7 +2317,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl continue; } //Sophie Germain safe prime check - uint64_t tempSequence2 = i - 1; + pfUINT tempSequence2 = i - 1; for (int j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { if (tempSequence2 % j == 0) { tempSequence2 /= j; @@ -2149,13 +2443,13 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl else { while (!FFTSizeSelected) { if (axis_id == nonStridedAxisId) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (int)pow(2, (int)ceil(log2(tempSequence))); + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)pfceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)pfceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (int)pow(2, (int)pfceil(log2(tempSequence))); } else { int maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / ((int)app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (int)pow(2, (int)ceil(log2(tempSequence))); + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)pfceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)pfceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (int)pow(2, (int)pfceil(log2(tempSequence))); } - uint64_t testSequence = tempSequence; + pfUINT testSequence = tempSequence; for (int i = 0; i < 33; i++) { multipliers[i] = 0; } @@ -2208,7 +2502,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } if (app->configuration.fixMaxRadixBluestein > 0) { while (!FFTSizeSelected) { - uint64_t testSequence = tempSequence; + pfUINT testSequence = tempSequence; for (int i = 0; i < 33; i++) { multipliers[i] = 0; } @@ -2226,13 +2520,13 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl else { while (!FFTSizeSelected) { if (axis_id == nonStridedAxisId) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (int)pow(2, (int)ceil(log2(tempSequence))); + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || 
((((int)pow(2, (int)pfceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)pfceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (int)pow(2, (int)pfceil(log2(tempSequence))); } else { int maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / ((int)app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (int)pow(2, (int)ceil(log2(tempSequence))); + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((int)pow(2, (int)pfceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((int)pow(2, (int)pfceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (int)pow(2, (int)pfceil(log2(tempSequence))); } - uint64_t testSequence = tempSequence; + pfUINT testSequence = tempSequence; for (int i = 0; i < 33; i++) { multipliers[i] = 0; } @@ -2280,8 +2574,8 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl int maxSingleSizeStrided = (!app->configuration.performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided; int numPasses = 1; int numPassesHalfBandwidth = 1; - uint64_t temp; - temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStrided); + pfUINT temp; + temp = (axis_id == nonStridedAxisId) ? (pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStrided); if (temp > 1) {//more passes than one for (int i = 1; i <= app->configuration.registerBoost4Step; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { @@ -2292,11 +2586,11 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl if ((!app->configuration.performConvolution)) maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost; temp = ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) ? FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided; if (app->configuration.reorderFourStep && (!app->useBluesteinFFT[axis_id])) - numPasses = (int)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided)); + numPasses = (int)pfceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided)); else - numPasses += (int)ceil(log2(temp) / log2(maxSingleSizeStrided)); + numPasses += (int)pfceil(log2(temp) / log2(maxSingleSizeStrided)); } - registerBoost = ((axis_id == nonStridedAxisId) && ((app->useBluesteinFFT[axis_id]) || (!app->configuration.reorderFourStep) || (numPasses == 1))) ? 
(int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)pow(maxSequenceLengthSharedMemoryStrided, numPasses)); + registerBoost = ((axis_id == nonStridedAxisId) && ((app->useBluesteinFFT[axis_id]) || (!app->configuration.reorderFourStep) || (numPasses == 1))) ? (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)pow(maxSequenceLengthSharedMemoryStrided, numPasses)); int canBoost = 0; for (int i = registerBoost; i <= app->configuration.registerBoost; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { @@ -2314,12 +2608,12 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl int maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; if ((axes->specializationConstants.performBandwidthBoost)) { maxSingleSizeStridedHalfBandwidth = (app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost > complexSize) ? usedSharedMemory / ((int)app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost) : usedSharedMemory / complexSize; - temp = (axis_id == nonStridedAxisId) ? (int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); + temp = (axis_id == nonStridedAxisId) ? (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); //temp = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided; if (temp > 1) {//more passes than two - temp = ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id])) ? (int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (int)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); + temp = ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id])) ? (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (int)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); for (int i = 0; i < 5; i++) { - temp = (int)ceil(temp / (double)maxSingleSizeStrided); + temp = (int)pfceil(temp / (double)maxSingleSizeStrided); numPassesHalfBandwidth++; if (temp == 1) i = 5; } @@ -2327,18 +2621,19 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl temp = ((axis_id == 0) && (!app->configuration.reorderFourStep)) ? 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStridedHalfBandwidth; if (app->configuration.reorderFourStep) - numPassesHalfBandwidth = (int)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStridedHalfBandwidth)); + numPassesHalfBandwidth = (int)pfceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStridedHalfBandwidth)); else - numPassesHalfBandwidth = 1 + (int)ceil(log2(temp) / log2(maxSingleSizeStridedHalfBandwidth)); + numPassesHalfBandwidth = 1 + (int)pfceil(log2(temp) / log2(maxSingleSizeStridedHalfBandwidth)); if ((numPassesHalfBandwidth == 2)&& (!app->configuration.reorderFourStep)&&(registerBoost>1)) //switch back for two step and don't do half bandwidth on strided accesses if register boost and no 4-step reordering */ } if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth; else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; } - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 131072)) numPasses = 3;//Force set to 3 stage 4 step algorithm + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.swapTo2Stage4Step) && (numPasses < 3)) numPasses = 2;//Force set to 2 stage 4 step algorithm + if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 65536)) numPasses = 3;//Force set to 3 stage 4 step algorithm if (forceRaderTwoUpload && (numPasses == 1)) numPasses = 2;//Force set Rader cases that use more than 512 or maxNumThreads threads per one of Rader primes - uint64_t* locAxisSplit = FFTPlan->axisSplit[axis_id]; + pfUINT* locAxisSplit = FFTPlan->axisSplit[axis_id]; if (numPasses == 1) { locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } @@ -2409,7 +2704,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } } }*/ - int sqrtSequence = (int)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); + int sqrtSequence = (int)pfceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); for (int i = 0; i < sqrtSequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) { @@ -2422,7 +2717,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } } else { - int sqrtSequence = (int)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); + int sqrtSequence = (int)pfceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); for (int i = 0; i < sqrtSequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) { @@ -2537,7 +2832,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { for (int i = 0; i < maxSequenceLengthSharedMemory; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) { - int sqrt3Sequence = 
(int)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i))); + int sqrt3Sequence = (int)pfceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i))); for (int j = 0; j < sqrt3Sequence; j++) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) { if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) { @@ -2554,10 +2849,10 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } } else { - int sqrt3Sequence = (int)ceil(pow(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0)); + int sqrt3Sequence = (int)pfceil(pow(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0)); for (int i = 0; i < sqrt3Sequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrt3Sequence - i) == 0) { - int sqrt2Sequence = (int)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i))); + int sqrt2Sequence = (int)pfceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i))); for (int j = 0; j < sqrt2Sequence; j++) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) { if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) { @@ -2581,9 +2876,9 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl //printf("sequence length exceeds boundaries\n"); return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH; } - if ((numPasses > 1) && (app->configuration.performDCT > 0)) { + if ((numPasses > 1) && ((app->configuration.performDCT > 0) || (app->configuration.performDST > 0))) { //printf("sequence length exceeds boundaries\n"); - return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT; + return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R; } if ((numPasses > 1) && (app->configuration.performR2C > 0) && (axis_id == 0) && (app->configuration.size[axis_id] % 2 != 0)) { //printf("sequence length exceeds boundaries\n"); @@ -2607,7 +2902,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl if (app->useBluesteinFFT[axis_id]) { if ((app->configuration.performR2C) && (axis_id == 0)) { if (FFTPlan->multiUploadR2C) { - uint64_t tempSize = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; + pfUINT tempSize = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; for (int i = 1; i < app->configuration.FFTdim; i++) tempSize *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; @@ -2615,7 +2910,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } } else { - uint64_t tempSize = FFTPlan->actualFFTSizePerAxis[axis_id][0] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; + pfUINT tempSize = FFTPlan->actualFFTSizePerAxis[axis_id][0] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; for (int i = 1; i < 
app->configuration.FFTdim; i++) tempSize *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; @@ -2673,7 +2968,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl if (res != VKFFT_SUCCESS) return res; } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { if (axes[k].specializationConstants.useRaderFFT < axes[k].specializationConstants.raderContainer[i].prime) axes[k].specializationConstants.useRaderFFT = axes[k].specializationConstants.raderContainer[i].prime; if (axes[k].specializationConstants.raderContainer[i].containerFFTNum > app->configuration.maxThreadsNum) return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH; @@ -2719,15 +3014,15 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl int rader_min_registers = min_registers_per_thread; if (axes[k].specializationConstants.useRaderMult) { - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 1) { - int temp_rader = (int)ceil((locAxisSplit[k] / (double)((rader_min_registers / 2 + scale_registers_rader) * 2)) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); - int active_rader = (int)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); + int temp_rader = (int)pfceil((locAxisSplit[k] / (double)((rader_min_registers / 2 + scale_registers_rader) * 2)) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); + int active_rader = (int)pfceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((int)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((int)pfceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } - int local_estimate_rader_threadnum = (int)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2) * maxBatchCoalesced; + int local_estimate_rader_threadnum = (int)pfceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2) * maxBatchCoalesced; if ((maxBatchCoalesced * locAxisSplit[k] / ((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)) > local_estimate_rader_threadnum) local_estimate_rader_threadnum = (maxBatchCoalesced * (int)locAxisSplit[k] / ((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)); 
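// Descriptive note on the estimate above: each length-prime Rader-multiplication sub-FFT is assigned (prime + 1) / 2 threads, there are locAxisSplit[k] / prime such sub-FFTs split across active_rader groups, and the result is multiplied by the coalesced batch size; it is then lower-bounded by the regular registers-per-thread estimate. The condition below raises scale_registers_rader (more values per thread, fewer threads) when this projection exceeds maxThreadsNum, or when either thread-count estimate goes above 256 while each thread currently holds no more than 4 values.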
if ((local_estimate_rader_threadnum > app->configuration.maxThreadsNum) || ((((locAxisSplit[k] / min_registers_per_thread) > 256) || (local_estimate_rader_threadnum > 256)) && (((rader_min_registers / 2 + scale_registers_rader) * 2) <= 4))) { scale_registers_rader++; @@ -2748,7 +3043,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (int j = 2; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { @@ -2766,7 +3061,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl registers_per_thread = registers_per_thread_per_radix[i]; } } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (int j = 2; j < 33; j++) { if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < new_min_registers)) new_min_registers = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; @@ -2778,16 +3073,16 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } min_registers_per_thread = (new_min_registers == 1e7) ? registers_per_thread : new_min_registers; } - if ((int)ceil((maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost)) > app->configuration.maxThreadsNum) || (axes[k].specializationConstants.useRader && (estimate_rader_threadnum > app->configuration.maxThreadsNum))) + if ((int)pfceil((maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost)) > app->configuration.maxThreadsNum) || (axes[k].specializationConstants.useRader && (estimate_rader_threadnum > app->configuration.maxThreadsNum))) { int scaleRegistersNum = 1; if ((axis_id == 0) && (k == 0) && (maxBatchCoalesced > 1)) { maxBatchCoalesced = (int)(app->configuration.maxThreadsNum * (min_registers_per_thread * registerBoost) / locAxisSplit[k]); if (maxBatchCoalesced < 1) maxBatchCoalesced = 1; } - if (((int)ceil(maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost * scaleRegistersNum))) > app->configuration.maxThreadsNum) { + if (((int)pfceil(maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost * scaleRegistersNum))) > app->configuration.maxThreadsNum) { for (int i = 2; i < locAxisSplit[k]; i++) { - if ((((int)ceil(maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost * i))) <= app->configuration.maxThreadsNum)) { + if ((((int)pfceil(maxBatchCoalesced * locAxisSplit[k] / (double)(min_registers_per_thread * registerBoost * i))) <= app->configuration.maxThreadsNum)) { scaleRegistersNum = i; i = (int)locAxisSplit[k]; } @@ -2804,16 +3099,16 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl for (int i = 2; i < 33; i++) { if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < new_min_registers)) new_min_registers = registers_per_thread_per_radix[i]; } - for (int64_t i = 0; i < 
(int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (int j = 2; j < 33; j++) { if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < new_min_registers)) new_min_registers = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } } } - if ((int)ceil((maxBatchCoalesced * locAxisSplit[k] / (double)(new_min_registers * registerBoost))) > app->configuration.maxThreadsNum) { + if ((int)pfceil((maxBatchCoalesced * locAxisSplit[k] / (double)(new_min_registers * registerBoost))) > app->configuration.maxThreadsNum) { // if we get here, there can be trouble with small primes, as we can have one thread do at max one fftDim. This is only an issue for small primes in sequences close to shared memory limit sizes for extremely big shared memory sizes (>136KB) - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (int j = 2; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { @@ -2839,7 +3134,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl min_registers_per_thread = registers_per_thread_per_radix[i]; } } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { + for (pfINT i = 0; i < (pfINT)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (int j = 2; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > registers_per_thread) { @@ -2886,7 +3181,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl axes[k].specializationConstants.numStages = 0; axes[k].specializationConstants.fftDim.type = 31; axes[k].specializationConstants.fftDim.data.i = locAxisSplit[k]; - int tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep)||(app->useBluesteinFFT[axis_id]))) ? (int)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeNonStrided) : (int)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeStrided); + int tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep)||(app->useBluesteinFFT[axis_id]))) ? 
(int)pfceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeNonStrided) : (int)pfceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeStrided); int switchRegisterBoost = 0; if (tempRegisterBoost > 1) { if (loc_multipliers[tempRegisterBoost] > 0) { @@ -2921,10 +3216,10 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl axes[k].specializationConstants.rader_min_registers = rader_min_registers; for (int i = 0; i < axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 1) { - int temp_rader = (int)ceil((locAxisSplit[k] / (double)axes[k].specializationConstants.rader_min_registers) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); - int active_rader = (int)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); + int temp_rader = (int)pfceil((locAxisSplit[k] / (double)axes[k].specializationConstants.rader_min_registers) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); + int active_rader = (int)pfceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { - if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((int)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; + if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((int)pfceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } axes[k].specializationConstants.raderRegisters = (active_rader * 2 > axes[k].specializationConstants.raderRegisters) ? 
active_rader * 2 : axes[k].specializationConstants.raderRegisters; if (active_rader * 2 > registers_per_thread) registers_per_thread = active_rader * 2; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h index 956cef91..42c22002 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h @@ -30,7 +30,7 @@ #include "vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_FFT.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" -static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t reverseBluesteinMultiUpload) { +static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, pfUINT axis_id, pfUINT axis_upload_id, pfUINT inverse, pfUINT reverseBluesteinMultiUpload) { //get radix stages VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) @@ -60,7 +60,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla FFTPlan->actualFFTSizePerAxis[axis_id][1] = app->actualNumBatches; } axis->specializationConstants.numBatches.type = 31; - axis->specializationConstants.numBatches.data.i = (int64_t)app->configuration.numberBatches; + axis->specializationConstants.numBatches.data.i = (pfINT)app->configuration.numberBatches; axis->specializationConstants.warpSize = (int)app->configuration.warpSize; axis->specializationConstants.numSharedBanks = (int)app->configuration.numSharedBanks; axis->specializationConstants.useUint64 = (int)app->configuration.useUint64; @@ -86,39 +86,47 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength; axis->specializationConstants.maxTempLength = app->configuration.maxTempLength; - axis->specializationConstants.double_PI = 3.14159265358979323846264338327950288419716939937510L; - - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - axis->specializationConstants.precision = 1; - axis->specializationConstants.complexSize = 16; + axis->specializationConstants.double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + axis->specializationConstants.storeSharedComplexComponentsSeparately = 0; + + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + axis->specializationConstants.precision = 3; + axis->specializationConstants.complexSize = 32; + axis->specializationConstants.storeSharedComplexComponentsSeparately = 1; } else { - if (app->configuration.halfPrecision) { - axis->specializationConstants.precision = 0; - axis->specializationConstants.complexSize = 8; + if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + axis->specializationConstants.precision = 1; + axis->specializationConstants.complexSize = 16; } else { - axis->specializationConstants.precision = 0; - axis->specializationConstants.complexSize = 8; + if (app->configuration.halfPrecision) { + axis->specializationConstants.precision = 0; + axis->specializationConstants.complexSize = 8; + } + else { + axis->specializationConstants.precision = 0; + axis->specializationConstants.complexSize = 8; + } } } - - uint64_t allowedSharedMemory = 
app->configuration.sharedMemorySize; - uint64_t allowedSharedMemoryPow2 = app->configuration.sharedMemorySizePow2; + + pfUINT allowedSharedMemory = app->configuration.sharedMemorySize; + pfUINT allowedSharedMemoryPow2 = app->configuration.sharedMemorySizePow2; if (axis->specializationConstants.useRaderMult) { allowedSharedMemory -= (axis->specializationConstants.useRaderMult - 1) * axis->specializationConstants.complexSize; allowedSharedMemoryPow2 -= (axis->specializationConstants.useRaderMult - 1) * axis->specializationConstants.complexSize; } - uint64_t maxSequenceLengthSharedMemory = allowedSharedMemory / axis->specializationConstants.complexSize; - uint64_t maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / axis->specializationConstants.complexSize; - uint64_t maxSingleSizeStrided = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / axis->specializationConstants.complexSize; - uint64_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / axis->specializationConstants.complexSize; + pfUINT maxSequenceLengthSharedMemory = allowedSharedMemory / axis->specializationConstants.complexSize; + pfUINT maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / axis->specializationConstants.complexSize; + pfUINT maxSingleSizeStrided = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / axis->specializationConstants.complexSize; + pfUINT maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > axis->specializationConstants.complexSize) ? 
allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / axis->specializationConstants.complexSize; axis->specializationConstants.stageStartSize.type = 31; axis->specializationConstants.stageStartSize.data.i = 1; - for (uint64_t i = 0; i < axis_upload_id; i++) + for (pfUINT i = 0; i < axis_upload_id; i++) axis->specializationConstants.stageStartSize.data.i *= FFTPlan->axisSplit[axis_id][i]; axis->specializationConstants.firstStageStartSize.type = 31; @@ -142,12 +150,12 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.useBluesteinFFT = 1; } - if (app->configuration.performDCT == 3) { + if ((app->configuration.performDCT == 3) || (app->configuration.performDST == 3)) { axis->specializationConstants.actualInverse = (int)inverse; axis->specializationConstants.inverse = (int)!inverse; } else { - if (app->configuration.performDCT == 4) { + if ((app->configuration.performDCT == 4) || (app->configuration.performDST == 4)) { axis->specializationConstants.actualInverse = (int)inverse; axis->specializationConstants.inverse = 1; } @@ -159,11 +167,11 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla if (app->useBluesteinFFT[axis_id]) { axis->specializationConstants.actualInverse = (int)inverse; axis->specializationConstants.inverse = (int)reverseBluesteinMultiUpload; - if (app->configuration.performDCT == 3) { + if ((app->configuration.performDCT == 3) || (app->configuration.performDST == 3)) { axis->specializationConstants.inverseBluestein = (int)!inverse; } else { - if (app->configuration.performDCT == 4) { + if ((app->configuration.performDCT == 4) || (app->configuration.performDST == 4)) { axis->specializationConstants.inverseBluestein = 1; } else { @@ -177,31 +185,31 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla if ((axis_id == 0) && ((FFTPlan->numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) { maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost; - maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory)); + maxSequenceLengthSharedMemoryPow2 = (pfUINT)pow(2, (pfUINT)log2(maxSequenceLengthSharedMemory)); } else { maxSingleSizeStrided *= axis->specializationConstants.registerBoost; - maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided)); + maxSingleSizeStridedPow2 = (pfUINT)pow(2, (pfUINT)log2(maxSingleSizeStrided)); } axis->specializationConstants.maxSingleSizeStrided.type = 31; axis->specializationConstants.maxSingleSizeStrided.data.i = maxSingleSizeStrided; axis->specializationConstants.performR2C = (int)FFTPlan->actualPerformR2CPerAxis[axis_id]; axis->specializationConstants.performR2CmultiUpload = (int)FFTPlan->multiUploadR2C; - if (app->configuration.performDCT == 3) { - axis->specializationConstants.performDCT = 2; - } - else { + + if (app->configuration.performDCT > 0) axis->specializationConstants.performDCT = (int)app->configuration.performDCT; - } + if (app->configuration.performDST > 0) + axis->specializationConstants.performDST = (int)app->configuration.performDST; + if ((axis->specializationConstants.performR2CmultiUpload) && (app->configuration.size[0] % 2 != 0)) return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C; - uint64_t additionalR2Cshared = 0; - if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || 
((axis->specializationConstants.performDCT == 4) && ((axis->specializationConstants.fftDim.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload)) { + pfUINT additionalR2Cshared = 0; + if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fftDim.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload)) { additionalR2Cshared = ((axis->specializationConstants.fftDim.data.i % 2) == 0) ? 2 : 1; - if ((axis->specializationConstants.performDCT == 2) || ((axis->specializationConstants.performDCT == 4) && ((axis->specializationConstants.fftDim.data.i % 2) != 0))) additionalR2Cshared = 1; + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fftDim.data.i % 2) != 0))) additionalR2Cshared = 1; } - axis->specializationConstants.mergeSequencesR2C = (((axis->specializationConstants.fftDim.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || (((app->configuration.performDCT == 3) || (app->configuration.performDCT == 2) || (app->configuration.performDCT == 1) || ((app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0)))) ? (1 - (int)app->configuration.disableMergeSequencesR2C) : 0; - //uint64_t passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id; + axis->specializationConstants.mergeSequencesR2C = (((axis->specializationConstants.fftDim.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || ((((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) || ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1)) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0)))) ? 
(1 - (int)app->configuration.disableMergeSequencesR2C) : 0; + //pfUINT passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id; axis->specializationConstants.fft_dim_full.type = 31; axis->specializationConstants.fft_dim_full.data.i = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; if ((FFTPlan->numAxisUploads[axis_id] > 1) && (axis->specializationConstants.reorderFourStep || app->useBluesteinFFT[axis_id]) && (!app->configuration.userTempBuffer) && (app->configuration.allocateTempBuffer == 0)) { @@ -295,7 +303,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla //configure strides PfContainer* axisStride = axis->specializationConstants.inputStride; - uint64_t* usedStride = app->configuration.bufferStride; + pfUINT* usedStride = app->configuration.bufferStride; if ((!inverse) && (axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride; if ((inverse) && (axis_id == app->lastAxis) && ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0)) || (!app->useBluesteinFFT[axis_id])) && (!app->configuration.performConvolution)) && (app->configuration.isInputFormatted) && (!app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride; axisStride[0].type = 31; @@ -328,7 +336,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axisStride[app->configuration.FFTdim+1].data.i = axisStride[app->configuration.FFTdim].data.i * app->configuration.coordinateFeatures; if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)))) { axisStride[0].data.i = 1; - int64_t prevStride = axisStride[0].data.i; + pfINT prevStride = axisStride[0].data.i; if (axis_id == 0) { for (int i = 1; i < app->configuration.FFTdim; i++){ @@ -360,7 +368,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla } } if ((FFTPlan->multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) { - for (uint64_t i = 1; i < (app->configuration.FFTdim+2); i++) { + for (pfUINT i = 1; i < (app->configuration.FFTdim+2); i++) { axisStride[i].data.i /= 2; } } @@ -400,7 +408,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axisStride[app->configuration.FFTdim+1].data.i = axisStride[app->configuration.FFTdim].data.i * app->configuration.coordinateFeatures; if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 1)))) { axisStride[0].data.i = 1; - int64_t prevStride = axisStride[0].data.i; + pfINT prevStride = axisStride[0].data.i; if (axis_id == 0) { for (int i = 1; i < app->configuration.FFTdim; i++){ @@ -432,7 +440,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla } } if ((FFTPlan->multiUploadR2C) && (inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))))) { - for (uint64_t i = 1; i < (app->configuration.FFTdim+2); 
i++) { + for (pfUINT i = 1; i < (app->configuration.FFTdim+2); i++) { axisStride[i].data.i /= 2; } } @@ -474,13 +482,13 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla if ((axis->specializationConstants.axisSwapped) || (!((axis_id == 0) && (axis_upload_id == 0)))) axis->specializationConstants.stridedSharedLayout = 1; /*VkSpecializationMapEntry specializationMapEntries[36] = { {} }; - for (uint64_t i = 0; i < 36; i++) { + for (pfUINT i = 0; i < 36; i++) { specializationMapEntries[i].constantID = i + 1; - specializationMapEntries[i].size = sizeof(uint64_t); - specializationMapEntries[i].offset = i * sizeof(uint64_t); + specializationMapEntries[i].size = sizeof(pfUINT); + specializationMapEntries[i].offset = i * sizeof(pfUINT); } VkSpecializationInfo specializationInfo = { 0 }; - specializationInfo.dataSize = 36 * sizeof(uint64_t); + specializationInfo.dataSize = 36 * sizeof(pfUINT); specializationInfo.mapEntryCount = 36; specializationInfo.pMapEntries = specializationMapEntries;*/ axis->specializationConstants.localSize[0].type = 31; @@ -489,9 +497,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.localSize[0].data.i = axis->axisBlock[0]; axis->specializationConstants.localSize[1].data.i = axis->axisBlock[1]; axis->specializationConstants.localSize[2].data.i = axis->axisBlock[2]; - axis->specializationConstants.numSubgroups = (int)ceil(axis->axisBlock[0] * axis->axisBlock[1] * axis->axisBlock[2] / (double)app->configuration.warpSize); + axis->specializationConstants.numSubgroups = (int)pfceil(axis->axisBlock[0] * axis->axisBlock[1] * axis->axisBlock[2] / (double)app->configuration.warpSize); //specializationInfo.pData = &axis->specializationConstants; - //uint64_t registerBoost = (FFTPlan->numAxisUploads[axis_id] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost; + //pfUINT registerBoost = (FFTPlan->numAxisUploads[axis_id] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost; axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : (int)app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution; @@ -504,56 +512,58 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.sharedMemSize = (int)app->configuration.sharedMemorySize; axis->specializationConstants.sharedMemSizePow2 = (int)app->configuration.sharedMemorySizePow2; axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 
1 : (int)app->configuration.normalize; - for (uint64_t i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { + for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { axis->specializationConstants.size[i].type = 31; - axis->specializationConstants.size[i].data.i = (int64_t)FFTPlan->actualFFTSizePerAxis[axis_id][i]; + axis->specializationConstants.size[i].data.i = (pfINT)FFTPlan->actualFFTSizePerAxis[axis_id][i]; } - for (uint64_t i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { + for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { axis->specializationConstants.frequencyZeropadding = (int)app->configuration.frequencyZeroPadding; axis->specializationConstants.performZeropaddingFull[i] = (int)app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) axis->specializationConstants.fft_zeropad_left_full[i].type = 31; - axis->specializationConstants.fft_zeropad_left_full[i].data.i = (int64_t)app->configuration.fft_zeropad_left[i]; + axis->specializationConstants.fft_zeropad_left_full[i].data.i = (pfINT)app->configuration.fft_zeropad_left[i]; axis->specializationConstants.fft_zeropad_right_full[i].type = 31; - axis->specializationConstants.fft_zeropad_right_full[i].data.i = (int64_t)app->configuration.fft_zeropad_right[i]; + axis->specializationConstants.fft_zeropad_right_full[i].data.i = (pfINT)app->configuration.fft_zeropad_right[i]; } if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->numAxisUploads[axis_id] == 1))) { axis->specializationConstants.zeropadBluestein[0] = 1; axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i = (int64_t)app->configuration.size[axis_id]; + axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i = (pfINT)app->configuration.size[axis_id]; if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i /= 2; - if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i - 2; - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i /= 2; + if (axis->specializationConstants.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i - 2; + if (axis->specializationConstants.performDST == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i + 2; + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id].data.i /= 2; axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id].data.i = (int64_t)FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; + axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id].data.i = (pfINT)FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } if 
(axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))) { axis->specializationConstants.zeropadBluestein[1] = 1; axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i = (int64_t)app->configuration.size[axis_id]; + axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i = (pfINT)app->configuration.size[axis_id]; if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i /= 2; - if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i - 2; - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i /= 2; + if (axis->specializationConstants.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i - 2; + if (axis->specializationConstants.performDST == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i + 2; + if (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id].data.i /= 2; axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id].data.i = (int64_t)FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; + axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id].data.i = (pfINT)FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } - uint64_t zeropad_r2c_multiupload_scale = ((axis_id == 0) && (FFTPlan->multiUploadR2C)) ? 2 : 1; + pfUINT zeropad_r2c_multiupload_scale = ((axis_id == 0) && (FFTPlan->multiUploadR2C)) ? 
2 : 1; if ((inverse)) { if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { axis->specializationConstants.zeropad[0] = (int)app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_left_read[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_left_read[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_right_read[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_right_read[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[0] = 0; if ((!app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!((axis->specializationConstants.useBluesteinFFT) || (app->configuration.performConvolution)))) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)) || (app->configuration.performConvolution)))))) { axis->specializationConstants.zeropad[1] = (int)app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_left_write[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_left_write[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_right_write[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_right_write[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[1] = 0; @@ -562,18 +572,18 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { axis->specializationConstants.zeropad[0] = (int)app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_left_read[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_left_read[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_read[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_right_read[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_right_read[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else 
axis->specializationConstants.zeropad[0] = 0; if (((app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!axis->specializationConstants.useBluesteinFFT)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.useBluesteinFFT && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))))) || (((!app->configuration.frequencyZeroPadding) && (app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1) && (app->configuration.performConvolution)))) { axis->specializationConstants.zeropad[1] = (int)app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_left_write[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_left_write[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_write[axis_id].type = 31; - axis->specializationConstants.fft_zeropad_right_write[axis_id].data.i = (int64_t)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; + axis->specializationConstants.fft_zeropad_right_write[axis_id].data.i = (pfINT)app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[1] = 0; @@ -598,7 +608,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.BluesteinPostMultiplication = 0; - uint64_t tempSize[3]; + pfUINT tempSize[3]; if (axis_id == 0) { if (axis_upload_id == 0) @@ -606,19 +616,19 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla else tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim.data.i / axis->axisBlock[0]; tempSize[1] = FFTPlan->actualFFTSizePerAxis[axis_id][1]; - if ((FFTPlan->actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); + if ((FFTPlan->actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (pfUINT)pfceil(tempSize[1] / 2.0); - //if (app->configuration.performZeropadding[1]) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); - //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); + //if (app->configuration.performZeropadding[1]) tempSize[1] = (pfUINT)pfceil(tempSize[1] / 2.0); + //if (app->configuration.performZeropadding[2]) tempSize[2] = (pfUINT)pfceil(tempSize[2] / 2.0); }else{ - tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)axis->specializationConstants.fftDim.data.i); + tempSize[0] = (pfUINT)pfceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)axis->specializationConstants.fftDim.data.i); tempSize[1] = 1; - //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0); - //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); + //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (pfUINT)pfceil(tempSize[0] / 2.0); + //if 
(app->configuration.performZeropadding[2]) tempSize[2] = (pfUINT)pfceil(tempSize[2] / 2.0); } tempSize[2] = 1; - for (uint64_t i = 1; i < app->configuration.FFTdim; i++) { + for (pfUINT i = 1; i < app->configuration.FFTdim; i++) { if (i!=axis_id) tempSize[2] *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; } @@ -626,14 +636,14 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { - uint64_t temp_tempSize = tempSize[0]; + pfUINT temp_tempSize = tempSize[0]; tempSize[0] = tempSize[1]; tempSize[1] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 1; } else { if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { - uint64_t temp_tempSize = tempSize[0]; + pfUINT temp_tempSize = tempSize[0]; tempSize[0] = tempSize[2]; tempSize[2] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 2; @@ -675,29 +685,29 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->pushConstants.structSize += 1; } if (app->configuration.useUint64) - axis->pushConstants.structSize *= sizeof(uint64_t); + axis->pushConstants.structSize *= sizeof(pfUINT); else axis->pushConstants.structSize *= sizeof(uint32_t); axis->specializationConstants.pushConstantsStructSize = (int)axis->pushConstants.structSize; } - //uint64_t LUT = app->configuration.useLUT; - uint64_t type = 0; + //pfUINT LUT = app->configuration.useLUT; + pfUINT type = 0; if ((axis_id == 0) && (axis_upload_id == 0)) type = 0; if (axis_id != 0) type = 1; if ((axis_id == 0) && (axis_upload_id > 0)) type = 2; //if ((axis->specializationConstants.fftDim == 8 * maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 8)) axis->specializationConstants.registerBoost = 8; if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 5; if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 6; - if ((axis_id == 0) && (app->configuration.performDCT == 1)) type = 110; - if ((axis_id != 0) && (app->configuration.performDCT == 1)) type = 111; - if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 120; - if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 121; - if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 130; - if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 131; - if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 142; - if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 144; - if ((axis_id != 
0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 143; - if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 145; + if ((axis_id == 0) && ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1))) type = 110; + if ((axis_id != 0) && ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1))) type = 111; + if ((axis_id == 0) && ((((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) && (!inverse)) || (((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) && (inverse)))) type = 120; + if ((axis_id != 0) && ((((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) && (!inverse)) || (((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) && (inverse)))) type = 121; + if ((axis_id == 0) && ((((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) && (inverse)) || (((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) && (!inverse)))) type = 130; + if ((axis_id != 0) && ((((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) && (inverse)) || (((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) && (!inverse)))) type = 131; + if ((axis_id == 0) && ((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) == 0)) type = 142; + if ((axis_id == 0) && ((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) == 1)) type = 144; + if ((axis_id != 0) && ((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) == 0)) type = 143; + if ((axis_id != 0) && ((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) == 1)) type = 145; resFFT = initMemoryParametersAPI(app, &axis->specializationConstants); if (resFFT != VKFFT_SUCCESS) { @@ -718,6 +728,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla case 2: axis->specializationConstants.inputNumberByteSize = 16; break; + case 3: + axis->specializationConstants.inputNumberByteSize = 32; + break; } break; case 5: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: @@ -732,6 +745,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla case 2: axis->specializationConstants.inputNumberByteSize = 8; break; + case 3: + axis->specializationConstants.inputNumberByteSize = 16; + break; } break; } @@ -748,6 +764,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla case 2: axis->specializationConstants.outputNumberByteSize = 16; break; + case 3: + axis->specializationConstants.outputNumberByteSize = 32; + break; } break; case 6: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: @@ -762,6 +781,9 @@ static inline VkFFTResult 
VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla case 2: axis->specializationConstants.outputNumberByteSize = 8; break; + case 3: + axis->specializationConstants.outputNumberByteSize = 16; + break; } break; } @@ -775,6 +797,9 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla case 2: axis->specializationConstants.kernelNumberByteSize = 16; break; + case 3: + axis->specializationConstants.kernelNumberByteSize = 32; + break; } resFFT = initParametersAPI(app, &axis->specializationConstants); @@ -813,7 +838,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla freeMemoryParametersAPI(app, &axis->specializationConstants); freeParametersAPI(app, &axis->specializationConstants); if (axis->specializationConstants.axisSwapped) {//swap back for correct dispatch - uint64_t temp = axis->axisBlock[1]; + pfUINT temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 0; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h index 7f244894..f5b5e8e0 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h @@ -27,7 +27,7 @@ #include "vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_ManageLUT.h" #include "vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel2/vkFFT_R2C_even_decomposition.h" #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" -static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t inverse) { +static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, pfUINT inverse) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; @@ -43,7 +43,7 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* #endif VkFFTAxis* axis = &FFTPlan->R2Cdecomposition; axis->specializationConstants.sourceFFTSize.type = 31; - axis->specializationConstants.sourceFFTSize.data.i = (int64_t)app->configuration.size[0]; + axis->specializationConstants.sourceFFTSize.data.i = (pfINT)app->configuration.size[0]; axis->specializationConstants.numFFTdims = (int)app->configuration.FFTdim; axis->specializationConstants.warpSize = (int)app->configuration.warpSize; @@ -60,23 +60,28 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* axis->specializationConstants.maxCodeLength = (int)app->configuration.maxCodeLength; axis->specializationConstants.maxTempLength = (int)app->configuration.maxTempLength; - axis->specializationConstants.double_PI = 3.14159265358979323846264338327950288419716939937510L; - - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - axis->specializationConstants.precision = 1; - axis->specializationConstants.complexSize = (2 * sizeof(double)); + axis->specializationConstants.double_PI = pfFPinit("3.14159265358979323846264338327950288419716939937510"); + if (app->configuration.quadDoubleDoublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) { + axis->specializationConstants.precision = 3; + axis->specializationConstants.complexSize = (4 * sizeof(double)); } - else { - if (app->configuration.halfPrecision) { - axis->specializationConstants.precision = 0; - axis->specializationConstants.complexSize = (2 * sizeof(float)); + else + { + if 
(app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { + axis->specializationConstants.precision = 1; + axis->specializationConstants.complexSize = (2 * sizeof(double)); } else { - axis->specializationConstants.precision = 0; - axis->specializationConstants.complexSize = (2 * sizeof(float)); + if (app->configuration.halfPrecision) { + axis->specializationConstants.precision = 0; + axis->specializationConstants.complexSize = (2 * sizeof(float)); + } + else { + axis->specializationConstants.precision = 0; + axis->specializationConstants.complexSize = (2 * sizeof(float)); + } } } - axis->specializationConstants.complexSize = axis->specializationConstants.complexSize; axis->specializationConstants.supportAxis = 0; axis->specializationConstants.symmetricKernel = (int)app->configuration.symmetricKernel; axis->specializationConstants.conjugateConvolution = (int)app->configuration.conjugateConvolution; @@ -120,8 +125,8 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* } axis->specializationConstants.inverse = (int)inverse; - uint64_t axis_id = 0; - uint64_t axis_upload_id = 0; + pfUINT axis_id = 0; + pfUINT axis_upload_id = 0; resFFT = VkFFTConfigureDescriptorsR2CMultiUploadDecomposition(app, FFTPlan, axis, axis_id, axis_upload_id, inverse); if (resFFT != VKFFT_SUCCESS) { @@ -152,23 +157,23 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* axis->axisBlock[1] = 1; axis->axisBlock[2] = 1; - uint64_t tempSize[3] = {1, 1, 1}; + pfUINT tempSize[3] = {1, 1, 1}; for (int i = 0; i < app->configuration.FFTdim; i++){ tempSize[0] *= app->configuration.size[i]; } - tempSize[0] = (uint64_t)ceil(tempSize[0]/ (long double)(2 * axis->axisBlock[0])); + tempSize[0] = (pfUINT)pfceil(tempSize[0]/ (pfLD)(2 * axis->axisBlock[0])); tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches * app->configuration.coordinateFeatures; if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { - uint64_t temp_tempSize = tempSize[0]; + pfUINT temp_tempSize = tempSize[0]; tempSize[0] = tempSize[1]; tempSize[1] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 1; } else { if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { - uint64_t temp_tempSize = tempSize[0]; + pfUINT temp_tempSize = tempSize[0]; tempSize[0] = tempSize[2]; tempSize[2] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 2; @@ -190,34 +195,34 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 
1 : (int)app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution; - for (uint64_t i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { + for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { axis->specializationConstants.size[i].type = 31; - axis->specializationConstants.size[i].data.i = (int64_t)app->configuration.size[i]; + axis->specializationConstants.size[i].data.i = (pfINT)app->configuration.size[i]; } axis->specializationConstants.registers_per_thread = 4; axis->specializationConstants.numBatches.type = 31; - axis->specializationConstants.numBatches.data.i = (int64_t)app->configuration.numberBatches; + axis->specializationConstants.numBatches.data.i = (pfINT)app->configuration.numberBatches; if ((app->configuration.FFTdim == 1) && (app->configuration.size[1] == 1) && ((app->configuration.numberBatches == 1) && (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) { - axis->specializationConstants.numBatches.data.i = (int64_t)app->actualNumBatches; + axis->specializationConstants.numBatches.data.i = (pfINT)app->actualNumBatches; } axis->specializationConstants.numKernels.type = 31; - axis->specializationConstants.numKernels.data.i = (int64_t)app->configuration.numberKernels; + axis->specializationConstants.numKernels.data.i = (pfINT)app->configuration.numberKernels; axis->specializationConstants.sharedMemSize = (int)app->configuration.sharedMemorySize; axis->specializationConstants.sharedMemSizePow2 = (int)app->configuration.sharedMemorySizePow2; axis->specializationConstants.normalize = (int)app->configuration.normalize; axis->specializationConstants.axis_id = 0; axis->specializationConstants.axis_upload_id = 0; - for (uint64_t i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { + for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { axis->specializationConstants.frequencyZeropadding = (int)app->configuration.frequencyZeroPadding; axis->specializationConstants.performZeropaddingFull[i] = (int)app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) axis->specializationConstants.fft_zeropad_left_full[i].type = 31; - axis->specializationConstants.fft_zeropad_left_full[i].data.i = (int64_t)app->configuration.fft_zeropad_left[i]; + axis->specializationConstants.fft_zeropad_left_full[i].data.i = (pfINT)app->configuration.fft_zeropad_left[i]; axis->specializationConstants.fft_zeropad_right_full[i].type = 31; - axis->specializationConstants.fft_zeropad_right_full[i].data.i = (int64_t)app->configuration.fft_zeropad_right[i]; + axis->specializationConstants.fft_zeropad_right_full[i].data.i = (pfINT)app->configuration.fft_zeropad_right[i]; } /*if ((inverse)) { if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) { @@ -285,12 +290,12 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* axis->pushConstants.structSize += 1; } if (app->configuration.useUint64) - axis->pushConstants.structSize *= sizeof(uint64_t); + axis->pushConstants.structSize *= sizeof(pfUINT); else axis->pushConstants.structSize *= sizeof(uint32_t); axis->specializationConstants.pushConstantsStructSize = (int)axis->pushConstants.structSize; } - //uint64_t LUT = app->configuration.useLUT; + //pfUINT LUT = app->configuration.useLUT; resFFT = initMemoryParametersAPI(app, &axis->specializationConstants); if (resFFT != VKFFT_SUCCESS) { @@ -298,7 +303,7 @@ static inline VkFFTResult 
VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* return resFFT; } - uint64_t type = 0; + pfUINT type = 0; axis->specializationConstants.inputMemoryCode = axis->specializationConstants.vecTypeInputMemoryCode; switch ((axis->specializationConstants.inputMemoryCode % 100) / 10) { @@ -311,6 +316,9 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* case 2: axis->specializationConstants.inputNumberByteSize = 16; break; + case 3: + axis->specializationConstants.inputNumberByteSize = 32; + break; } axis->specializationConstants.outputMemoryCode = axis->specializationConstants.vecTypeOutputMemoryCode; @@ -324,6 +332,9 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* case 2: axis->specializationConstants.outputNumberByteSize = 16; break; + case 3: + axis->specializationConstants.outputNumberByteSize = 32; + break; } resFFT = initParametersAPI(app, &axis->specializationConstants); if (resFFT != VKFFT_SUCCESS) { diff --git a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h index 5c8c30e5..a726d8be 100644 --- a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h +++ b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h @@ -66,31 +66,36 @@ #include "Metal/Metal.hpp" #endif +#ifdef VKFFT_USE_QUADMATH_FP128 +#include +#endif //unified VkFFT container -typedef union PfData { - int64_t i; - long double d; - long double c[2]; +typedef struct PfContainer PfContainer; +typedef union PfData PfData; - char* s; +typedef union PfData { + pfINT i; // int + pfLD d; // long double + PfContainer* c; // [2] complex + PfContainer* dd; // [2] double-double __ibm128 } PfData; -typedef struct PfContainer PfContainer; struct PfContainer{ int type; // 0 - uninitialized // 1 - int, 2 - float, 3 - complex float; - // + X0: 0 - half, 1 - float, 2 - double, 3 - long double - precision identifiers (only for strings now, all number values are in max long double precision for simplicity) + // + X0: 0 - half, 1 - float, 2 - double, 3 - double-double, 4 - fp128 (if available): - precision identifiers (only for strings now, all number values are in max pfLD precision for simplicity) // 100 + X - variable name, containing same type as in X - PfData data; // memory of the container - int size; // bytes allcoated in data.s + PfData data; // memory contents of the container + char* name; // name of the container + int size; // bytes allcoated in name }; typedef struct { //WHDCN layout //required parameters: - uint64_t FFTdim; //FFT dimensionality (1, 2 or 3) - uint64_t size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD -system dimensions + pfUINT FFTdim; //FFT dimensionality (1, 2 or 3) + pfUINT size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD -system dimensions #if(VKFFT_BACKEND==0) VkPhysicalDevice* physicalDevice;//pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices @@ -98,17 +103,17 @@ typedef struct { VkQueue* queue;//pointer to Vulkan queue, created with vkGetDeviceQueue VkCommandPool* commandPool;//pointer to Vulkan command pool, created with vkCreateCommandPool VkFence* fence;//pointer to Vulkan fence, created with vkCreateFence - uint64_t isCompilerInitialized;//specify if glslang compiler has been intialized before (0 - off, 1 - on). Default 0 + pfUINT isCompilerInitialized;//specify if glslang compiler has been intialized before (0 - off, 1 - on). 
diff --git a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
index 5c8c30e5..a726d8be 100644
--- a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
+++ b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
@@ -66,31 +66,36 @@
 #include "Metal/Metal.hpp"
 #endif
+#ifdef VKFFT_USE_QUADMATH_FP128
+#include <quadmath.h>
+#endif
 //unified VkFFT container
-typedef union PfData {
-	int64_t i;
-	long double d;
-	long double c[2];
+typedef struct PfContainer PfContainer;
+typedef union PfData PfData;
 
-	char* s;
+typedef union PfData {
+	pfINT i; // int
+	pfLD d; // long double
+	PfContainer* c; // [2] complex
+	PfContainer* dd; // [2] double-double __ibm128
 } PfData;
-typedef struct PfContainer PfContainer;
 struct PfContainer{
 	int type; // 0 - uninitialized
 	// 1 - int, 2 - float, 3 - complex float;
-	// + X0: 0 - half, 1 - float, 2 - double, 3 - long double - precision identifiers (only for strings now, all number values are in max long double precision for simplicity)
+	// + X0: 0 - half, 1 - float, 2 - double, 3 - double-double, 4 - fp128 (if available): - precision identifiers (only for strings now, all number values are in max pfLD precision for simplicity)
 	// 100 + X - variable name, containing same type as in X
-	PfData data; // memory of the container
-	int size; // bytes allcoated in data.s
+	PfData data; // memory contents of the container
+	char* name; // name of the container
+	int size; // bytes allocated in name
 };
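Note: with the container now split into a numeric payload (data) and a separate name string, constants and named variables are distinguished purely by the type field. An illustrative sketch of filling a constant, following the type codes documented in the comments above; the helper and the exact field usage are assumptions for illustration, not code taken from VkFFT:

/* Assumes the PfContainer/PfData declarations above and VkFFT's pfINT typedef are in scope. */
static PfContainer makeIntConstant(pfINT value) {
	PfContainer c;
	c.type = 1;       /* 1 = int; 100 + X would mean "named variable of type X" */
	c.data.i = value; /* numeric payload lives in the PfData union */
	c.name = 0;       /* only named variables carry a string */
	c.size = 0;       /* bytes allocated in name */
	return c;
}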
 
 typedef struct {//WHDCN layout
 	//required parameters:
-	uint64_t FFTdim; //FFT dimensionality (1, 2 or 3)
-	uint64_t size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD -system dimensions
+	pfUINT FFTdim; //FFT dimensionality (1, 2 or 3)
+	pfUINT size[VKFFT_MAX_FFT_DIMENSIONS]; // WHD -system dimensions
 
 #if(VKFFT_BACKEND==0)
 	VkPhysicalDevice* physicalDevice;//pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices
@@ -98,17 +103,17 @@ typedef struct {
 	VkQueue* queue;//pointer to Vulkan queue, created with vkGetDeviceQueue
 	VkCommandPool* commandPool;//pointer to Vulkan command pool, created with vkCreateCommandPool
 	VkFence* fence;//pointer to Vulkan fence, created with vkCreateFence
-	uint64_t isCompilerInitialized;//specify if glslang compiler has been intialized before (0 - off, 1 - on). Default 0
+	pfUINT isCompilerInitialized;//specify if glslang compiler has been initialized before (0 - off, 1 - on). Default 0
 #elif(VKFFT_BACKEND==1)
 	CUdevice* device;//pointer to CUDA device, obtained from cuDeviceGet
 	//CUcontext* context;//pointer to CUDA context, obtained from cuDeviceGet
 	cudaStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
-	uint64_t num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer.
+	pfUINT num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer.
 #elif(VKFFT_BACKEND==2)
 	hipDevice_t* device;//pointer to HIP device, obtained from hipDeviceGet
 	//hipCtx_t* context;//pointer to HIP context, obtained from hipDeviceGet
 	hipStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
-	uint64_t num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer.
+	pfUINT num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 0, set to >=1 if you pass values in the stream pointer.
 #elif(VKFFT_BACKEND==3)
 	cl_platform_id* platform;//not required
 	cl_device_id* device;
@@ -124,20 +129,20 @@
 #endif
 
 	//data parameters:
-	uint64_t userTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on)
+	pfUINT userTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on)
 
-	uint64_t bufferNum;//multiple buffer sequence storage is Vulkan only. Default 1
-	uint64_t tempBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
-	uint64_t inputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled
-	uint64_t outputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled
-	uint64_t kernelNum;//multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled
+	pfUINT bufferNum;//multiple buffer sequence storage is Vulkan only. Default 1
+	pfUINT tempBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
+	pfUINT inputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled
+	pfUINT outputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled
+	pfUINT kernelNum;//multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled
 	//sizes are obligatory in Vulkan backend, optional in others
-	uint64_t* bufferSize;//array of buffers sizes in bytes
-	uint64_t* tempBufferSize;//array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
-	uint64_t* inputBufferSize;//array of input buffers sizes in bytes, if isInputFormatted is enabled
-	uint64_t* outputBufferSize;//array of output buffers sizes in bytes, if isOutputFormatted is enabled
-	uint64_t* kernelSize;//array of kernel buffers sizes in bytes, if performConvolution is enabled
+	pfUINT* bufferSize;//array of buffers sizes in bytes
+	pfUINT* tempBufferSize;//array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
+	pfUINT* inputBufferSize;//array of input buffers sizes in bytes, if isInputFormatted is enabled
+	pfUINT* outputBufferSize;//array of output buffers sizes in bytes, if isOutputFormatted is enabled
+	pfUINT* kernelSize;//array of kernel buffers sizes in bytes, if performConvolution is enabled
 
 #if(VKFFT_BACKEND==0)
 	VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations
@@ -176,130 +181,136 @@ typedef struct {
 	MTL::Buffer** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled
 	MTL::Buffer** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
 #endif
-	uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
-	uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
-	uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
-	uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
-	uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
-	uint64_t specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0
+	pfUINT bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
+	pfUINT tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
+	pfUINT inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
+	pfUINT outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
+	pfUINT kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
+	pfUINT specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0
 	//optional: (default 0 if not stated otherwise)
 #if(VKFFT_BACKEND==0)
 	VkPipelineCache* pipelineCache;//pointer to Vulkan pipeline cache
+	VkBuffer* stagingBuffer;//pointer to the user defined staging buffer (used internally for LUT data transfers)
+	VkDeviceMemory* stagingBufferMemory;//pointer to the user defined staging buffer memory, associated with the stagingBuffer (used internally for LUT data transfers)
 #endif
-	uint64_t coalescedMemory;//in bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Gonna work regardles, but if specified by user correctly, the performance will be higher.
-	uint64_t aimThreads;//aim at this many threads per block. Default 128
-	uint64_t numSharedBanks;//how many banks shared memory has. Default 32
-	uint64_t inverseReturnToInputBuffer;//return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled
-	uint64_t numberBatches;// N - used to perform multiple batches of initial data. Default 1
-	uint64_t useUint64;// use 64-bit addressing mode in generated kernels
-	uint64_t omitDimension[VKFFT_MAX_FFT_DIMENSIONS];//disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C dimension 0 for now. Doesn't work with convolutions.
-	uint64_t performBandwidthBoost;//try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise
-	uint64_t groupedBatch[VKFFT_MAX_FFT_DIMENSIONS];// try to force this many FFTs to be perfromed by one threadblock for each dimension
-
-	uint64_t doublePrecision; //perform calculations in double precision (0 - off, 1 - on).
-	uint64_t halfPrecision; //perform calculations in half precision (0 - off, 1 - on)
-	uint64_t halfPrecisionMemoryOnly; //use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out of place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on)
-	uint64_t doublePrecisionFloatMemory; //use FP64 precision for all calculations, while all memory storage is done in FP32.
-
-	uint64_t performR2C; //perform R2C/C2R decomposition (0 - off, 1 - on)
-	uint64_t performDCT; //perform DCT transformation (X - DCT type, 1-4)
-	uint64_t disableMergeSequencesR2C; //disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
-	uint64_t normalize; //normalize inverse transform (0 - off, 1 - on)
-	uint64_t disableReorderFourStep; // disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on)
-	int64_t useLUT; //switches from calculating sincos to using precomputed LUT tables (-1 - off, 0 - auto, 1 - on). Configured by initialization routine
-	int64_t useLUT_4step; //switches from calculating sincos to using precomputed LUT tables for intermediate roots of 1 in the Four-step FFT algorithm. (-1 - off, 0 - auto, 1 - on). Configured by initialization routine
-	uint64_t makeForwardPlanOnly; //generate code only for forward FFT (0 - off, 1 - on)
-	uint64_t makeInversePlanOnly; //generate code only for inverse FFT (0 - off, 1 - on)
-
-	uint64_t bufferStride[VKFFT_MAX_FFT_DIMENSIONS];//buffer strides - default set to x - x*y - x*y*z values
-	uint64_t isInputFormatted; //specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
-	uint64_t isOutputFormatted; //specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
-	uint64_t inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS];//input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values
-	uint64_t outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS];//output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values
-
-	uint64_t considerAllAxesStrided;//will create plan for nonstrided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on)
-	uint64_t keepShaderCode;//will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on)
-	uint64_t printMemoryLayout;//will print order of buffers used in shaders (0 - off, 1 - on)
-
-	uint64_t saveApplicationToString;//will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). Currently disabled in Metal backend. (0 - off, 1 - on)
-
-	uint64_t loadApplicationFromString;//will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). Currently disabled in Metal backend. (0 - off, 1 - on). Mutually exclusive with saveApplicationToString
+	pfUINT coalescedMemory;//in bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Gonna work regardless, but if specified by user correctly, the performance will be higher.
+	pfUINT aimThreads;//aim at this many threads per block. Default 128
+	pfUINT numSharedBanks;//how many banks shared memory has. Default 32
+	pfUINT inverseReturnToInputBuffer;//return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled
+	pfUINT numberBatches;// N - used to perform multiple batches of initial data. Default 1
+	pfUINT useUint64;// use 64-bit addressing mode in generated kernels
+	pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS];//disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C dimension 0 for now. Doesn't work with convolutions.
+	pfUINT performBandwidthBoost;//try to reduce coalesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise
+	pfUINT groupedBatch[VKFFT_MAX_FFT_DIMENSIONS];// try to force this many FFTs to be performed by one threadblock for each dimension
+
+	pfUINT doublePrecision; //perform calculations in double precision (0 - off, 1 - on).
+	pfUINT quadDoubleDoublePrecision; //perform calculations in double-double emulation of quad precision (0 - off, 1 - on).
+	pfUINT quadDoubleDoublePrecisionDoubleMemory; //perform calculations in double-double emulation of quad precision, while all memory storage is done in FP64.
+	pfUINT halfPrecision; //perform calculations in half precision (0 - off, 1 - on)
+	pfUINT halfPrecisionMemoryOnly; //use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out of place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on)
+	pfUINT doublePrecisionFloatMemory; //use FP64 precision for all calculations, while all memory storage is done in FP32.
+
+	pfUINT performR2C; //perform R2C/C2R decomposition (0 - off, 1 - on)
+	pfUINT performDCT; //perform DCT transformation (X - DCT type, 1-4)
+	pfUINT performDST; //perform DST transformation (X - DST type, 1-4)
+	pfUINT disableMergeSequencesR2C; //disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
+	pfUINT normalize; //normalize inverse transform (0 - off, 1 - on)
+	pfUINT disableReorderFourStep; // disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on)
+	pfINT useLUT; //switches from calculating sincos to using precomputed LUT tables (-1 - off, 0 - auto, 1 - on). Configured by initialization routine
+	pfINT useLUT_4step; //switches from calculating sincos to using precomputed LUT tables for intermediate roots of 1 in the Four-step FFT algorithm. (-1 - off, 0 - auto, 1 - on). Configured by initialization routine
+	pfUINT makeForwardPlanOnly; //generate code only for forward FFT (0 - off, 1 - on)
+	pfUINT makeInversePlanOnly; //generate code only for inverse FFT (0 - off, 1 - on)
+
+	pfUINT bufferStride[VKFFT_MAX_FFT_DIMENSIONS];//buffer strides - default set to x - x*y - x*y*z values
+	pfUINT isInputFormatted; //specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
+	pfUINT isOutputFormatted; //specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
+	pfUINT inputBufferStride[VKFFT_MAX_FFT_DIMENSIONS];//input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values
+	pfUINT outputBufferStride[VKFFT_MAX_FFT_DIMENSIONS];//output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values
+	pfUINT swapTo2Stage4Step; //specify at which number to switch from 1 upload to 2 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled.
+	pfUINT swapTo3Stage4Step; //specify at which number to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 65536
+
+	pfUINT considerAllAxesStrided;//will create plan for nonstrided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on)
+	pfUINT keepShaderCode;//will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on)
+	pfUINT printMemoryLayout;//will print order of buffers used in shaders (0 - off, 1 - on)
+
+	pfUINT saveApplicationToString;//will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). Currently disabled in Metal backend. (0 - off, 1 - on)
+
+	pfUINT loadApplicationFromString;//will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). Currently disabled in Metal backend. (0 - off, 1 - on). Mutually exclusive with saveApplicationToString
 	void* loadApplicationString;//memory binary array through which user can load VkFFT binaries, must be provided by user if loadApplicationFromString = 1. Use rb/wb flags to load/save.
 
-	uint64_t disableSetLocale;//disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0.
+	pfUINT disableSetLocale;//disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0.
 
 	//optional Bluestein optimizations: (default 0 if not stated otherwise)
-	uint64_t fixMaxRadixBluestein;//controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13.
-	uint64_t forceBluesteinSequenceSize;// force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13.
-	uint64_t useCustomBluesteinPaddingPattern;// force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc.
+	pfUINT fixMaxRadixBluestein;//controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13.
+	pfUINT forceBluesteinSequenceSize;// force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13.
+	pfUINT useCustomBluesteinPaddingPattern;// force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc.
 	// paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern.
-	uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern
-	uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern
+	pfUINT* primeSizes; // described in useCustomBluesteinPaddingPattern
+	pfUINT* paddedSizes; // described in useCustomBluesteinPaddingPattern
 
-	uint64_t fixMinRaderPrimeMult;//start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader.
-	uint64_t fixMaxRaderPrimeMult;//switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40)
+	pfUINT fixMinRaderPrimeMult;//start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader.
+	pfUINT fixMaxRaderPrimeMult;//switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40)
 
-	uint64_t fixMinRaderPrimeFFT;//start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Deafult 29 on AMD and 17 on other GPUs.
-	uint64_t fixMaxRaderPrimeFFT;//switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory.
+	pfUINT fixMinRaderPrimeFFT;//start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Default 29 on AMD and 17 on other GPUs.
+	pfUINT fixMaxRaderPrimeFFT;//switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory.
 
 	//optional zero padding control parameters: (default 0 if not stated otherwise)
-	uint64_t performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on)
-	uint64_t fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS];//specify start boundary of zero block in the system for each axis
-	uint64_t fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS];//specify end boundary of zero block in the system for each axis
-	uint64_t frequencyZeroPadding; //set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding
+	pfUINT performZeropadding[VKFFT_MAX_FFT_DIMENSIONS]; // don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on)
+	pfUINT fft_zeropad_left[VKFFT_MAX_FFT_DIMENSIONS];//specify start boundary of zero block in the system for each axis
+	pfUINT fft_zeropad_right[VKFFT_MAX_FFT_DIMENSIONS];//specify end boundary of zero block in the system for each axis
+	pfUINT frequencyZeroPadding; //set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding
 
 	//optional convolution control parameters: (default 0 if not stated otherwise)
-	uint64_t performConvolution; //perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter
-	uint64_t conjugateConvolution;//0 off, 1 - conjugation of the sequence FFT is currently done on, 2 - conjugation of the convolution kernel
-	uint64_t crossPowerSpectrumNormalization;//normalize the FFT x kernel multiplication in frequency domain
-	uint64_t coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of vector
-	uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
-	uint64_t symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric
-	uint64_t numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output
-	uint64_t kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation
+	pfUINT performConvolution; //perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter
+	pfUINT conjugateConvolution;//0 off, 1 - conjugation of the sequence FFT is currently done on, 2 - conjugation of the convolution kernel
+	pfUINT crossPowerSpectrumNormalization;//normalize the FFT x kernel multiplication in frequency domain
+	pfUINT coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of vector
+	pfUINT matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
+	pfUINT symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric
+	pfUINT numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output
+	pfUINT kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation
 
 	//register overutilization (experimental): (default 0 if not stated otherwise)
-	uint64_t registerBoost; //specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Default 1
-	uint64_t registerBoostNonPow2; //specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on)
-	uint64_t registerBoost4Step; //specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1
+	pfUINT registerBoost; //specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Default 1
+	pfUINT registerBoostNonPow2; //specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on)
+	pfUINT registerBoost4Step; //specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1
 
 	//not used techniques:
-	uint64_t swapTo3Stage4Step; //specify at which number to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 131072
-	uint64_t devicePageSize;//in KB, the size of a page on the GPU. Setting to 0 disables local buffer split in pages
-	uint64_t localPageSize;//in KB, the size to split page into if sequence spans multiple devicePageSize pages
+	pfUINT devicePageSize;//in KB, the size of a page on the GPU. Setting to 0 disables local buffer split in pages
+	pfUINT localPageSize;//in KB, the size to split page into if sequence spans multiple devicePageSize pages
 
 	//automatically filled based on device info (still can be reconfigured by user):
-	uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device
-	uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device
-	uint64_t maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits
-	uint64_t maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits
-	uint64_t maxThreadsNum; //max number of threads from VkPhysicalDeviceLimits
-	uint64_t sharedMemorySizeStatic; //available for static allocation shared memory size, in bytes
-	uint64_t sharedMemorySize; //available for allocation shared memory size, in bytes
-	uint64_t sharedMemorySizePow2; //power of 2 which is less or equal to sharedMemorySize, in bytes
-	uint64_t warpSize; //number of threads per warp/wavefront.
-	uint64_t halfThreads;//Intel fix
-	uint64_t allocateTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated
-	uint64_t reorderFourStep; // unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1.
-	int64_t maxCodeLength; //specify how big can be buffer used for code generation (in char). Default 4000000 chars.
-	int64_t maxTempLength; //specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number.
-	uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern
-	uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow
-	uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc.
+	pfUINT computeCapabilityMajor; // CUDA/HIP compute capability of the device
+	pfUINT computeCapabilityMinor; // CUDA/HIP compute capability of the device
+	pfUINT maxComputeWorkGroupCount[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits
+	pfUINT maxComputeWorkGroupSize[VKFFT_MAX_FFT_DIMENSIONS]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits
+	pfUINT maxThreadsNum; //max number of threads from VkPhysicalDeviceLimits
+	pfUINT sharedMemorySizeStatic; //available for static allocation shared memory size, in bytes
+	pfUINT sharedMemorySize; //available for allocation shared memory size, in bytes
+	pfUINT sharedMemorySizePow2; //power of 2 which is less or equal to sharedMemorySize, in bytes
+	pfUINT warpSize; //number of threads per warp/wavefront.
+	pfUINT halfThreads;//Intel fix
+	pfUINT allocateTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated
+	pfUINT reorderFourStep; // unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1.
+	pfINT maxCodeLength; //specify how big can be buffer used for code generation (in char). Default 4000000 chars.
+	pfINT maxTempLength; //specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number.
+	pfUINT autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern
+	pfUINT useRaderUintLUT; // allocate additional LUT to store g_pow
+	pfUINT vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc.
 #if(VKFFT_BACKEND==0)
 	VkDeviceMemory tempBufferDeviceMemory;//Filled at app creation
 	VkCommandBuffer* commandBuffer;//Filled at app execution
 	VkMemoryBarrier* memory_barrier;//Filled at app creation
 #elif(VKFFT_BACKEND==1)
 	cudaEvent_t* stream_event;//Filled at app creation
-	uint64_t streamCounter;//Filled at app creation
-	uint64_t streamID;//Filled at app creation
+	pfUINT streamCounter;//Filled at app creation
+	pfUINT streamID;//Filled at app creation
 #elif(VKFFT_BACKEND==2)
 	hipEvent_t* stream_event;//Filled at app creation
-	uint64_t streamCounter;//Filled at app creation
-	uint64_t streamID;//Filled at app creation
-	int64_t useStrict32BitAddress; // guarantee 32 bit addresses in bytes instead of number of elements. This results in fewer instructions generated. -1: Disable, 0: Infer based on size, 1: enable. Has no effect with useUint64.
+	pfUINT streamCounter;//Filled at app creation
+	pfUINT streamID;//Filled at app creation
+	pfINT useStrict32BitAddress; // guarantee 32 bit addresses in bytes instead of number of elements. This results in fewer instructions generated. -1: Disable, 0: Infer based on size, 1: enable. Has no effect with useUint64.
 #elif(VKFFT_BACKEND==3)
 	cl_command_queue* commandQueue;
 #elif(VKFFT_BACKEND==4)
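Note: taken together, the fields added in this hunk are the user-facing switch for the new quad (double-double) mode, alongside the existing doublePrecision/halfPrecision options. A hedged usage sketch — backend handles and buffer setup are omitted, and the call assumes VkFFT's usual initializeVkFFT entry point:

#include "vkFFT.h"

/* Sketch: plan a 1D FFT computed in double-double "quad" precision. */
VkFFTResult planQuadFFT(VkFFTApplication* app)
{
	VkFFTConfiguration configuration = {0};
	configuration.FFTdim = 1;
	configuration.size[0] = 4096;
	configuration.numberBatches = 1;
	configuration.quadDoubleDoublePrecision = 1; /* compute in double-double emulation of quad */
	/* configuration.quadDoubleDoublePrecisionDoubleMemory = 1; would keep FP64 storage instead,
	   mirroring what doublePrecisionFloatMemory does for the FP64/FP32 pair. */
	/* device/queue/stream and buffer pointers must still be filled in per backend. */
	return initializeVkFFT(app, configuration);
}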
@@ -358,11 +369,11 @@ typedef struct {
 	MTL::Buffer** kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled
 #endif
 	//following parameters can be specified during kernels launch, if specifyOffsetsAtLaunch parameter was enabled during the initializeVkFFT call
-	uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
-	uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
-	uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
-	uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
-	uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
+	pfUINT bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
+	pfUINT tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
+	pfUINT inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
+	pfUINT outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
+	pfUINT kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
 } VkFFTLaunchParams;//parameters specified at plan execution
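Note: the launch-time offsets above only take effect when specifyOffsetsAtLaunch was set in the configuration. A sketch of the launch path, assuming the usual VkFFTAppend(app, inverse, launchParams) entry point; command-buffer/queue handling is backend-specific and omitted:

#include "vkFFT.h"

/* Sketch: append a forward FFT that starts byteOffset bytes into the main buffer. */
VkFFTResult runForwardWithOffset(VkFFTApplication* app, pfUINT byteOffset)
{
	VkFFTLaunchParams launchParams = {0};
	/* The backend-specific members (e.g. the Vulkan command buffer) must also be set here. */
	launchParams.bufferOffset = byteOffset; /* in bytes, like the fields above */
	launchParams.inputBufferOffset = 0;
	launchParams.outputBufferOffset = 0;
	return VkFFTAppend(app, -1, &launchParams); /* -1 = forward, 1 = inverse */
}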
 typedef enum VkFFTResult {
 	VKFFT_SUCCESS = 0,
@@ -373,6 +384,7 @@ typedef enum VkFFTResult {
 	VKFFT_ERROR_NULL_TEMP_PASSED = 5,
 	VKFFT_ERROR_MATH_FAILED = 6,
 	VKFFT_ERROR_FFTdim_GT_MAX_FFT_DIMENSIONS = 7,
+	VKFFT_ERROR_NONZERO_APP_INITIALIZATION = 8,
 	VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001,
 	VKFFT_ERROR_INVALID_DEVICE = 1002,
 	VKFFT_ERROR_INVALID_QUEUE = 1003,
@@ -398,10 +410,11 @@ typedef enum VkFFTResult {
 	VKFFT_ERROR_EMPTY_kernel = 2012,
 	VKFFT_ERROR_EMPTY_applicationString = 2013,
 	VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays = 2014,
+	VKFFT_ERROR_EMPTY_app = 2015,
 	VKFFT_ERROR_UNSUPPORTED_RADIX = 3001,
 	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002,
 	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,
-	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004,
+	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004,
 	VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005,
 	VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001,
 	VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002,
@@ -480,6 +493,8 @@ static inline const char* getVkFFTErrorString(VkFFTResult result)
 		return "VKFFT_ERROR_MATH_FAILED";
 	case VKFFT_ERROR_FFTdim_GT_MAX_FFT_DIMENSIONS:
 		return "VKFFT_ERROR_FFTdim_GT_MAX_FFT_DIMENSIONS";
+	case VKFFT_ERROR_NONZERO_APP_INITIALIZATION:
+		return "VKFFT_ERROR_NONZERO_APP_INITIALIZATION";
 	case VKFFT_ERROR_INVALID_PHYSICAL_DEVICE:
 		return "VKFFT_ERROR_INVALID_PHYSICAL_DEVICE";
 	case VKFFT_ERROR_INVALID_DEVICE:
@@ -530,14 +545,16 @@ static inline const char* getVkFFTErrorString(VkFFTResult result)
 		return "VKFFT_ERROR_EMPTY_applicationString";
 	case VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays:
 		return "VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays";
+	case VKFFT_ERROR_EMPTY_app:
+		return "VKFFT_ERROR_EMPTY_app";
 	case VKFFT_ERROR_UNSUPPORTED_RADIX:
 		return "VKFFT_ERROR_UNSUPPORTED_RADIX";
 	case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH:
 		return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH";
 	case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C:
 		return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C";
-	case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT:
-		return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT";
+	case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R:
+		return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R";
 	case VKFFT_ERROR_UNSUPPORTED_FFT_OMIT:
 		return "VKFFT_ERROR_UNSUPPORTED_FFT_OMIT";
 	case VKFFT_ERROR_FAILED_TO_ALLOCATE:
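Note: the new VKFFT_ERROR_NONZERO_APP_INITIALIZATION and VKFFT_ERROR_EMPTY_app codes and their strings above slot into the usual error-reporting pattern. A small sketch, assuming initializeVkFFT as the entry point:

#include <stdio.h>
#include "vkFFT.h"

int createPlanOrReport(VkFFTApplication* app, VkFFTConfiguration configuration)
{
	/* app should be zero-initialized before the first call; judging by its name,
	   a non-zero struct is presumably what VKFFT_ERROR_NONZERO_APP_INITIALIZATION flags. */
	VkFFTResult resFFT = initializeVkFFT(app, configuration);
	if (resFFT != VKFFT_SUCCESS) {
		fprintf(stderr, "VkFFT error %d: %s\n", (int)resFFT, getVkFFTErrorString(resFFT));
		return -1;
	}
	return 0;
}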
@@ -683,9 +700,9 @@ struct VkFFTRaderContainer {
 	int containerFFTDim;
 	int containerFFTNum;
 	int subLogicalGroupSizeMax;//how many threads are needed per Rader transform
-	int64_t RaderKernelOffsetLUT;
-	int64_t RaderRadixOffsetLUT;
-	int64_t RaderRadixOffsetLUTiFFT;
+	pfINT RaderKernelOffsetLUT;
+	pfINT RaderRadixOffsetLUT;
+	pfINT RaderRadixOffsetLUTiFFT;
 	PfContainer g_powConstantStruct;
 	PfContainer r_rader_kernelConstantStruct;
 	PfContainer i_rader_kernelConstantStruct;
@@ -696,7 +713,7 @@ struct VkFFTRaderContainer {
 typedef struct {
 	VkFFTResult res;
-	long double double_PI;
+	pfLD double_PI;
 	int numFFTdims;
 	PfContainer size[VKFFT_MAX_FFT_DIMENSIONS];
 	PfContainer localSize[3];
@@ -733,6 +750,7 @@ typedef struct {
 	int performR2C;
 	int performR2CmultiUpload;
 	int performDCT;
+	int performDST;
 	int performBandwidthBoost;
 	int frequencyZeropadding;
 	int performZeropaddingFull[VKFFT_MAX_FFT_DIMENSIONS]; // don't do read/write if full sequence is omitted
@@ -761,17 +779,18 @@ typedef struct {
 	PfContainer kernelOffset;
 	PfContainer outputOffset;
 	int reorderFourStep;
+	int storeSharedComplexComponentsSeparately;
 	int pushConstantsStructSize;
 	int performWorkGroupShift[VKFFT_MAX_FFT_DIMENSIONS];
 	int performPostCompilationInputOffset;
 	int performPostCompilationOutputOffset;
 	int performPostCompilationKernelOffset;
-	uint64_t inputBufferBlockNum;
-	uint64_t inputBufferBlockSize;
-	uint64_t outputBufferBlockNum;
-	uint64_t outputBufferBlockSize;
-	uint64_t kernelBlockNum;
-	uint64_t kernelBlockSize;
+	pfUINT inputBufferBlockNum;
+	pfUINT inputBufferBlockSize;
+	pfUINT outputBufferBlockNum;
+	pfUINT outputBufferBlockSize;
+	pfUINT kernelBlockNum;
+	pfUINT kernelBlockSize;
 	int numCoordinates;
 	int matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
 	PfContainer numBatches;
@@ -844,7 +863,7 @@ typedef struct {
 	int performBufferSetUpdate;
 	int useUint64;
 #if(VKFFT_BACKEND==2)
-	int64_t useStrict32BitAddress;
+	pfINT useStrict32BitAddress;
 #endif
 	int disableSetLocale;
@@ -861,6 +880,7 @@ typedef struct {
 	PfContainer LUTId;
 	PfContainer raderIDx;
 	PfContainer raderIDx2;
+	PfContainer offsetImaginaryShared;
 	PfContainer gl_LocalInvocationID_x;
 	PfContainer gl_LocalInvocationID_y;
 	PfContainer gl_LocalInvocationID_z;
@@ -878,10 +898,12 @@ typedef struct {
 	PfContainer halfDef;
 	PfContainer floatDef;
 	PfContainer doubleDef;
+	PfContainer quadDef;
 
 	PfContainer half2Def;
 	PfContainer float2Def;
 	PfContainer double2Def;
+	PfContainer quad2Def;
 
 	PfContainer halfLiteral;
 	PfContainer floatLiteral;
@@ -923,6 +945,12 @@ typedef struct {
 	PfContainer tempInt;
 	PfContainer tempInt2;
 	PfContainer tempFloat;
+
+	PfContainer tempQuad;
+	PfContainer tempQuad2;
+	PfContainer tempQuad3;
+	PfContainer tempIntQuad;
+
 	PfContainer w;
 	PfContainer iw;
 	PfContainer angle;
@@ -931,11 +959,11 @@ typedef struct {
 	PfContainer locID[33];
 	char* code0;
 	char* tempStr;
-	int64_t tempLen;
-	int64_t currentLen;
-	int64_t currentTempLen;
-	int64_t maxCodeLength;
-	int64_t maxTempLength;
+	pfINT tempLen;
+	pfINT currentLen;
+	pfINT currentTempLen;
+	pfINT maxCodeLength;
+	pfINT maxTempLength;
 	int dataTypeSize;
 	PfContainer LFending;
@@ -974,7 +1002,7 @@ typedef struct {
 	PfContainer oldLocale;
 
-	int64_t id;
+	pfINT id;
 } VkFFTSpecializationConstantsLayout;
 
 typedef struct {
@@ -983,28 +1011,28 @@ typedef struct {
 	MTL::Buffer* dataUintBuffer;
 #endif
 	//specify what can be in layout
-	uint64_t performWorkGroupShift[VKFFT_MAX_FFT_DIMENSIONS];
-	uint64_t workGroupShift[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT performWorkGroupShift[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT workGroupShift[VKFFT_MAX_FFT_DIMENSIONS];
 
-	uint64_t performPostCompilationInputOffset;
-	uint64_t inputOffset;
+	pfUINT performPostCompilationInputOffset;
+	pfUINT inputOffset;
 
-	uint64_t performPostCompilationOutputOffset;
-	uint64_t outputOffset;
+	pfUINT performPostCompilationOutputOffset;
+	pfUINT outputOffset;
 
-	uint64_t performPostCompilationKernelOffset;
-	uint64_t kernelOffset;
+	pfUINT performPostCompilationKernelOffset;
+	pfUINT kernelOffset;
 
-	uint64_t structSize;
+	pfUINT structSize;
 } VkFFTPushConstantsLayout;
 
 typedef struct {
-	uint64_t numBindings;
-	uint64_t axisBlock[4];
-	uint64_t groupedBatch;
+	pfUINT numBindings;
+	pfUINT axisBlock[4];
+	pfUINT groupedBatch;
 	VkFFTSpecializationConstantsLayout specializationConstants;
 	VkFFTPushConstantsLayout pushConstants;
-	uint64_t updatePushConstants;
+	pfUINT updatePushConstants;
 	char VkFFTFunctionName[50];
 #if(VKFFT_BACKEND==0)
 	VkBuffer* inputBuffer;
@@ -1072,21 +1100,21 @@
 #endif
 	void* binary;
-	uint64_t binarySize;
+	pfUINT binarySize;
 
-	uint64_t bufferLUTSize;
-	uint64_t bufferRaderUintLUTSize;
-	uint64_t referenceLUT;
+	pfUINT bufferLUTSize;
+	pfUINT bufferRaderUintLUTSize;
+	pfUINT referenceLUT;
 } VkFFTAxis;
 
 typedef struct {
-	uint64_t actualFFTSizePerAxis[VKFFT_MAX_FFT_DIMENSIONS][VKFFT_MAX_FFT_DIMENSIONS];
-	uint64_t numAxisUploads[VKFFT_MAX_FFT_DIMENSIONS];
-	uint64_t axisSplit[VKFFT_MAX_FFT_DIMENSIONS][4];
+	pfUINT actualFFTSizePerAxis[VKFFT_MAX_FFT_DIMENSIONS][VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT numAxisUploads[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT axisSplit[VKFFT_MAX_FFT_DIMENSIONS][4];
 	VkFFTAxis axes[VKFFT_MAX_FFT_DIMENSIONS][4];
 
-	uint64_t multiUploadR2C;
-	uint64_t actualPerformR2CPerAxis[VKFFT_MAX_FFT_DIMENSIONS]; // automatically specified, shows if R2C is actually performed or inside FFT or as a separate step
+	pfUINT multiUploadR2C;
+	pfUINT actualPerformR2CPerAxis[VKFFT_MAX_FFT_DIMENSIONS]; // automatically specified, shows if R2C is actually performed or inside FFT or as a separate step
 	VkFFTAxis R2Cdecomposition;
 	VkFFTAxis inverseBluesteinAxes[VKFFT_MAX_FFT_DIMENSIONS][4];
 } VkFFTPlan;
@@ -1095,11 +1123,11 @@ typedef struct {
 	VkFFTPlan* localFFTPlan;
 	VkFFTPlan* localFFTPlan_inverse; //additional inverse plan
 
-	uint64_t actualNumBatches;
-	uint64_t firstAxis;
-	uint64_t lastAxis;
+	pfUINT actualNumBatches;
+	pfUINT firstAxis;
+	pfUINT lastAxis;
 	//Bluestein buffers reused among plans
-	uint64_t useBluesteinFFT[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT useBluesteinFFT[VKFFT_MAX_FFT_DIMENSIONS];
 #if(VKFFT_BACKEND==0)
 	VkDeviceMemory bufferRaderUintLUTDeviceMemory[VKFFT_MAX_FFT_DIMENSIONS][4];
 	VkBuffer bufferRaderUintLUT[VKFFT_MAX_FFT_DIMENSIONS][4];
@@ -1135,20 +1163,20 @@ typedef struct {
 	MTL::Buffer* bufferBluesteinFFT[VKFFT_MAX_FFT_DIMENSIONS];
 	MTL::Buffer* bufferBluesteinIFFT[VKFFT_MAX_FFT_DIMENSIONS];
 #endif
-	uint64_t bufferRaderUintLUTSize[VKFFT_MAX_FFT_DIMENSIONS][4];
-	uint64_t bufferBluesteinSize[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT bufferRaderUintLUTSize[VKFFT_MAX_FFT_DIMENSIONS][4];
+	pfUINT bufferBluesteinSize[VKFFT_MAX_FFT_DIMENSIONS];
 	void* applicationBluesteinString[VKFFT_MAX_FFT_DIMENSIONS];
-	uint64_t applicationBluesteinStringSize[VKFFT_MAX_FFT_DIMENSIONS];
+	pfUINT applicationBluesteinStringSize[VKFFT_MAX_FFT_DIMENSIONS];
 
-	uint64_t numRaderFFTPrimes;
-	uint64_t rader_primes[30];
-	uint64_t rader_buffer_size[30];
+	pfUINT numRaderFFTPrimes;
+	pfUINT rader_primes[30];
+	pfUINT rader_buffer_size[30];
 	void* raderFFTkernel[30];
-	uint64_t applicationStringOffsetRader;
+	pfUINT applicationStringOffsetRader;
 
-	uint64_t currentApplicationStringPos;
+	pfUINT currentApplicationStringPos;
 
-	uint64_t applicationStringSize;//size of saveApplicationString in bytes
+	pfUINT applicationStringSize;//size of saveApplicationString in bytes
 	void* saveApplicationString;//memory array(uint32_t* for Vulkan, char* for CUDA/HIP/OpenCL) through which user can access VkFFT generated binaries. (will be allocated by VkFFT, deallocated with deleteVkFFT call)
 } VkFFTApplication;
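Note: the applicationStringSize / saveApplicationString members at the end of VkFFTApplication pair with the saveApplicationToString and loadApplicationFromString switches in the configuration. A sketch of caching the compiled kernels to disk with them — error handling is trimmed, and the flow assumes initializeVkFFT has already been called with saveApplicationToString = 1:

#include <stdio.h>
#include "vkFFT.h"

/* Sketch: dump the generated binaries so a later run can load them through
   configuration.loadApplicationFromString / configuration.loadApplicationString. */
void saveKernelCache(VkFFTApplication* app, const char* path)
{
	FILE* f = fopen(path, "wb"); /* the comments above say to use wb/rb */
	if (!f) return;
	fwrite(app->saveApplicationString, 1, (size_t)app->applicationStringSize, f);
	fclose(f);
}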