From 4f8acdaa98bcbb21c8950499fd2be5211bc39cb5 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Fri, 15 Nov 2024 14:27:11 +0200
Subject: [PATCH 01/15] Add initial support for TensorQ8 in the front-end of
 the API

---
 tornado-api/src/main/java/module-info.java    |   1 +
 .../tornado/api/types/tensors/Float16.java    |   7 ++
 .../tornado/api/types/tensors/GGMLType.java   |  66 +++++++++++
 .../tornado/api/types/tensors/Shape.java      |   3 +-
 .../tornado/api/types/tensors/TensorQ8.java   | 105 ++++++++++++++++++
 5 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
 create mode 100644 tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
 create mode 100644 tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java

diff --git a/tornado-api/src/main/java/module-info.java b/tornado-api/src/main/java/module-info.java
index 3eb1b3838c..f1a7686948 100644
--- a/tornado-api/src/main/java/module-info.java
+++ b/tornado-api/src/main/java/module-info.java
@@ -16,6 +16,7 @@
  *
  */
 module tornado.api {
+    requires jdk.unsupported;
     exports uk.ac.manchester.tornado.api;
     exports uk.ac.manchester.tornado.api.annotations;
     exports uk.ac.manchester.tornado.api.common;
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
new file mode 100644
index 0000000000..be74aa2965
--- /dev/null
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
@@ -0,0 +1,7 @@
+package uk.ac.manchester.tornado.api.types.tensors;
+
+
+public final class Float16 {
+    public static final int BYTES = 2;
+}
+
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
new file mode 100644
index 0000000000..8811de6914
--- /dev/null
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
@@ -0,0 +1,66 @@
+package uk.ac.manchester.tornado.api.types.tensors;
+
+public enum GGMLType {
+    F32(Float.BYTES),
+    F16(Float16.BYTES),
+    Q4_0(Float16.BYTES + 16 * Byte.BYTES, 32),
+    Q4_1(2 * Float16.BYTES + 16 * Byte.BYTES, 32),
+    UNSUPPORTED_Q4_2(Integer.MAX_VALUE), // support has been removed
+    UNSUPPORTED_Q4_3(Integer.MAX_VALUE), // support has been removed
+    Q5_0(Integer.MAX_VALUE),
+    Q5_1(Integer.MAX_VALUE),
+    Q8_0(Float16.BYTES + 32 * Byte.BYTES, 32),
+    Q8_1(32 * Byte.BYTES + 2 * Float.BYTES, 32),
+    // k-quantizations
+    Q2_K(Integer.MAX_VALUE),
+    Q3_K(Integer.MAX_VALUE),
+    Q4_K(2 * Float16.BYTES + ((GGMLType.QK_K / 16) / 8 * 6) + GGMLType.QK_K / 2, GGMLType.QK_K),
+    Q5_K(2 * Float16.BYTES + ((GGMLType.QK_K / 16) / 8 * 6) + GGMLType.QK_K / 8 + GGMLType.QK_K / 2, GGMLType.QK_K),
+    Q6_K(GGMLType.QK_K / 2 + GGMLType.QK_K / 4 + GGMLType.QK_K / 16 + Float16.BYTES, GGMLType.QK_K),
+    Q8_K(Integer.MAX_VALUE),
+    I8(Byte.BYTES),
+    I16(Short.BYTES),
+    I32(Integer.BYTES);
+
+    private static final GGMLType[] VALUES = values();
+
+    private final int typeSize;
+
+    private final int blockSize;
+
+    public int getTypeSize() {
+        return typeSize;
+    }
+
+    public int getBlockSize() {
+        return blockSize;
+    }
+
+    public static GGMLType fromId(int id) {
+        return VALUES[id];
+    }
+
+    GGMLType(int typeSize) {
+        this(typeSize, 1);
+    }
+
+    public long byteSizeFor(int numberOfElements) {
+        long t = numberOfElements * (long) getTypeSize();
+        assert t % getBlockSize() == 0;
+        return Math.toIntExact(t / getBlockSize());
+    }
+
+    public static final int QK_K = 256; // or 64?
+
+    GGMLType(int typeSize, int blockSize) {
+        assert blockSize > 0;
+        assert typeSize > 0;
+        assert isPowerOf2(blockSize);
+        this.typeSize = typeSize;
+        this.blockSize = blockSize;
+    }
+
+    private static boolean isPowerOf2(int n) {
+        return n > 0 && (n & (n - 1)) == 0;
+    }
+}
\ No newline at end of file
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Shape.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Shape.java
index a678cd01d5..d8138651cf 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Shape.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Shape.java
@@ -45,7 +45,8 @@ public long[] getDimensions() {
      * @return the total size of the shape as an int
      */
     public int getSize() {
-        return (int) Arrays.stream(dimensions).reduce(1, (a, b) -> a * b);
+        assert Arrays.stream(dimensions).allMatch(i -> i > 0);
+        return (int) Arrays.stream(dimensions).reduce(Math::multiplyExact).orElseThrow();
     }
 
     @Override
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
new file mode 100644
index 0000000000..4d17b5cab6
--- /dev/null
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -0,0 +1,105 @@
+package uk.ac.manchester.tornado.api.types.tensors;
+
+import sun.misc.Unsafe;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.LongArray;
+
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.lang.reflect.Field;
+
+public class TensorQ8 extends Tensor {
+    private final DType dType;
+    private final Shape shape;
+
+    private final HalfFloatArray tensorStorage;
+
+    private int numberOfElements;
+
+
+    public TensorQ8(Shape shape) {
+        super(DType.HALF_FLOAT, shape);
+        this.shape = shape;
+        this.numberOfElements = shape.getSize();
+        this.dType = DType.HALF_FLOAT;
+        this.tensorStorage = new HalfFloatArray(numberOfElements);
+    }
+
+
+    public TensorQ8(int size, MemorySegment memorySegment) {
+        super(DType.HALF_FLOAT, new Shape(size));
+        this.dType = DType.HALF_FLOAT;
+        this.shape = new Shape(size);
+        this.numberOfElements = size;
+        this.tensorStorage = HalfFloatArray.fromSegment(memorySegment);
+    }
+
+
+    static short readShort(MemorySegment memorySegment, long offset) {
+        return memorySegment.get(ValueLayout.JAVA_SHORT, memorySegment.address()+offset);
+    }
+
+    static byte readByte(MemorySegment memorySegment, long offset) {
+        return memorySegment.get(ValueLayout.JAVA_BYTE, memorySegment.address()+offset);
+    }
+
+    public float getFloat(int index) {
+        assert 0 <= index && index < numberOfElements;
+        int blockIndex = index / GGMLType.Q8_0.getBlockSize();
+        int withinBlockIndex = index % GGMLType.Q8_0.getBlockSize();
+        int blockOffset = blockIndex * GGMLType.Q8_0.getTypeSize();
+        byte quant = readByte(tensorStorage.getSegment(), blockOffset + Float16.BYTES + withinBlockIndex);
+        float scale = Float.float16ToFloat(readShort(tensorStorage.getSegment(), blockOffset));
+        return quant * scale;
+    }
+
+    @Override
+    public Shape getShape() {
+        return null;
+    }
+
+    @Override
+    public String getDTypeAsString() {
+        return "";
+    }
+
+    @Override
+    public DType getDType() {
+        return null;
+    }
+
+    @Override
+    public int getSize() {
+        return 0;
+    }
+
+    @Override
+    public MemorySegment getSegment() {
+        return null;
+    }
+
+    @Override
+    public MemorySegment getSegmentWithHeader() {
+        return null;
+    }
+
+    @Override
+    public long getNumBytesOfSegmentWithHeader() {
+        return 0;
+    }
+
+    @Override
+    public long getNumBytesOfSegment() {
+        return 0;
+    }
+
+    @Override
+    protected void clear() {
+
+    }
+
+    @Override
+    public int getElementSize() {
+        return 0;
+    }
+}

From 435cadc0afb5fb7de5a99d8f8025c713173d6e64 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Fri, 15 Nov 2024 15:04:15 +0200
Subject: [PATCH 02/15] WIP

---
 .../tornado/api/types/tensors/TensorQ8.java    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 4d17b5cab6..56b1f9cd7a 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -48,14 +48,14 @@ public float getFloat(int index) {
         int blockIndex = index / GGMLType.Q8_0.getBlockSize();
         int withinBlockIndex = index % GGMLType.Q8_0.getBlockSize();
         int blockOffset = blockIndex * GGMLType.Q8_0.getTypeSize();
-        byte quant = readByte(tensorStorage.getSegment(), blockOffset + Float16.BYTES + withinBlockIndex);
+        byte quant = readBy te(tensorStorage.getSegment(), blockOffset + Float16.BYTES + withinBlockIndex);
         float scale = Float.float16ToFloat(readShort(tensorStorage.getSegment(), blockOffset));
         return quant * scale;
     }
 
     @Override
     public Shape getShape() {
-        return null;
+        return shape;
     }
 
     @Override
@@ -65,32 +65,32 @@ public String getDTypeAsString() {
 
     @Override
     public DType getDType() {
-        return null;
+        return dType;
     }
 
     @Override
     public int getSize() {
-        return 0;
+        return numberOfElements;
     }
 
     @Override
     public MemorySegment getSegment() {
-        return null;
+        return tensorStorage.getSegment();
     }
 
     @Override
     public MemorySegment getSegmentWithHeader() {
-        return null;
+        return tensorStorage.getSegmentWithHeader();
     }
 
     @Override
     public long getNumBytesOfSegmentWithHeader() {
-        return 0;
+        return tensorStorage.getNumBytesOfSegmentWithHeader();
     }
 
     @Override
     public long getNumBytesOfSegment() {
-        return 0;
+        return tensorStorage.getNumBytesOfSegment();
     }
 
     @Override
@@ -100,6 +100,6 @@ protected void clear() {
 
     @Override
     public int getElementSize() {
-        return 0;
+        return numberOfElements;
     }
 }

From 60abfbd08f286e437bdf2d1ee5b65c1f09fabb3d Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 18:51:55 +0200
Subject: [PATCH 03/15] Add working version of TensorQ8 with validated
 precision

---
 .../tornado/api/types/arrays/ByteArray.java   |  26 ++
 .../tornado/api/types/tensors/TensorQ8.java   | 173 ++++++++--
 .../unittests/tensors/TestTensorQ8.java       | 312 ++++++++++++++++++
 3 files changed, 479 insertions(+), 32 deletions(-)
 create mode 100644 tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
index 213f68fb2d..ab5f868da7 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
@@ -27,6 +27,7 @@
 
 import uk.ac.manchester.tornado.api.annotations.Parallel;
 import uk.ac.manchester.tornado.api.internal.annotations.SegmentElementSize;
+import uk.ac.manchester.tornado.api.types.tensors.GGMLType;
 
 /**
  * This class represents an array of bytes stored in native memory.
@@ -61,6 +62,23 @@ public ByteArray(int numberOfElements) {
         segment.setAtIndex(JAVA_INT, 0, numberOfElements);
     }
 
+    public ByteArray(int numberOfElements, boolean noHeader) {
+        this.numberOfElements = numberOfElements;
+        baseIndex=0;
+        segmentByteSize = numberOfElements * BYTE_BYTES;
+        segment = Arena.ofAuto().allocate(segmentByteSize, 1);
+//        segment.setAtIndex(JAVA_INT, 0, numberOfElements);
+    }
+
+
+    public ByteArray(int numberOfElements, long requiredStorageSize) {
+        this.numberOfElements = numberOfElements;
+        baseIndex=0;
+//        segmentByteSize = numberOfElements * BYTE_BYTES;
+        segment = Arena.ofAuto().allocate(requiredStorageSize, 1);
+        //        segment.setAtIndex(JAVA_INT, 0, numberOfElements);
+    }
+
     /**
      * Constructs a new {@link ByteArray} instance by concatenating the contents of the given array of {@link ByteArray} instances.
      *
@@ -123,6 +141,14 @@ public static ByteArray fromSegment(MemorySegment segment) {
         return byteArray;
     }
 
+    public static ByteArray fromSegment(MemorySegment segment, boolean noHeader) {
+        long byteSize = segment.byteSize();
+        int numElements = (int) (byteSize / BYTE_BYTES);
+        ByteArray byteArray = new ByteArray(numElements, noHeader);
+        MemorySegment.copy(segment, 0, byteArray.segment, byteArray.baseIndex * BYTE_BYTES, byteSize);
+        return byteArray;
+    }
+
     /**
      * Creates a new instance of the {@link ByteArray} class from a {@link ByteBuffer}.
      *
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 56b1f9cd7a..29fcd52369 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -1,56 +1,165 @@
+/*
+ * Copyright (c) 2013-2024, APT Group, Department of Computer Science,
+ * The University of Manchester.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
 package uk.ac.manchester.tornado.api.types.tensors;
 
-import sun.misc.Unsafe;
-import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
-import uk.ac.manchester.tornado.api.types.arrays.LongArray;
+import uk.ac.manchester.tornado.api.types.arrays.ByteArray;
+import uk.ac.manchester.tornado.api.types.arrays.TornadoNativeArray;
 
 import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;
-import java.lang.reflect.Field;
 
 public class TensorQ8 extends Tensor {
-    private final DType dType;
+    private final boolean  DEBUG_TENSOR_Q8 = false;
+    private final ByteArray tensorStorage;
+    private final int numberOfElements;
     private final Shape shape;
+    private final DType dType;
 
-    private final HalfFloatArray tensorStorage;
-
-    private int numberOfElements;
+    private final int blockSize;
+    private final int bytesPerBlock;
 
+    private static final int HEADER_SIZE = (int) TornadoNativeArray.ARRAY_HEADER;
 
     public TensorQ8(Shape shape) {
-        super(DType.HALF_FLOAT, shape);
+        super(DType.QINT8, shape);
         this.shape = shape;
         this.numberOfElements = shape.getSize();
-        this.dType = DType.HALF_FLOAT;
-        this.tensorStorage = new HalfFloatArray(numberOfElements);
+        this.dType = DType.QINT8;
+        this.blockSize = GGMLType.Q8_0.getBlockSize();
+
+        // Each block contains:
+        // - 2 bytes for float16 scale
+        // - blockSize bytes for quantized values
+        this.bytesPerBlock = Float16.BYTES + blockSize;
+
+        // Calculate number of blocks needed to store all elements
+        int numBlocks = (numberOfElements + blockSize - 1) / blockSize;
+
+        // Calculate total storage size in bytes, including header
+        long dataSize = (long)numBlocks * bytesPerBlock;
+        long totalSize = dataSize + HEADER_SIZE;
+
+        if (DEBUG_TENSOR_Q8) {
+            System.out.println("Debug info:");
+            System.out.println("Number of elements: " + numberOfElements);
+            System.out.println("Block size: " + blockSize);
+            System.out.println("Bytes per block: " + bytesPerBlock);
+            System.out.println("Number of blocks: " + numBlocks);
+            System.out.println("Data size: " + dataSize);
+            System.out.println("Header size: " + HEADER_SIZE);
+            System.out.println("Total size with header: " + totalSize);
+        }
+
+        this.tensorStorage = new ByteArray(numberOfElements, totalSize);
+    }
+
+    private float[] getBlockValues(int blockIndex) {
+        float[] values = new float[blockSize];
+        int blockOffset = blockIndex * bytesPerBlock;
+
+        try {
+            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset));
+            for (int i = 0; i < blockSize; i++) {
+                byte quant = readByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + i);
+                values[i] = quant * scale;
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to read block " + blockIndex +
+                    " at offset " + blockOffset + ": " + e.getMessage());
+        }
+        return values;
+    }
+
+    public float getFloat(int index) {
+        if (index < 0 || index >= numberOfElements) {
+            throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
+        }
+
+        int blockIndex = index / blockSize;
+        int withinBlockIndex = index % blockSize;
+        int blockOffset = blockIndex * bytesPerBlock;
+
+        try {
+            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset));
+            byte quant = readByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + withinBlockIndex);
+            return quant * scale;
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to get float at index " + index +
+                    " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
+        }
     }
 
+    public void setFloat(int index, float value) {
+        if (index < 0 || index >= numberOfElements) {
+            throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
+        }
+
+        int blockIndex = index / blockSize;
+        int withinBlockIndex = index % blockSize;
+
+        // Get current block values
+        float[] blockValues = getBlockValues(blockIndex);
+        blockValues[withinBlockIndex] = value;
+
+        // Compute optimal scale for block
+        float scale = computeOptimalScale(blockValues);
+
+        // Update block
+        int blockOffset = blockIndex * bytesPerBlock;
+
+        try {
+            // Write scale
+            writeShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset, Float.floatToFloat16(scale));
+
+            // Write quantized values
+            for (int i = 0; i < blockValues.length; i++) {
+                int quantized = Math.min(127, Math.max(-128, Math.round(blockValues[i] / scale)));
+                writeByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + i, (byte)quantized);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to set float at index " + index +
+                    " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
+        }
+    }
 
-    public TensorQ8(int size, MemorySegment memorySegment) {
-        super(DType.HALF_FLOAT, new Shape(size));
-        this.dType = DType.HALF_FLOAT;
-        this.shape = new Shape(size);
-        this.numberOfElements = size;
-        this.tensorStorage = HalfFloatArray.fromSegment(memorySegment);
+    private float computeOptimalScale(float[] values) {
+        float maxAbs = 1e-5f;
+        for (float value : values) {
+            maxAbs = Math.max(maxAbs, Math.abs(value));
+        }
+        return maxAbs / 127.0f;
     }
 
 
     static short readShort(MemorySegment memorySegment, long offset) {
-        return memorySegment.get(ValueLayout.JAVA_SHORT, memorySegment.address()+offset);
+        return memorySegment.get(ValueLayout.JAVA_SHORT, offset);
     }
 
     static byte readByte(MemorySegment memorySegment, long offset) {
-        return memorySegment.get(ValueLayout.JAVA_BYTE, memorySegment.address()+offset);
+        return memorySegment.get(ValueLayout.JAVA_BYTE, offset);
     }
 
-    public float getFloat(int index) {
-        assert 0 <= index && index < numberOfElements;
-        int blockIndex = index / GGMLType.Q8_0.getBlockSize();
-        int withinBlockIndex = index % GGMLType.Q8_0.getBlockSize();
-        int blockOffset = blockIndex * GGMLType.Q8_0.getTypeSize();
-        byte quant = readBy te(tensorStorage.getSegment(), blockOffset + Float16.BYTES + withinBlockIndex);
-        float scale = Float.float16ToFloat(readShort(tensorStorage.getSegment(), blockOffset));
-        return quant * scale;
+    static void writeShort(MemorySegment memorySegment, long offset, short value) {
+        memorySegment.set(ValueLayout.JAVA_SHORT, offset, value);
+    }
+
+    static void writeByte(MemorySegment memorySegment, long offset, byte value) {
+        memorySegment.set(ValueLayout.JAVA_BYTE, offset, value);
     }
 
     @Override
@@ -60,17 +169,17 @@ public Shape getShape() {
 
     @Override
     public String getDTypeAsString() {
-        return "";
+        return dType.QINT8.toString();
     }
 
     @Override
     public DType getDType() {
-        return dType;
+        return DType.QINT8;
     }
 
     @Override
     public int getSize() {
-        return numberOfElements;
+        return shape.getSize();
     }
 
     @Override
@@ -100,6 +209,6 @@ protected void clear() {
 
     @Override
     public int getElementSize() {
-        return numberOfElements;
+        return getSize();
     }
-}
+}
\ No newline at end of file
diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
new file mode 100644
index 0000000000..b68e165f83
--- /dev/null
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
@@ -0,0 +1,312 @@
+package uk.ac.manchester.tornado.unittests.tensors;
+
+import org.junit.Assert;
+import org.junit.Test;
+import uk.ac.manchester.tornado.api.types.tensors.GGMLType;
+import uk.ac.manchester.tornado.api.types.tensors.Shape;
+import uk.ac.manchester.tornado.api.types.tensors.TensorQ8;
+import uk.ac.manchester.tornado.unittests.common.TornadoTestBase;
+
+
+public class TestTensorQ8 extends TornadoTestBase {
+
+    @Test
+    public void testBasicQuantization() {
+        // Test with a simple 1D tensor
+        Shape shape = new Shape(1);
+        TensorQ8 tensor = new TensorQ8(shape);
+
+        // Test setting and getting a single value
+        float testValue = 1.5f;
+        tensor.setFloat(0, testValue);
+        float retrieved = tensor.getFloat(0);
+        System.out.println("Segment size for storing single value " + tensor.getSegment().byteSize());
+        Assert.assertEquals(testValue, retrieved, 0.1f);
+    }
+
+    @Test
+    public void testTensorQ8SetAndGetFloat() {
+        // Define the shape and create a tensor
+        Shape shape = new Shape(5); // 1D tensor with 5 elements
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Set some values in the tensor using setFloat and then retrieve them with getFloat
+        float[] valuesToSet = {0.5f, -1.0f, 25.0f, -30.5f, 0.0f};
+        for (int i = 0; i < valuesToSet.length; i++) {
+            tensorQ8.setFloat(i, valuesToSet[i]);
+        }
+
+        // Check that each retrieved value matches the set value within tolerance
+        for (int i = 0; i < valuesToSet.length; i++) {
+            Assert.assertEquals(valuesToSet[i], tensorQ8.getFloat(i), 0.1f);
+        }
+    }
+
+    @Test
+    public void testTensorQ8SetAndGetFloatVerify() {
+        // Use a size that's aligned with Q8_0 block size (typically 32 elements)
+        int blockSize = GGMLType.Q8_0.getBlockSize();  // Should be 32
+        Shape shape = new Shape(blockSize);  // Use full block size
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Create test values array matching the block size
+        float[] valuesToSet = new float[blockSize];
+        // Fill with repeating pattern
+        float[] pattern = {0.5f, -1.0f, 25.0f, -30.5f, 0.0f};
+        for (int i = 0; i < blockSize; i++) {
+            valuesToSet[i] = pattern[i % pattern.length];
+        }
+
+        // Print expected layout information
+        System.out.println("Total elements: " + shape.getSize());
+        System.out.println("Block size: " + blockSize);
+        System.out.println("Total allocated bytes: " + tensorQ8.getSegment().byteSize());
+
+        // Set values
+        for (int i = 0; i < valuesToSet.length; i++) {
+            tensorQ8.setFloat(i, valuesToSet[i]);
+            // Immediately verify each value after setting
+            float retrieved = tensorQ8.getFloat(i);
+            System.out.printf("Index %d: Set=%.2f Retrieved=%.2f%n",
+                    i, valuesToSet[i], retrieved);
+            Assert.assertEquals("Value mismatch at index " + i,
+                    valuesToSet[i], retrieved, 0.1f);
+        }
+
+        // Verify all values again
+        for (int i = 0; i < valuesToSet.length; i++) {
+            float retrieved = tensorQ8.getFloat(i);
+            Assert.assertEquals("Final verification failed at index " + i,
+                    valuesToSet[i], retrieved, 0.1f);
+        }
+    }
+
+    @Test
+    public void testMixedScaleValues() {
+        // Test handling of mixed scales within a block
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Set values with very different scales
+        tensorQ8.setFloat(0, 100.0f);
+        tensorQ8.setFloat(1, 0.001f);
+        tensorQ8.setFloat(2, -100.0f);
+        tensorQ8.setFloat(3, -0.001f);
+
+        // Verify large values maintain reasonable accuracy
+        Assert.assertEquals(100.0f, tensorQ8.getFloat(0), 1.0f);
+        Assert.assertEquals(-100.0f, tensorQ8.getFloat(2), 1.0f);
+
+        // Small values might have less precision but should maintain sign
+        float small1 = tensorQ8.getFloat(1);
+        float small2 = tensorQ8.getFloat(3);
+        Assert.assertTrue("Small positive value lost sign", small1 >= 0);
+        Assert.assertTrue("Small negative value lost sign", small2 <= 0);
+    }
+
+    @Test
+    public void testQuantizationRange() {
+        // Test extreme values and quantization handling
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // All test values land in the same (single) block, so they share one scale
+        float[] testValues = {
+                0.0f,              // Zero
+                1e-6f,            // Very small positive
+                -1e-6f,           // Very small negative
+                100.0f,           // Large positive
+                -100.0f,          // Large negative
+        };
+
+        for (int i = 0; i < testValues.length; i++) {
+            tensorQ8.setFloat(i, testValues[i]);
+            float retrieved = tensorQ8.getFloat(i);
+
+            // For very small values, check if they're close to zero
+            if (Math.abs(testValues[i]) < 1e-5f) {
+                Assert.assertTrue("Small value not close to zero",
+                        Math.abs(retrieved) < 1e-4f);
+            } else {
+                // For larger values, check relative error
+                float relativeError = Math.abs((retrieved - testValues[i]) / testValues[i]);
+                Assert.assertTrue("Large relative error at index " + i +
+                                ": expected=" + testValues[i] + ", got=" + retrieved,
+                        relativeError < 0.01f);
+            }
+        }
+    }
+
+    @Test
+    public void testInt8Range() {
+        // Test the full INT8 range in a dedicated test
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Set a few values at INT8 boundaries
+        float[] boundaryValues = {
+                -128.0f,   // Min INT8
+                -127.0f,
+                -64.0f,
+                0.0f,
+                63.0f,
+                126.0f,
+                127.0f     // Max INT8
+        };
+
+        // Set values one at a time to ensure same scale
+        for (int i = 0; i < boundaryValues.length; i++) {
+            tensorQ8.setFloat(i, boundaryValues[i]);
+            float retrieved = tensorQ8.getFloat(i);
+            System.out.printf("INT8 boundary test: Setting %.1f, got %.1f%n",
+                    boundaryValues[i], retrieved);
+            Assert.assertEquals("Value mismatch at INT8 boundary " + boundaryValues[i],
+                    boundaryValues[i], retrieved, 1.0f);  // Allow 1.0 tolerance for boundary values
+        }
+    }
+
+    @Test
+    public void testIndependentBlocks() {
+        // Test that blocks can handle different scales independently
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 3);  // 3 blocks
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        System.out.println("\nTesting independent blocks with different scales:");
+
+        // Block 1: Small values (0.1 to 1.0)
+        System.out.println("\nBlock 1 - Small values:");
+        for (int i = 0; i < blockSize; i++) {
+            float value = 0.1f + (0.9f * i / blockSize);
+            tensorQ8.setFloat(i, value);
+            float retrieved = tensorQ8.getFloat(i);
+            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+                    i, value, retrieved, Math.abs(value - retrieved));
+        }
+
+        // Block 2: Medium values (10 to 20)
+        System.out.println("\nBlock 2 - Medium values:");
+        for (int i = 0; i < blockSize; i++) {
+            float value = 10.0f + (10.0f * i / blockSize);
+            tensorQ8.setFloat(blockSize + i, value);
+            float retrieved = tensorQ8.getFloat(blockSize + i);
+            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+                    i, value, retrieved, Math.abs(value - retrieved));
+        }
+
+        // Block 3: Large values (100 to 200)
+        System.out.println("\nBlock 3 - Large values:");
+        for (int i = 0; i < blockSize; i++) {
+            float value = 100.0f + (100.0f * i / blockSize);
+            tensorQ8.setFloat(2 * blockSize + i, value);
+            float retrieved = tensorQ8.getFloat(2 * blockSize + i);
+            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+                    i, value, retrieved, Math.abs(value - retrieved));
+        }
+
+        // Verify blocks maintain reasonable accuracy
+        System.out.println("\nVerifying accuracy for each block:");
+
+        // For each block, compute the max absolute difference and max relative error
+        for (int block = 0; block < 3; block++) {
+            float maxDiff = 0.0f;
+            float maxRelErr = 0.0f;
+            float minVal = Float.MAX_VALUE;
+            float maxVal = Float.MIN_VALUE;
+
+            for (int i = 0; i < blockSize; i++) {
+                int idx = block * blockSize + i;
+                float original = (block == 0) ? (0.1f + (0.9f * i / blockSize)) :
+                        (block == 1) ? (10.0f + (10.0f * i / blockSize)) :
+                                (100.0f + (100.0f * i / blockSize));
+                float retrieved = tensorQ8.getFloat(idx);
+                float diff = Math.abs(original - retrieved);
+                float relErr = diff / Math.abs(original);
+
+                maxDiff = Math.max(maxDiff, diff);
+                maxRelErr = Math.max(maxRelErr, relErr);
+                minVal = Math.min(minVal, retrieved);
+                maxVal = Math.max(maxVal, retrieved);
+            }
+
+            System.out.printf("Block %d stats:%n", block);
+            System.out.printf("  Value range: %.6f to %.6f%n", minVal, maxVal);
+            System.out.printf("  Max absolute difference: %.6f%n", maxDiff);
+            System.out.printf("  Max relative error: %.6f%%%n", maxRelErr * 100);
+
+            // Verify block maintains reasonable range and accuracy
+            float expectedMaxErr;
+            if (block == 0) {  // Small values
+                expectedMaxErr = 0.5f;  // Larger relative error acceptable for small values
+            } else if (block == 1) {  // Medium values
+                expectedMaxErr = 0.2f;  // 20% error acceptable for medium values
+            } else {  // Large values
+                expectedMaxErr = 0.1f;  // 10% error acceptable for large values
+            }
+
+            Assert.assertTrue(
+                    String.format("Block %d error too large: %.2f%% > %.2f%%",
+                            block, maxRelErr * 100, expectedMaxErr * 100),
+                    maxRelErr < expectedMaxErr);
+        }
+    }
+
+    @Test
+    public void testConstantBlock() {
+        // Test how well we can represent a constant value
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize);
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        float testValue = 10.0f;
+        System.out.println("\nTesting constant value block:");
+
+        // Set all values in block to same value
+        for (int i = 0; i < blockSize; i++) {
+            tensorQ8.setFloat(i, testValue);
+        }
+
+        // Verify values
+        float maxDiff = 0.0f;
+        for (int i = 0; i < blockSize; i++) {
+            float retrieved = tensorQ8.getFloat(i);
+            float diff = Math.abs(retrieved - testValue);
+            maxDiff = Math.max(maxDiff, diff);
+            System.out.printf("Index %d: Expected=%.6f Got=%.6f Diff=%.6f%n",
+                    i, testValue, retrieved, diff);
+        }
+
+        float relativeError = maxDiff / Math.abs(testValue);
+        System.out.printf("Maximum relative error: %.6f%%%n", relativeError * 100);
+
+        Assert.assertTrue(
+                String.format("Relative error too large for constant block: %.2f%%",
+                        relativeError * 100),
+                relativeError < 0.1f);  // Expect very good accuracy for constant values
+    }
+
+    @Test
+    public void testSingleBlockPrecision() {
+        // Test precision within a single block using relative error metrics
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        float baseValue = 10.0f;  // Use a reasonable base value
+
+        System.out.println("\nTesting single block precision:");
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = baseValue * (i + 1) / shape.getSize();  // Spread values evenly
+            tensorQ8.setFloat(i, value);
+            float retrieved = tensorQ8.getFloat(i);
+            float relativeError = Math.abs((retrieved - value) / value);
+
+            System.out.printf("Index %d: Set=%.6f Got=%.6f RelError=%.6f%n",
+                    i, value, retrieved, relativeError);
+
+            Assert.assertTrue(String.format(
+                            "Relative error too large at index %d: expected=%.6f, got=%.6f, relative error=%.6f",
+                            i, value, retrieved, relativeError),
+                    relativeError < 0.1f);  // Allow 10% relative error
+        }
+    }
+}

From 26899d3d2bb6b74af6e1caa4c788b026e2874b98 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 18:53:09 +0200
Subject: [PATCH 04/15] Add test information and debug option

---
 .../tornado/api/types/tensors/TensorQ8.java   |  2 +-
 .../unittests/tensors/TestTensorQ8.java       | 26 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 29fcd52369..32f4570154 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2024, APT Group, Department of Computer Science,
+ * Copyright (c) 2024, APT Group, Department of Computer Science,
  * The University of Manchester.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
index b68e165f83..bf7f60fef1 100644
--- a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
@@ -1,3 +1,20 @@
+/*
+ * Copyright (c) 2024, APT Group, Department of Computer Science,
+ * The University of Manchester.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
 package uk.ac.manchester.tornado.unittests.tensors;
 
 import org.junit.Assert;
@@ -7,7 +24,14 @@
 import uk.ac.manchester.tornado.api.types.tensors.TensorQ8;
 import uk.ac.manchester.tornado.unittests.common.TornadoTestBase;
 
-
+/**
+ * <p>
+ * How to run?
+ * </p>
+ * <code>
+ * tornado-test -V uk.ac.manchester.tornado.unittests.tensors.TestTensorQ8
+ * </code>
+ */
 public class TestTensorQ8 extends TornadoTestBase {
 
     @Test

From 6737a4ba5628e6ba32846bfd84f6e179baf4b691 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:09:15 +0200
Subject: [PATCH 05/15] Add more tests on precision and value modifications

---
 .../tornado/api/types/tensors/TensorQ8.java   |  2 +-
 .../unittests/tensors/TestTensorQ8.java       | 91 +++++++++++++++++++
 2 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 32f4570154..33e4dd1ebb 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -24,7 +24,7 @@
 import java.lang.foreign.ValueLayout;
 
 public class TensorQ8 extends Tensor {
-    private final boolean  DEBUG_TENSOR_Q8 = false;
+    private final boolean  DEBUG_TENSOR_Q8 = true;
     private final ByteArray tensorStorage;
     private final int numberOfElements;
     private final Shape shape;
diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
index bf7f60fef1..7fc53532e1 100644
--- a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
@@ -333,4 +333,95 @@ public void testSingleBlockPrecision() {
                     relativeError < 0.1f);  // Allow 10% relative error
         }
     }
+
+    @Test
+    public void testNonAlignedBlockSize() {
+        // Test tensor with size not aligned to block size
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize + 5); // Intentionally non-aligned
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Set values in both full and partial blocks
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = i * 1.5f;
+            tensorQ8.setFloat(i, value);
+            float retrieved = tensorQ8.getFloat(i);
+            Assert.assertEquals("Value mismatch in non-aligned blocks",
+                    value, retrieved, 0.1f);
+        }
+    }
+
+    @Test
+    public void testZeroCrossing() {
+        // Test values around zero to verify sign handling
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Test different ranges of values around zero
+        float[][] testRanges = {
+                // Small values - might get quantized to zero
+                {-0.001f, -0.0001f, 0.0f, 0.0001f, 0.001f},
+                // Medium values - should preserve sign
+                {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f},
+                // Larger values - should definitely preserve sign
+                {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}};
+
+        System.out.println("\nTesting zero crossing behavior:");
+        for (int range = 0; range < testRanges.length; range++) {
+            System.out.printf("\nRange %d:%n", range);
+
+            // Set values from current range
+            for (int i = 0; i < testRanges[range].length; i++) {
+                float value = testRanges[range][i];
+                tensorQ8.setFloat(i, value);
+                float retrieved = tensorQ8.getFloat(i);
+
+                System.out.printf("Value: %10.6f -> Retrieved: %10.6f%n", value, retrieved);
+
+                if (Math.abs(value) >= 0.01f) {  // Only check sign for values >= 0.01
+                    Assert.assertEquals(String.format("Sign mismatch for value %.6f", value), Math.signum(value), Math.signum(retrieved), 0.0f);
+                } else {
+                    // For very small values, just verify they're close to zero
+                    Assert.assertTrue(String.format("Small value %.6f not close enough to zero (got %.6f)", value, retrieved), Math.abs(retrieved) < 0.01f);
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testRepeatedUpdates() {
+        // Test stability when repeatedly updating values
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        float testValue = 1.0f;
+        int testIndex = 0;
+
+        // Repeatedly update same value
+        for (int i = 0; i < 100; i++) {
+            tensorQ8.setFloat(testIndex, testValue);
+            float retrieved = tensorQ8.getFloat(testIndex);
+            Assert.assertEquals("Value unstable after repeated updates",
+                    testValue, retrieved, 0.1f);
+        }
+    }
+
+    @Test
+    public void testAlternatingPatterns() {
+        // Test alternating positive/negative pattern
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = (i % 2 == 0) ? 1.0f : -1.0f;
+            tensorQ8.setFloat(i, value);
+        }
+
+        for (int i = 0; i < shape.getSize(); i++) {
+            float expected = (i % 2 == 0) ? 1.0f : -1.0f;
+            float retrieved = tensorQ8.getFloat(i);
+            Assert.assertEquals("Alternating pattern not preserved",
+                    expected, retrieved, 0.1f);
+        }
+    }
 }

From be718bb37d19234c0df9030f56b41e1cc6c1ccfe Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:14:59 +0200
Subject: [PATCH 06/15] Add missing license headers

---
 .../tornado/api/types/tensors/Float16.java      | 17 +++++++++++++++++
 .../tornado/api/types/tensors/GGMLType.java     | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
index be74aa2965..5802f63d01 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/Float16.java
@@ -1,3 +1,20 @@
+/*
+ * Copyright (c) 2024, APT Group, Department of Computer Science,
+ * The University of Manchester.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
 package uk.ac.manchester.tornado.api.types.tensors;
 
 
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
index 8811de6914..447805dccd 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/GGMLType.java
@@ -1,3 +1,20 @@
+/*
+ * Copyright (c) 2024, APT Group, Department of Computer Science,
+ * The University of Manchester.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
 package uk.ac.manchester.tornado.api.types.tensors;
 
 public enum GGMLType {

From 3818d74402e9ba0c543464ec81c47d1c6738d1e8 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:35:20 +0200
Subject: [PATCH 07/15] Simplify header handling

---
 .../tornado/api/types/arrays/ByteArray.java   | 19 +------------
 .../tornado/api/types/tensors/TensorQ8.java   | 28 ++++++++-----------
 2 files changed, 13 insertions(+), 34 deletions(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
index ab5f868da7..0bacb9ff15 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
@@ -62,21 +62,12 @@ public ByteArray(int numberOfElements) {
         segment.setAtIndex(JAVA_INT, 0, numberOfElements);
     }
 
-    public ByteArray(int numberOfElements, boolean noHeader) {
-        this.numberOfElements = numberOfElements;
-        baseIndex=0;
-        segmentByteSize = numberOfElements * BYTE_BYTES;
-        segment = Arena.ofAuto().allocate(segmentByteSize, 1);
-//        segment.setAtIndex(JAVA_INT, 0, numberOfElements);
-    }
-
 
     public ByteArray(int numberOfElements, long requiredStorageSize) {
         this.numberOfElements = numberOfElements;
         baseIndex=0;
-//        segmentByteSize = numberOfElements * BYTE_BYTES;
         segment = Arena.ofAuto().allocate(requiredStorageSize, 1);
-        //        segment.setAtIndex(JAVA_INT, 0, numberOfElements);
+        segment.setAtIndex(JAVA_INT, 0, numberOfElements);
     }
 
     /**
@@ -141,14 +132,6 @@ public static ByteArray fromSegment(MemorySegment segment) {
         return byteArray;
     }
 
-    public static ByteArray fromSegment(MemorySegment segment, boolean noHeader) {
-        long byteSize = segment.byteSize();
-        int numElements = (int) (byteSize / BYTE_BYTES);
-        ByteArray byteArray = new ByteArray(numElements, noHeader);
-        MemorySegment.copy(segment, 0, byteArray.segment, byteArray.baseIndex * BYTE_BYTES, byteSize);
-        return byteArray;
-    }
-
     /**
      * Creates a new instance of the {@link ByteArray} class from a {@link ByteBuffer}.
      *
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 33e4dd1ebb..52f4df0057 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -33,7 +33,7 @@ public class TensorQ8 extends Tensor {
     private final int blockSize;
     private final int bytesPerBlock;
 
-    private static final int HEADER_SIZE = (int) TornadoNativeArray.ARRAY_HEADER;
+//    private static final int HEADER_SIZE = (int) TornadoNativeArray.ARRAY_HEADER;
 
     public TensorQ8(Shape shape) {
         super(DType.QINT8, shape);
@@ -52,7 +52,7 @@ public TensorQ8(Shape shape) {
 
         // Calculate total storage size in bytes, including header
         long dataSize = (long)numBlocks * bytesPerBlock;
-        long totalSize = dataSize + HEADER_SIZE;
+        long totalSize = dataSize;
 
         if (DEBUG_TENSOR_Q8) {
             System.out.println("Debug info:");
@@ -61,7 +61,6 @@ public TensorQ8(Shape shape) {
             System.out.println("Bytes per block: " + bytesPerBlock);
             System.out.println("Number of blocks: " + numBlocks);
             System.out.println("Data size: " + dataSize);
-            System.out.println("Header size: " + HEADER_SIZE);
             System.out.println("Total size with header: " + totalSize);
         }
 
@@ -73,14 +72,13 @@ private float[] getBlockValues(int blockIndex) {
         int blockOffset = blockIndex * bytesPerBlock;
 
         try {
-            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset));
+            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(),   blockOffset));
             for (int i = 0; i < blockSize; i++) {
-                byte quant = readByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + i);
+                byte quant = readByte(tensorStorage.getSegmentWithHeader(),   blockOffset + Float16.BYTES + i);
                 values[i] = quant * scale;
             }
         } catch (Exception e) {
-            throw new RuntimeException("Failed to read block " + blockIndex +
-                    " at offset " + blockOffset + ": " + e.getMessage());
+            throw new RuntimeException("Failed to read block " + blockIndex + " at offset " + blockOffset + ": " + e.getMessage());
         }
         return values;
     }
@@ -95,12 +93,11 @@ public float getFloat(int index) {
         int blockOffset = blockIndex * bytesPerBlock;
 
         try {
-            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset));
-            byte quant = readByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + withinBlockIndex);
+            float scale = Float.float16ToFloat(readShort(tensorStorage.getSegmentWithHeader(),   blockOffset));
+            byte quant = readByte(tensorStorage.getSegmentWithHeader(),  + blockOffset + Float16.BYTES + withinBlockIndex);
             return quant * scale;
         } catch (Exception e) {
-            throw new RuntimeException("Failed to get float at index " + index +
-                    " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
+            throw new RuntimeException("Failed to get float at index " + index + " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
         }
     }
 
@@ -124,16 +121,15 @@ public void setFloat(int index, float value) {
 
         try {
             // Write scale
-            writeShort(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset, Float.floatToFloat16(scale));
+            writeShort(tensorStorage.getSegmentWithHeader(),   blockOffset, Float.floatToFloat16(scale));
 
             // Write quantized values
             for (int i = 0; i < blockValues.length; i++) {
                 int quantized = Math.min(127, Math.max(-128, Math.round(blockValues[i] / scale)));
-                writeByte(tensorStorage.getSegmentWithHeader(), HEADER_SIZE + blockOffset + Float16.BYTES + i, (byte)quantized);
+                writeByte(tensorStorage.getSegmentWithHeader(), blockOffset + Float16.BYTES + i, (byte)quantized);
             }
         } catch (Exception e) {
-            throw new RuntimeException("Failed to set float at index " + index +
-                    " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
+            throw new RuntimeException("Failed to set float at index " + index +  " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage());
         }
     }
 
@@ -184,7 +180,7 @@ public int getSize() {
 
     @Override
     public MemorySegment getSegment() {
-        return tensorStorage.getSegment();
+        return tensorStorage.getSegmentWithHeader();
     }
 
     @Override

From 8c61d72d8ac646269a94c7af5be1e08b1a280d04 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:43:04 +0200
Subject: [PATCH 08/15] Add verbose control for precision unit-tests

---
 .../tornado/api/types/tensors/TensorQ8.java   |   4 +-
 .../unittests/tensors/TestTensorQ8.java       | 298 ++++++++----------
 2 files changed, 141 insertions(+), 161 deletions(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 52f4df0057..d972e60442 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -24,7 +24,7 @@
 import java.lang.foreign.ValueLayout;
 
 public class TensorQ8 extends Tensor {
-    private final boolean  DEBUG_TENSOR_Q8 = true;
+    private final boolean  DEBUG_TENSOR_Q8 = false;
     private final ByteArray tensorStorage;
     private final int numberOfElements;
     private final Shape shape;
@@ -205,6 +205,6 @@ protected void clear() {
 
     @Override
     public int getElementSize() {
-        return getSize();
+        return DType.QINT8.getByteSize();
     }
 }
\ No newline at end of file
diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
index 7fc53532e1..c036767251 100644
--- a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
@@ -24,6 +24,8 @@
 import uk.ac.manchester.tornado.api.types.tensors.TensorQ8;
 import uk.ac.manchester.tornado.unittests.common.TornadoTestBase;
 
+import static java.lang.Boolean.FALSE;
+
 /**
  * <p>
  * How to run?
@@ -34,33 +36,42 @@
  */
 public class TestTensorQ8 extends TornadoTestBase {
 
+    private static final boolean VERBOSE = FALSE;  // Control verbose output
+
+    private void printVerbose(String message) {
+        if (VERBOSE) {
+            System.out.println(message);
+        }
+    }
+
+    private void printVerboseF(String format, Object... args) {
+        if (VERBOSE) {
+            System.out.printf(format, args);
+        }
+    }
+
     @Test
     public void testBasicQuantization() {
-        // Test with a simple 1D tensor
         Shape shape = new Shape(1);
         TensorQ8 tensor = new TensorQ8(shape);
 
-        // Test setting and getting a single value
         float testValue = 1.5f;
         tensor.setFloat(0, testValue);
         float retrieved = tensor.getFloat(0);
-        System.out.println("Segment size for storing single value " + tensor.getSegment().byteSize());
+        printVerboseF("Segment size for storing single value %d%n", tensor.getSegment().byteSize());
         Assert.assertEquals(testValue, retrieved, 0.1f);
     }
 
     @Test
     public void testTensorQ8SetAndGetFloat() {
-        // Define the shape and create a tensor
-        Shape shape = new Shape(5); // 1D tensor with 128 elements
+        Shape shape = new Shape(5);
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Set some values in the tensor using setFloat and then retrieve them with getFloat
         float[] valuesToSet = {0.5f, -1.0f, 25.0f, -30.5f, 0.0f};
         for (int i = 0; i < valuesToSet.length; i++) {
             tensorQ8.setFloat(i, valuesToSet[i]);
         }
 
-        // Check that each retrieved value matches the set value within tolerance
         for (int i = 0; i < valuesToSet.length; i++) {
             Assert.assertEquals(valuesToSet[i], tensorQ8.getFloat(i), 0.1f);
         }
@@ -68,36 +79,29 @@ public void testTensorQ8SetAndGetFloat() {
 
     @Test
     public void testTensorQ8SetAndGetFloatVerify() {
-        // Use a size that's aligned with Q8_0 block size (typically 32 elements)
-        int blockSize = GGMLType.Q8_0.getBlockSize();  // Should be 32
-        Shape shape = new Shape(blockSize);  // Use full block size
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize);
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Create test values array matching the block size
         float[] valuesToSet = new float[blockSize];
-        // Fill with repeating pattern
         float[] pattern = {0.5f, -1.0f, 25.0f, -30.5f, 0.0f};
         for (int i = 0; i < blockSize; i++) {
             valuesToSet[i] = pattern[i % pattern.length];
         }
 
-        // Print expected layout information
-        System.out.println("Total elements: " + shape.getSize());
-        System.out.println("Block size: " + blockSize);
-        System.out.println("Total allocated bytes: " + tensorQ8.getSegment().byteSize());
+        printVerboseF("Total elements: %d%n", shape.getSize());
+        printVerboseF("Block size: %d%n", blockSize);
+        printVerboseF("Total allocated bytes: %d%n", tensorQ8.getSegment().byteSize());
 
-        // Set values
         for (int i = 0; i < valuesToSet.length; i++) {
             tensorQ8.setFloat(i, valuesToSet[i]);
-            // Immediately verify each value after setting
             float retrieved = tensorQ8.getFloat(i);
-            System.out.printf("Index %d: Set=%.2f Retrieved=%.2f%n",
+            printVerboseF("Index %d: Set=%.2f Retrieved=%.2f%n",
                     i, valuesToSet[i], retrieved);
             Assert.assertEquals("Value mismatch at index " + i,
                     valuesToSet[i], retrieved, 0.1f);
         }
 
-        // Verify all values again
         for (int i = 0; i < valuesToSet.length; i++) {
             float retrieved = tensorQ8.getFloat(i);
             Assert.assertEquals("Final verification failed at index " + i,
@@ -107,21 +111,17 @@ public void testTensorQ8SetAndGetFloatVerify() {
 
     @Test
     public void testMixedScaleValues() {
-        // Test handling of mixed scales within a block
         Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Set values with very different scales
         tensorQ8.setFloat(0, 100.0f);
         tensorQ8.setFloat(1, 0.001f);
         tensorQ8.setFloat(2, -100.0f);
         tensorQ8.setFloat(3, -0.001f);
 
-        // Verify large values maintain reasonable accuracy
         Assert.assertEquals(100.0f, tensorQ8.getFloat(0), 1.0f);
         Assert.assertEquals(-100.0f, tensorQ8.getFloat(2), 1.0f);
 
-        // Small values might have less precision but should maintain sign
         float small1 = tensorQ8.getFloat(1);
         float small2 = tensorQ8.getFloat(3);
         Assert.assertTrue("Small positive value lost sign", small1 >= 0);
@@ -130,29 +130,21 @@ public void testMixedScaleValues() {
 
     @Test
     public void testQuantizationRange() {
-        // Test extreme values and quantization handling
         Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Test values in separate blocks to maintain scale independence
         float[] testValues = {
-                0.0f,              // Zero
-                1e-6f,            // Very small positive
-                -1e-6f,           // Very small negative
-                100.0f,           // Large positive
-                -100.0f,          // Large negative
+                0.0f, 1e-6f, -1e-6f, 100.0f, -100.0f,
         };
 
         for (int i = 0; i < testValues.length; i++) {
             tensorQ8.setFloat(i, testValues[i]);
             float retrieved = tensorQ8.getFloat(i);
 
-            // For very small values, check if they're close to zero
             if (Math.abs(testValues[i]) < 1e-5f) {
                 Assert.assertTrue("Small value not close to zero",
                         Math.abs(retrieved) < 1e-4f);
             } else {
-                // For larger values, check relative error
                 float relativeError = Math.abs((retrieved - testValues[i]) / testValues[i]);
                 Assert.assertTrue("Large relative error at index " + i +
                                 ": expected=" + testValues[i] + ", got=" + retrieved,
@@ -163,75 +155,60 @@ public void testQuantizationRange() {
 
     @Test
     public void testInt8Range() {
-        // Test the full INT8 range in a dedicated test
         Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Set a few values at INT8 boundaries
         float[] boundaryValues = {
-                -128.0f,   // Min INT8
-                -127.0f,
-                -64.0f,
-                0.0f,
-                63.0f,
-                126.0f,
-                127.0f     // Max INT8
+                -128.0f, -127.0f, -64.0f, 0.0f, 63.0f, 126.0f, 127.0f
         };
 
-        // Set values one at a time to ensure same scale
         for (int i = 0; i < boundaryValues.length; i++) {
             tensorQ8.setFloat(i, boundaryValues[i]);
             float retrieved = tensorQ8.getFloat(i);
-            System.out.printf("INT8 boundary test: Setting %.1f, got %.1f%n",
+            printVerboseF("INT8 boundary test: Setting %.1f, got %.1f%n",
                     boundaryValues[i], retrieved);
             Assert.assertEquals("Value mismatch at INT8 boundary " + boundaryValues[i],
-                    boundaryValues[i], retrieved, 1.0f);  // Allow 1.0 tolerance for boundary values
+                    boundaryValues[i], retrieved, 1.0f);
         }
     }
 
     @Test
     public void testIndependentBlocks() {
-        // Test that blocks can handle different scales independently
         int blockSize = GGMLType.Q8_0.getBlockSize();
-        Shape shape = new Shape(blockSize * 3);  // 3 blocks
+        Shape shape = new Shape(blockSize * 3);
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        System.out.println("\nTesting independent blocks with different scales:");
+        printVerbose("\nTesting independent blocks with different scales:");
 
-        // Block 1: Small values (0.1 to 1.0)
-        System.out.println("\nBlock 1 - Small values:");
+        printVerbose("\nBlock 1 - Small values:");
         for (int i = 0; i < blockSize; i++) {
             float value = 0.1f + (0.9f * i / blockSize);
             tensorQ8.setFloat(i, value);
             float retrieved = tensorQ8.getFloat(i);
-            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+            printVerboseF("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
                     i, value, retrieved, Math.abs(value - retrieved));
         }
 
-        // Block 2: Medium values (10 to 20)
-        System.out.println("\nBlock 2 - Medium values:");
+        printVerbose("\nBlock 2 - Medium values:");
         for (int i = 0; i < blockSize; i++) {
             float value = 10.0f + (10.0f * i / blockSize);
             tensorQ8.setFloat(blockSize + i, value);
             float retrieved = tensorQ8.getFloat(blockSize + i);
-            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+            printVerboseF("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
                     i, value, retrieved, Math.abs(value - retrieved));
         }
 
-        // Block 3: Large values (100 to 200)
-        System.out.println("\nBlock 3 - Large values:");
+        printVerbose("\nBlock 3 - Large values:");
         for (int i = 0; i < blockSize; i++) {
             float value = 100.0f + (100.0f * i / blockSize);
             tensorQ8.setFloat(2 * blockSize + i, value);
             float retrieved = tensorQ8.getFloat(2 * blockSize + i);
-            System.out.printf("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
+            printVerboseF("Index %d: Set=%.6f Got=%.6f Diff=%.6f%n",
                     i, value, retrieved, Math.abs(value - retrieved));
         }
 
-        // Verify blocks maintain reasonable accuracy
-        System.out.println("\nVerifying accuracy for each block:");
+        printVerbose("\nVerifying accuracy for each block:");
 
-        // Helper function to check max absolute difference in a block
         for (int block = 0; block < 3; block++) {
             float maxDiff = 0.0f;
             float maxRelErr = 0.0f;
@@ -253,20 +230,12 @@ public void testIndependentBlocks() {
                 maxVal = Math.max(maxVal, retrieved);
             }
 
-            System.out.printf("Block %d stats:%n", block);
-            System.out.printf("  Value range: %.6f to %.6f%n", minVal, maxVal);
-            System.out.printf("  Max absolute difference: %.6f%n", maxDiff);
-            System.out.printf("  Max relative error: %.6f%%%n", maxRelErr * 100);
-
-            // Verify block maintains reasonable range and accuracy
-            float expectedMaxErr;
-            if (block == 0) {  // Small values
-                expectedMaxErr = 0.5f;  // Larger relative error acceptable for small values
-            } else if (block == 1) {  // Medium values
-                expectedMaxErr = 0.2f;  // 20% error acceptable for medium values
-            } else {  // Large values
-                expectedMaxErr = 0.1f;  // 10% error acceptable for large values
-            }
+            printVerboseF("Block %d stats:%n", block);
+            printVerboseF("  Value range: %.6f to %.6f%n", minVal, maxVal);
+            printVerboseF("  Max absolute difference: %.6f%n", maxDiff);
+            printVerboseF("  Max relative error: %.6f%%%n", maxRelErr * 100);
+
+            float expectedMaxErr = (block == 0) ? 0.5f : (block == 1) ? 0.2f : 0.1f;
 
             Assert.assertTrue(
                     String.format("Block %d error too large: %.2f%% > %.2f%%",
@@ -275,77 +244,125 @@ public void testIndependentBlocks() {
         }
     }
 
+
     @Test
-    public void testConstantBlock() {
-        // Test how well we can represent a constant value
-        int blockSize = GGMLType.Q8_0.getBlockSize();
-        Shape shape = new Shape(blockSize);
+    public void testRepeatedUpdates() {
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        float testValue = 10.0f;
-        System.out.println("\nTesting constant value block:");
+        float testValue = 1.0f;
+        int testIndex = 0;
 
-        // Set all values in block to same value
-        for (int i = 0; i < blockSize; i++) {
-            tensorQ8.setFloat(i, testValue);
+        printVerbose("\nTesting repeated updates stability:");
+        for (int i = 0; i < 100; i++) {
+            tensorQ8.setFloat(testIndex, testValue);
+            float retrieved = tensorQ8.getFloat(testIndex);
+            printVerboseF("Update %d: Expected=%.6f Got=%.6f%n",
+                    i, testValue, retrieved);
+            Assert.assertEquals("Value unstable after repeated updates",
+                    testValue, retrieved, 0.1f);
         }
+    }
 
-        // Verify values
-        float maxDiff = 0.0f;
-        for (int i = 0; i < blockSize; i++) {
+    @Test
+    public void testAlternatingPatterns() {
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        printVerbose("\nTesting alternating pattern preservation:");
+
+        // Set alternating values
+        printVerbose("Setting alternating values:");
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = (i % 2 == 0) ? 1.0f : -1.0f;
+            tensorQ8.setFloat(i, value);
             float retrieved = tensorQ8.getFloat(i);
-            float diff = Math.abs(retrieved - testValue);
-            maxDiff = Math.max(maxDiff, diff);
-            System.out.printf("Index %d: Expected=%.6f Got=%.6f Diff=%.6f%n",
-                    i, testValue, retrieved, diff);
+            printVerboseF("Index %d: Set=%.6f Got=%.6f%n",
+                    i, value, retrieved);
         }
 
-        float relativeError = maxDiff / Math.abs(testValue);
-        System.out.printf("Maximum relative error: %.6f%%%n", relativeError * 100);
-
-        Assert.assertTrue(
-                String.format("Relative error too large for constant block: %.2f%%",
-                        relativeError * 100),
-                relativeError < 0.1f);  // Expect very good accuracy for constant values
+        // Verify alternating values
+        printVerbose("\nVerifying alternating pattern:");
+        for (int i = 0; i < shape.getSize(); i++) {
+            float expected = (i % 2 == 0) ? 1.0f : -1.0f;
+            float retrieved = tensorQ8.getFloat(i);
+            printVerboseF("Index %d: Expected=%.6f Got=%.6f%n",
+                    i, expected, retrieved);
+            Assert.assertEquals("Alternating pattern not preserved",
+                    expected, retrieved, 0.1f);
+        }
     }
 
     @Test
     public void testSingleBlockPrecision() {
-        // Test precision within a single block using relative error metrics
         Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        float baseValue = 10.0f;  // Use a reasonable base value
+        float baseValue = 10.0f;
 
-        System.out.println("\nTesting single block precision:");
+        printVerbose("\nTesting single block precision:");
         for (int i = 0; i < shape.getSize(); i++) {
-            float value = baseValue * (i + 1) / shape.getSize();  // Spread values evenly
+            float value = baseValue * (i + 1) / shape.getSize();
             tensorQ8.setFloat(i, value);
             float retrieved = tensorQ8.getFloat(i);
             float relativeError = Math.abs((retrieved - value) / value);
 
-            System.out.printf("Index %d: Set=%.6f Got=%.6f RelError=%.6f%n",
+            printVerboseF("Index %d: Set=%.6f Got=%.6f RelError=%.6f%n",
                     i, value, retrieved, relativeError);
 
-            Assert.assertTrue(String.format(
-                            "Relative error too large at index %d: expected=%.6f, got=%.6f, relative error=%.6f",
+            Assert.assertTrue(
+                    String.format("Relative error too large at index %d: expected=%.6f, got=%.6f, relative error=%.6f",
                             i, value, retrieved, relativeError),
-                    relativeError < 0.1f);  // Allow 10% relative error
+                    relativeError < 0.1f);
         }
     }
 
+    @Test
+    public void testConstantBlock() {
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize);
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        float testValue = 10.0f;
+        printVerbose("\nTesting constant value block:");
+
+        printVerbose("Setting constant values:");
+        for (int i = 0; i < blockSize; i++) {
+            tensorQ8.setFloat(i, testValue);
+        }
+
+        float maxDiff = 0.0f;
+        printVerbose("\nVerifying constant values:");
+        for (int i = 0; i < blockSize; i++) {
+            float retrieved = tensorQ8.getFloat(i);
+            float diff = Math.abs(retrieved - testValue);
+            maxDiff = Math.max(maxDiff, diff);
+            printVerboseF("Index %d: Expected=%.6f Got=%.6f Diff=%.6f%n",
+                    i, testValue, retrieved, diff);
+        }
+
+        float relativeError = maxDiff / Math.abs(testValue);
+        printVerboseF("Maximum relative error: %.6f%%%n", relativeError * 100);
+
+        Assert.assertTrue(
+                String.format("Relative error too large for constant block: %.2f%%",
+                        relativeError * 100),
+                relativeError < 0.1f);
+    }
+
     @Test
     public void testNonAlignedBlockSize() {
-        // Test tensor with size not aligned to block size
         int blockSize = GGMLType.Q8_0.getBlockSize();
-        Shape shape = new Shape(blockSize + 5); // Intentionally non-aligned
+        Shape shape = new Shape(blockSize + 5);
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Set values in both full and partial blocks
+        printVerbose("\nTesting non-aligned block size:");
         for (int i = 0; i < shape.getSize(); i++) {
             float value = i * 1.5f;
             tensorQ8.setFloat(i, value);
             float retrieved = tensorQ8.getFloat(i);
+            printVerboseF("Index %d: Set=%.6f Got=%.6f%n",
+                    i, value, retrieved);
             Assert.assertEquals("Value mismatch in non-aligned blocks",
                     value, retrieved, 0.1f);
         }
@@ -353,75 +370,38 @@ public void testNonAlignedBlockSize() {
 
     @Test
     public void testZeroCrossing() {
-        // Test values around zero to verify sign handling
         Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
         TensorQ8 tensorQ8 = new TensorQ8(shape);
 
-        // Test different ranges of values around zero
         float[][] testRanges = {
-                // Small values - might get quantized to zero
                 {-0.001f, -0.0001f, 0.0f, 0.0001f, 0.001f},
-                // Medium values - should preserve sign
                 {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f},
-                // Larger values - should definitely preserve sign
-                {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}};
+                {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}
+        };
 
-        System.out.println("\nTesting zero crossing behavior:");
+        printVerbose("\nTesting zero crossing behavior:");
         for (int range = 0; range < testRanges.length; range++) {
-            System.out.printf("\nRange %d:%n", range);
+            printVerboseF("\nRange %d:%n", range);
 
-            // Set values from current range
             for (int i = 0; i < testRanges[range].length; i++) {
                 float value = testRanges[range][i];
                 tensorQ8.setFloat(i, value);
                 float retrieved = tensorQ8.getFloat(i);
 
-                System.out.printf("Value: %10.6f -> Retrieved: %10.6f%n", value, retrieved);
+                printVerboseF("Value: %10.6f -> Retrieved: %10.6f%n",
+                        value, retrieved);
 
-                if (Math.abs(value) >= 0.01f) {  // Only check sign for values >= 0.01
-                    Assert.assertEquals(String.format("Sign mismatch for value %.6f", value), Math.signum(value), Math.signum(retrieved), 0.0f);
+                if (Math.abs(value) >= 0.01f) {
+                    Assert.assertEquals(
+                            String.format("Sign mismatch for value %.6f", value),
+                            Math.signum(value), Math.signum(retrieved), 0.0f);
                 } else {
-                    // For very small values, just verify they're close to zero
-                    Assert.assertTrue(String.format("Small value %.6f not close enough to zero (got %.6f)", value, retrieved), Math.abs(retrieved) < 0.01f);
+                    Assert.assertTrue(
+                            String.format("Small value %.6f not close enough to zero (got %.6f)",
+                                    value, retrieved),
+                            Math.abs(retrieved) < 0.01f);
                 }
             }
         }
     }
-
-    @Test
-    public void testRepeatedUpdates() {
-        // Test stability when repeatedly updating values
-        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
-        TensorQ8 tensorQ8 = new TensorQ8(shape);
-
-        float testValue = 1.0f;
-        int testIndex = 0;
-
-        // Repeatedly update same value
-        for (int i = 0; i < 100; i++) {
-            tensorQ8.setFloat(testIndex, testValue);
-            float retrieved = tensorQ8.getFloat(testIndex);
-            Assert.assertEquals("Value unstable after repeated updates",
-                    testValue, retrieved, 0.1f);
-        }
-    }
-
-    @Test
-    public void testAlternatingPatterns() {
-        // Test alternating positive/negative pattern
-        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
-        TensorQ8 tensorQ8 = new TensorQ8(shape);
-
-        for (int i = 0; i < shape.getSize(); i++) {
-            float value = (i % 2 == 0) ? 1.0f : -1.0f;
-            tensorQ8.setFloat(i, value);
-        }
-
-        for (int i = 0; i < shape.getSize(); i++) {
-            float expected = (i % 2 == 0) ? 1.0f : -1.0f;
-            float retrieved = tensorQ8.getFloat(i);
-            Assert.assertEquals("Alternating pattern not preserved",
-                    expected, retrieved, 0.1f);
-        }
-    }
 }

From 31bac2e17badcceb6270da338b6b2132d902ba2e Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:46:32 +0200
Subject: [PATCH 09/15] Revert module info defaults

---
 tornado-api/src/main/java/module-info.java | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tornado-api/src/main/java/module-info.java b/tornado-api/src/main/java/module-info.java
index f1a7686948..3eb1b3838c 100644
--- a/tornado-api/src/main/java/module-info.java
+++ b/tornado-api/src/main/java/module-info.java
@@ -16,7 +16,6 @@
  *
  */
 module tornado.api {
-    requires jdk.unsupported;
     exports uk.ac.manchester.tornado.api;
     exports uk.ac.manchester.tornado.api.annotations;
     exports uk.ac.manchester.tornado.api.common;

From cb65fcfb1c0d339f3513b0d4c83112cefb518921 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 19:51:15 +0200
Subject: [PATCH 10/15] Add more mixed precision tests

---
 .../unittests/tensors/TestTensorQ8.java       | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
index c036767251..30d6d093e1 100644
--- a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ8.java
@@ -404,4 +404,140 @@ public void testZeroCrossing() {
             }
         }
     }
+
+    @Test
+    public void testSequentialBlockUpdates() {
+        // Test updating three blocks one after another, then re-check that every block kept its value
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 3);  // Three blocks
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        printVerbose("\nTesting sequential block updates:");
+
+        // Sequential updates
+        for (int block = 0; block < 3; block++) {
+            float blockValue = (block + 1) * 10.0f;
+            printVerboseF("\nSetting block %d to %.2f:%n", block, blockValue);
+
+            for (int i = 0; i < blockSize; i++) {
+                int index = block * blockSize + i;
+                tensorQ8.setFloat(index, blockValue);
+                float retrieved = tensorQ8.getFloat(index);
+                printVerboseF("Index %d: Set=%.6f Got=%.6f%n",
+                        index, blockValue, retrieved);
+                Assert.assertEquals("Sequential block update failed",
+                        blockValue, retrieved, 0.1f);
+            }
+        }
+
+        // Verify all blocks maintain their values
+        printVerbose("\nVerifying all blocks after updates:");
+        for (int block = 0; block < 3; block++) {
+            float expectedValue = (block + 1) * 10.0f;
+            for (int i = 0; i < blockSize; i++) {
+                int index = block * blockSize + i;
+                float retrieved = tensorQ8.getFloat(index);
+                Assert.assertEquals("Block value changed unexpectedly",
+                        expectedValue, retrieved, 0.1f);
+            }
+        }
+    }
+
+    @Test
+    public void testMaximumPrecisionValues() {
+        // Test precision with values requiring maximum accuracy
+        Shape shape = new Shape(GGMLType.Q8_0.getBlockSize());
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        printVerbose("\nTesting maximum precision values:");
+
+        // Test precise decimal values
+        float[] preciseValues = {
+                1.23456789f,
+                -1.23456789f,
+                12.3456789f,
+                -12.3456789f,
+                123.456789f,
+                -123.456789f
+        };
+
+        for (int i = 0; i < preciseValues.length; i++) {
+            tensorQ8.setFloat(i, preciseValues[i]);
+            float retrieved = tensorQ8.getFloat(i);
+            float relativeError = Math.abs((retrieved - preciseValues[i]) / preciseValues[i]);
+
+            printVerboseF("Precise value test %d: Set=%.9f Got=%.9f RelError=%.9f%n",
+                    i, preciseValues[i], retrieved, relativeError);
+
+            // For high-precision values, we expect relative error < 1%
+            Assert.assertTrue(
+                    String.format("Precision lost: expected=%.9f, got=%.9f, error=%.9f",
+                            preciseValues[i], retrieved, relativeError),
+                    relativeError < 0.01f);
+        }
+    }
+
+    @Test
+    public void testBlockScaleInterference() {
+        // Test that updates in one block don't affect other blocks' scales
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 2);  // Two blocks
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        printVerbose("\nTesting block scale interference:");
+
+        // Set first block to small values
+        printVerbose("\nSetting first block to small values:");
+        for (int i = 0; i < blockSize; i++) {
+            float value = 0.1f + (0.1f * i / blockSize);
+            tensorQ8.setFloat(i, value);
+            printVerboseF("Block 1 index %d: Set=%.6f%n", i, value);
+        }
+
+        // Set second block to large values
+        printVerbose("\nSetting second block to large values:");
+        for (int i = 0; i < blockSize; i++) {
+            float value = 100.0f + (100.0f * i / blockSize);
+            tensorQ8.setFloat(blockSize + i, value);
+            printVerboseF("Block 2 index %d: Set=%.6f%n", i, value);
+        }
+
+        // Verify first block maintained small values
+        printVerbose("\nVerifying first block maintained precision:");
+        for (int i = 0; i < blockSize; i++) {
+            float expected = 0.1f + (0.1f * i / blockSize);
+            float retrieved = tensorQ8.getFloat(i);
+            float relativeError = Math.abs((retrieved - expected) / expected);
+
+            printVerboseF("Block 1 verification index %d: Expected=%.6f Got=%.6f RelError=%.6f%n",
+                    i, expected, retrieved, relativeError);
+
+            Assert.assertTrue(
+                    String.format("Block 1 precision lost after block 2 update at index %d", i),
+                    relativeError < 0.1f);
+        }
+    }
+
+    @Test
+    public void testBlockBoundaryUpdates() {
+        // Test updating values at block boundaries
+        int blockSize = GGMLType.Q8_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 2);  // Two blocks
+        TensorQ8 tensorQ8 = new TensorQ8(shape);
+
+        // Set values around block boundary
+        float[] boundaryValues = {1.0f, 2.0f, 3.0f, 4.0f};
+        int boundaryStart = blockSize - 2;  // Two values before boundary
+
+        printVerbose("\nTesting block boundary updates:");
+        for (int i = 0; i < boundaryValues.length; i++) {
+            int index = boundaryStart + i;
+            tensorQ8.setFloat(index, boundaryValues[i]);
+            float retrieved = tensorQ8.getFloat(index);
+            printVerboseF("Index %d (block boundary +/- 2): Set=%.6f Got=%.6f%n",
+                    index, boundaryValues[i], retrieved);
+            Assert.assertEquals("Value mismatch at block boundary",
+                    boundaryValues[i], retrieved, 0.1f);
+        }
+    }
 }

From 92deea997cb4bdecc1d3ab84f57e7d9e681a5a2f Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 20:06:30 +0200
Subject: [PATCH 11/15] Fix to copy raw memory segment

---
 .../tornado/api/types/arrays/ByteArray.java   |  8 +++++
 .../tornado/api/types/tensors/TensorQ8.java   | 33 ++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
index 0bacb9ff15..e7eb7f88c7 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
@@ -132,6 +132,14 @@ public static ByteArray fromSegment(MemorySegment segment) {
         return byteArray;
     }
 
+    // Temporary workaround: copy a raw (header-less) memory segment. NOTE(review): confirm byteSize leaves room for the baseIndex offset.
+    public static ByteArray fromSegment(MemorySegment segment, int numberOfElements) {
+        long byteSize = segment.byteSize();
+        ByteArray byteArray = new ByteArray(numberOfElements, byteSize);
+        MemorySegment.copy(segment, 0, byteArray.segment, byteArray.baseIndex * BYTE_BYTES, byteSize);
+        return byteArray;
+    }
+
     /**
      * Creates a new instance of the {@link ByteArray} class from a {@link ByteBuffer}.
      *
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index d972e60442..b9058bf36c 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -33,7 +33,6 @@ public class TensorQ8 extends Tensor {
     private final int blockSize;
     private final int bytesPerBlock;
 
-//    private static final int HEADER_SIZE = (int) TornadoNativeArray.ARRAY_HEADER;
 
     public TensorQ8(Shape shape) {
         super(DType.QINT8, shape);
@@ -67,6 +66,38 @@ public TensorQ8(Shape shape) {
         this.tensorStorage = new ByteArray(numberOfElements, totalSize);
     }
 
+    public TensorQ8(int numberOfElements, MemorySegment memorySegment) {
+        super(DType.QINT8, new Shape(numberOfElements));
+        this.shape = new Shape(numberOfElements);
+        this.numberOfElements = numberOfElements;
+        this.dType = DType.QINT8;
+        this.blockSize = GGMLType.Q8_0.getBlockSize();
+
+        // Each block contains:
+        // - 2 bytes for float16 scale
+        // - blockSize bytes for quantized values
+        this.bytesPerBlock = Float16.BYTES + blockSize;
+
+        // Calculate number of blocks needed to store all elements
+        int numBlocks = (numberOfElements + blockSize - 1) / blockSize;
+
+        // Calculate total storage size in bytes (block data only; no header is added here)
+        long dataSize = (long)numBlocks * bytesPerBlock;
+        long totalSize = dataSize;
+
+        if (DEBUG_TENSOR_Q8) {
+            System.out.println("Debug info:");
+            System.out.println("Number of elements: " + numberOfElements);
+            System.out.println("Block size: " + blockSize);
+            System.out.println("Bytes per block: " + bytesPerBlock);
+            System.out.println("Number of blocks: " + numBlocks);
+            System.out.println("Data size: " + dataSize);
+            System.out.println("Total size with header: " + totalSize);
+        }
+
+        this.tensorStorage = ByteArray.fromSegment(memorySegment, numberOfElements);
+    }
+
     private float[] getBlockValues(int blockIndex) {
         float[] values = new float[blockSize];
         int blockOffset = blockIndex * bytesPerBlock;

From 483a12e0667d1c02a72ecbf1e03f57ffc25dabd6 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 20:26:39 +0200
Subject: [PATCH 12/15] Minor fix for tensor q8

---
 .../uk/ac/manchester/tornado/api/types/tensors/DType.java    | 5 ++++-
 .../uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java | 1 -
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/DType.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/DType.java
index 39da8b6c77..b169868de1 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/DType.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/DType.java
@@ -67,7 +67,10 @@ public enum DType {
     /**
      * Represents a quantized 8-bit unsigned integer used in specialized applications like machine learning, using 1 byte.
      */
-    QUINT8(1, ValueLayout.JAVA_BYTE);
+    QUINT8(1, ValueLayout.JAVA_BYTE),
+
+    Q4_0(1,  ValueLayout.JAVA_BYTE);
+
     // @formatter:on
 
     /**
diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index b9058bf36c..5bac48f929 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -18,7 +18,6 @@
 package uk.ac.manchester.tornado.api.types.tensors;
 
 import uk.ac.manchester.tornado.api.types.arrays.ByteArray;
-import uk.ac.manchester.tornado.api.types.arrays.TornadoNativeArray;
 
 import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;

From 38c8d817b20af5acc978b61d34326bb53058217d Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 20:27:28 +0200
Subject: [PATCH 13/15] Add TensorQ4 init support

---
 .../tornado/api/types/tensors/TensorQ4.java   | 268 ++++++++++++
 .../unittests/tensors/TestTensorQ4.java       | 392 ++++++++++++++++++
 2 files changed, 660 insertions(+)
 create mode 100644 tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ4.java
 create mode 100644 tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ4.java

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ4.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ4.java
new file mode 100644
index 0000000000..88e9da5260
--- /dev/null
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ4.java
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2024, APT Group, Department of Computer Science,
+ * The University of Manchester.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package uk.ac.manchester.tornado.api.types.tensors;
+
+import uk.ac.manchester.tornado.api.types.arrays.ByteArray;
+
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+
+/**
+ * Tensor whose elements are stored as 4-bit quantized values grouped in blocks.
+ *
+ * <p>
+ * Each block covers {@code blockSize} elements and occupies
+ * {@code Float16.BYTES + blockSize / 2} bytes: a float16 scale followed by
+ * {@code blockSize / 2} bytes of packed values. Inside a block, element {@code i}
+ * of the first half is kept in the low nibble of packed byte {@code i}, and element
+ * {@code i + blockSize / 2} in the high nibble of that same byte. A stored nibble
+ * {@code q} decodes to {@code (q - 8) * scale}.
+ * </p>
+ *
+ * <p>
+ * NOTE(review): block 0 is addressed at byte offset 0 of
+ * {@code tensorStorage.getSegmentWithHeader()} and the allocation is sized for the
+ * blocks only; confirm that this cannot overlap any header {@link ByteArray} keeps
+ * at the start of that segment.
+ * </p>
+ */
+public class TensorQ4 extends Tensor {
+    private static final boolean DEBUG_TENSOR_Q4 = false;
+
+    /** Storage for the block scales and the packed 4-bit values. */
+    private final ByteArray tensorStorage;
+
+    /** Total number of elements in the tensor. */
+    private final int numberOfElements;
+
+    /** Shape information for the tensor. */
+    private final Shape shape;
+
+    /** Data type of the tensor (Q4_0). */
+    private final DType dType;
+
+    /** Number of values in each quantization block. */
+    private final int blockSize;
+
+    /** Total bytes per block including scale and quantized values. */
+    private final int bytesPerBlock;
+
+    /**
+     * Constructs a new Q4 tensor with the specified shape and allocates its storage.
+     *
+     * @param shape The shape of the tensor to create
+     */
+    public TensorQ4(Shape shape) {
+        super(DType.Q4_0, shape);
+        this.shape = shape;
+        this.numberOfElements = shape.getSize();
+        this.dType = DType.Q4_0;
+        this.blockSize = GGMLType.Q4_0.getBlockSize();
+        // A block is a float16 scale plus blockSize / 2 bytes of packed 4-bit values.
+        this.bytesPerBlock = Float16.BYTES + blockSize / 2;
+
+        long totalSize = computeStorageSize(numberOfElements, blockSize, bytesPerBlock);
+        this.tensorStorage = new ByteArray(numberOfElements, totalSize);
+    }
+
+    /**
+     * Constructs a Q4 tensor on top of an existing memory segment.
+     *
+     * @param numberOfElements The number of elements in the tensor
+     * @param memorySegment The memory segment passed to {@code ByteArray.fromSegment}
+     */
+    public TensorQ4(int numberOfElements, MemorySegment memorySegment) {
+        // Register the same data type as the shape-based constructor and the dType field.
+        super(DType.Q4_0, new Shape(numberOfElements));
+        this.shape = new Shape(numberOfElements);
+        this.numberOfElements = numberOfElements;
+        this.dType = DType.Q4_0;
+        this.blockSize = GGMLType.Q4_0.getBlockSize();
+        // A block is a float16 scale plus blockSize / 2 bytes of packed 4-bit values.
+        this.bytesPerBlock = Float16.BYTES + blockSize / 2;
+
+        // The segment is wrapped as supplied; the size is computed only for the debug dump.
+        computeStorageSize(numberOfElements, blockSize, bytesPerBlock);
+        this.tensorStorage = ByteArray.fromSegment(memorySegment, numberOfElements);
+    }
+
+    /**
+     * Computes the number of bytes needed to store the given number of elements.
+     *
+     * @param numberOfElements The number of elements to store
+     * @param blockSize The number of elements per block
+     * @param bytesPerBlock The number of bytes each block occupies
+     * @return The total storage size in bytes
+     */
+    private static long computeStorageSize(int numberOfElements, int blockSize, int bytesPerBlock) {
+        // Round up so a partially filled last block still gets a whole block. The sum is
+        // done in long so it cannot overflow for counts close to Integer.MAX_VALUE.
+        long numBlocks = ((long) numberOfElements + blockSize - 1) / blockSize;
+        long dataSize = numBlocks * bytesPerBlock;
+        long totalSize = dataSize;
+
+        if (DEBUG_TENSOR_Q4) {
+            System.out.println("Debug info:");
+            System.out.println("Number of elements: " + numberOfElements);
+            System.out.println("Block size: " + blockSize);
+            System.out.println("Bytes per block: " + bytesPerBlock);
+            System.out.println("Number of blocks: " + numBlocks);
+            System.out.println("Data size: " + dataSize);
+            System.out.println("Total size: " + totalSize);
+        }
+        return totalSize;
+    }
+
+    /**
+     * Returns the byte offset of the packed byte that holds an element of a block.
+     *
+     * @param blockOffset The byte offset at which the block starts
+     * @param withinBlockIndex The position of the element inside its block
+     * @return The byte offset of the packed byte containing the element
+     */
+    private long packedByteOffset(long blockOffset, int withinBlockIndex) {
+        int half = blockSize / 2;
+        // Elements i and i + half share packed byte i.
+        int byteIndex = withinBlockIndex < half ? withinBlockIndex : withinBlockIndex - half;
+        return blockOffset + Float16.BYTES + byteIndex;
+    }
+
+    /**
+     * Reads the stored 4-bit value of one element.
+     *
+     * @param segment The segment to read from
+     * @param blockOffset The byte offset at which the block starts
+     * @param withinBlockIndex The position of the element inside its block
+     * @return The stored nibble, in the range 0 to 15
+     */
+    private int readQuant(MemorySegment segment, long blockOffset, int withinBlockIndex) {
+        int packed = readByte(segment, packedByteOffset(blockOffset, withinBlockIndex));
+        // First half of the block: low nibble. Second half: high nibble.
+        int nibble = withinBlockIndex < blockSize / 2 ? packed : packed >>> 4;
+        return nibble & 0x0F;
+    }
+
+    /**
+     * Stores the 4-bit value of one element, preserving the other nibble of its byte.
+     *
+     * @param segment The segment to write to
+     * @param blockOffset The byte offset at which the block starts
+     * @param withinBlockIndex The position of the element inside its block
+     * @param quant The value to store, in the range 0 to 15
+     */
+    private void writeQuant(MemorySegment segment, long blockOffset, int withinBlockIndex, int quant) {
+        long byteOffset = packedByteOffset(blockOffset, withinBlockIndex);
+        byte current = readByte(segment, byteOffset);
+        int packed;
+        if (withinBlockIndex < blockSize / 2) {
+            packed = (current & 0xF0) | (quant & 0x0F);
+        } else {
+            packed = (current & 0x0F) | (quant << 4);
+        }
+        writeByte(segment, byteOffset, (byte) packed);
+    }
+
+    /**
+     * Dequantizes every value of one block.
+     *
+     * @param blockIndex The index of the block to read
+     * @return The {@code blockSize} dequantized values of the block
+     * @throws RuntimeException if there is an error reading the block
+     */
+    private float[] getBlockValues(int blockIndex) {
+        float[] values = new float[blockSize];
+        long blockOffset = (long) blockIndex * bytesPerBlock;
+
+        try {
+            MemorySegment segment = tensorStorage.getSegmentWithHeader();
+            float scale = Float.float16ToFloat(readShort(segment, blockOffset));
+            for (int i = 0; i < blockSize; i++) {
+                // Stored nibbles are offset by 8, so subtracting 8 recentres them on [-8, 7].
+                values[i] = (readQuant(segment, blockOffset, i) - 8) * scale;
+            }
+        } catch (Exception e) {
+            // Keep the original exception as the cause so the stack trace is not lost.
+            throw new RuntimeException("Failed to read block " + blockIndex + " at offset " + blockOffset + ": " + e.getMessage(), e);
+        }
+        return values;
+    }
+
+    /**
+     * Gets a single float value from the tensor at the specified index.
+     * The value is dequantized using the scale factor from its containing block.
+     *
+     * @param index The index of the value to retrieve
+     * @return The dequantized float value
+     * @throws IndexOutOfBoundsException if the index is out of bounds
+     * @throws RuntimeException if there is an error reading the value
+     */
+    public float getFloat(int index) {
+        if (index < 0 || index >= numberOfElements) {
+            throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
+        }
+
+        int blockIndex = index / blockSize;
+        int withinBlockIndex = index % blockSize;
+        long blockOffset = (long) blockIndex * bytesPerBlock;
+
+        try {
+            MemorySegment segment = tensorStorage.getSegmentWithHeader();
+            float scale = Float.float16ToFloat(readShort(segment, blockOffset));
+            // Stored nibbles are offset by 8, so subtracting 8 recentres them on [-8, 7].
+            return (readQuant(segment, blockOffset, withinBlockIndex) - 8) * scale;
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to get float at index " + index + " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Sets a float value in the tensor at the specified index.
+     * The whole containing block is re-quantized with a scale recomputed from its
+     * current values, so other values of the block may change slightly.
+     *
+     * @param index The index where the value should be set
+     * @param value The float value to set
+     * @throws IndexOutOfBoundsException if the index is out of bounds
+     * @throws RuntimeException if there is an error reading or writing the block
+     */
+    public void setFloat(int index, float value) {
+        if (index < 0 || index >= numberOfElements) {
+            throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
+        }
+
+        int blockIndex = index / blockSize;
+        int withinBlockIndex = index % blockSize;
+
+        // Dequantize the block, patch in the new value and derive the new scale.
+        float[] blockValues = getBlockValues(blockIndex);
+        blockValues[withinBlockIndex] = value;
+        float scale = computeOptimalScale(blockValues);
+
+        long blockOffset = (long) blockIndex * bytesPerBlock;
+
+        try {
+            MemorySegment segment = tensorStorage.getSegmentWithHeader();
+            writeShort(segment, blockOffset, Float.floatToFloat16(scale));
+
+            for (int i = 0; i < blockValues.length; i++) {
+                // Shift by 8 into [0, 15] and clamp before narrowing to a nibble.
+                int quant = Math.round(blockValues[i] / scale) + 8;
+                quant = Math.min(15, Math.max(0, quant));
+                writeQuant(segment, blockOffset, i, quant);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to set float at index " + index + " (block " + blockIndex + ", offset " + blockOffset + "): " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Computes the scale factor for a block of values.
+     * The largest absolute value of the block (at least 1e-5) is mapped to 7.
+     *
+     * @param values The array of float values to compute the scale for
+     * @return The scale factor for quantizing the values
+     */
+    private float computeOptimalScale(float[] values) {
+        float maxAbs = 1e-5f;
+        for (float value : values) {
+            maxAbs = Math.max(maxAbs, Math.abs(value));
+        }
+        return maxAbs / 7.0f;  // Scale to [-7, 7] range for 4-bit values
+    }
+
+    static short readShort(MemorySegment memorySegment, long offset) {
+        return memorySegment.get(ValueLayout.JAVA_SHORT, offset);
+    }
+
+    static byte readByte(MemorySegment memorySegment, long offset) {
+        return memorySegment.get(ValueLayout.JAVA_BYTE, offset);
+    }
+
+    static void writeShort(MemorySegment memorySegment, long offset, short value) {
+        memorySegment.set(ValueLayout.JAVA_SHORT, offset, value);
+    }
+
+    static void writeByte(MemorySegment memorySegment, long offset, byte value) {
+        memorySegment.set(ValueLayout.JAVA_BYTE, offset, value);
+    }
+
+    @Override
+    public Shape getShape() {
+        return shape;
+    }
+
+    @Override
+    public String getDTypeAsString() {
+        return dType.toString();
+    }
+
+    @Override
+    public DType getDType() {
+        return dType;
+    }
+
+    @Override
+    public int getSize() {
+        return shape.getSize();
+    }
+
+    @Override
+    public MemorySegment getSegment() {
+        return tensorStorage.getSegmentWithHeader();
+    }
+
+    @Override
+    public MemorySegment getSegmentWithHeader() {
+        return tensorStorage.getSegmentWithHeader();
+    }
+
+    @Override
+    public long getNumBytesOfSegmentWithHeader() {
+        return tensorStorage.getNumBytesOfSegmentWithHeader();
+    }
+
+    @Override
+    public long getNumBytesOfSegment() {
+        return tensorStorage.getNumBytesOfSegment();
+    }
+
+    @Override
+    protected void clear() {
+
+    }
+
+    @Override
+    public int getElementSize() {
+        return dType.getByteSize();
+    }
+}
\ No newline at end of file
diff --git a/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ4.java b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ4.java
new file mode 100644
index 0000000000..4f35d7fdd7
--- /dev/null
+++ b/tornado-unittests/src/main/java/uk/ac/manchester/tornado/unittests/tensors/TestTensorQ4.java
@@ -0,0 +1,413 @@
+package uk.ac.manchester.tornado.unittests.tensors;
+
+import org.junit.Assert;
+import org.junit.Test;
+import uk.ac.manchester.tornado.api.types.tensors.GGMLType;
+import uk.ac.manchester.tornado.api.types.tensors.Shape;
+import uk.ac.manchester.tornado.api.types.tensors.TensorQ4;
+import uk.ac.manchester.tornado.unittests.common.TornadoTestBase;
+
+/**
+ * Unit tests for the get/set round trip of {@link TensorQ4}.
+ *
+ * <p>
+ * How to run?
+ * </p>
+ * <code>
+ * tornado-test -V uk.ac.manchester.tornado.unittests.tensors.TestTensorQ4
+ * </code>
+ */
+public class TestTensorQ4 extends TornadoTestBase {
+    private static final boolean VERBOSE = false;
+
+    private void printVerbose(String message) {
+        if (VERBOSE) {
+            System.out.println(message);
+        }
+    }
+
+    private void printVerboseF(String format, Object... args) {
+        if (VERBOSE) {
+            System.out.printf(format, args);
+        }
+    }
+
+    /** A single value in a one-element tensor survives a set/get round trip. */
+    @Test
+    public void testBasicQuantization() {
+        Shape shape = new Shape(1);
+        TensorQ4 tensor = new TensorQ4(shape);
+
+        float testValue = 1.0f;
+        tensor.setFloat(0, testValue);
+        float retrieved = tensor.getFloat(0);
+        printVerboseF("Segment size for storing single value %d%n", tensor.getSegment().byteSize());
+        Assert.assertEquals(testValue, retrieved, 0.2f);
+    }
+
+    /** Values between -8 and 6 written to one block are read back within 0.6. */
+    @Test
+    public void testFourBitRange() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Test a single block to maintain consistent scale
+        float[] boundaryValues = {
+                -8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 2.0f, 4.0f, 6.0f
+        };
+
+        printVerbose("\nTesting 4-bit range quantization:");
+        for (int i = 0; i < boundaryValues.length; i++) {
+            tensorQ4.setFloat(i, boundaryValues[i]);
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("4-bit value test: Setting %.1f, got %.1f%n",
+                    boundaryValues[i], retrieved);
+            // Increased tolerance to account for quantization steps
+            Assert.assertEquals("Value mismatch at 4-bit value " + boundaryValues[i],
+                    boundaryValues[i], retrieved, 0.6f);
+        }
+    }
+
+    /** Values written to the first entries of a block are read back within 0.5. */
+    @Test
+    public void testPackedValues() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Test both nibbles of each byte with values well within quantization range
+        float[] values = {-4.0f, -2.0f, 0.0f, 2.0f, 4.0f, -4.0f, -2.0f, 2.0f};
+
+        printVerbose("\nTesting packed 4-bit storage:");
+        for (int i = 0; i < values.length; i++) {
+            tensorQ4.setFloat(i, values[i]);
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("Packed index %d: Set=%.1f Got=%.1f%n",
+                    i, values[i], retrieved);
+            Assert.assertEquals("Value mismatch for packed storage",
+                    values[i], retrieved, 0.5f);
+        }
+    }
+
+    /** Writing a second block leaves the first block within 0.6 of its inputs. */
+    @Test
+    public void testBlockScaleInterference() {
+        int blockSize = GGMLType.Q4_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 2);
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        printVerbose("\nTesting block scale interference:");
+
+        // Use values well within the 4-bit quantization range
+        for (int i = 0; i < blockSize; i++) {
+            float value = -4.0f + (8.0f * i / blockSize); // Range from -4 to 4
+            tensorQ4.setFloat(i, value);
+            printVerboseF("Block 1 index %d: Set=%.6f%n", i, value);
+        }
+
+        for (int i = 0; i < blockSize; i++) {
+            float value = -2.0f + (4.0f * i / blockSize); // Range from -2 to 2
+            tensorQ4.setFloat(blockSize + i, value);
+            printVerboseF("Block 2 index %d: Set=%.6f%n", i, value);
+        }
+
+        // Verify first block maintained reasonable accuracy
+        for (int i = 0; i < blockSize; i++) {
+            float expected = -4.0f + (8.0f * i / blockSize);
+            float retrieved = tensorQ4.getFloat(i);
+            float absError = Math.abs(retrieved - expected);
+
+            printVerboseF("Block 1 verification index %d: Expected=%.6f Got=%.6f AbsError=%.6f%n",
+                    i, expected, retrieved, absError);
+
+            Assert.assertTrue("Block 1 accuracy lost after block 2 update",
+                    absError < 0.6f);
+        }
+    }
+
+    /** Sixteen evenly spaced values from -7 to 7 are read back within 0.6. */
+    @Test
+    public void testFullRangeQuantization() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Test evenly spaced values within quantization range
+        float[] testValues = new float[16];
+        for (int i = 0; i < 16; i++) {
+            testValues[i] = -7.0f + (i * 14.0f / 15.0f); // Range from -7 to 7
+        }
+
+        printVerbose("\nTesting quantization range:");
+        for (int i = 0; i < testValues.length; i++) {
+            tensorQ4.setFloat(i, testValues[i]);
+            float retrieved = tensorQ4.getFloat(i);
+
+            printVerboseF("Step %2d: Set=%.3f Got=%.3f%n",
+                    i, testValues[i], retrieved);
+
+            float absError = Math.abs(retrieved - testValues[i]);
+            Assert.assertTrue(
+                    String.format("Excessive quantization error: expected=%.3f, got=%.3f, error=%.3f",
+                            testValues[i], retrieved, absError),
+                    absError < 0.6f);
+        }
+    }
+
+    /** A repeating pattern filling one block is read back within 0.5 after each write. */
+    @Test
+    public void testTensorQ4SetAndGetFloatVerify() {
+        int blockSize = GGMLType.Q4_0.getBlockSize();
+        Shape shape = new Shape(blockSize);
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Use values within Q4 range (-8 to 7)
+        float[] pattern = {0.5f, -1.0f, 4.0f, -6.0f, 0.0f};
+        float[] valuesToSet = new float[blockSize];
+        for (int i = 0; i < blockSize; i++) {
+            valuesToSet[i] = pattern[i % pattern.length];
+        }
+
+        printVerboseF("Total elements: %d%n", shape.getSize());
+        printVerboseF("Block size: %d%n", blockSize);
+        printVerboseF("Total allocated bytes: %d%n", tensorQ4.getSegment().byteSize());
+
+        for (int i = 0; i < valuesToSet.length; i++) {
+            tensorQ4.setFloat(i, valuesToSet[i]);
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("Index %d: Set=%.2f Retrieved=%.2f%n",
+                    i, valuesToSet[i], retrieved);
+            Assert.assertEquals("Value mismatch at index " + i,
+                    valuesToSet[i], retrieved, 0.5f);
+        }
+    }
+
+    /** Increasing positive values up to 4.0 are read back with relative error below 0.3. */
+    @Test
+    public void testSingleBlockPrecision() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        float baseValue = 4.0f;  // Smaller base value for Q4 range
+
+        printVerbose("\nTesting single block precision:");
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = baseValue * (i + 1) / shape.getSize();
+            tensorQ4.setFloat(i, value);
+            float retrieved = tensorQ4.getFloat(i);
+            float relativeError = Math.abs((retrieved - value) / value);
+
+            printVerboseF("Index %d: Set=%.6f Got=%.6f RelError=%.6f%n",
+                    i, value, retrieved, relativeError);
+
+            Assert.assertTrue(
+                    String.format("Relative error too large at index %d: expected=%.6f, got=%.6f, relative error=%.6f",
+                            i, value, retrieved, relativeError),
+                    relativeError < 0.3f);  // Higher tolerance for Q4
+        }
+    }
+
+    /** Values with fractional digits are read back with relative error below 0.2. */
+    @Test
+    public void testMaximumPrecisionValues() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        printVerbose("\nTesting maximum precision values:");
+
+        float[] preciseValues = {
+                1.234f,
+                -1.234f,
+                3.456f,
+                -3.456f,
+                6.789f,
+                -6.789f
+        };
+
+        for (int i = 0; i < preciseValues.length; i++) {
+            tensorQ4.setFloat(i, preciseValues[i]);
+            float retrieved = tensorQ4.getFloat(i);
+            float relativeError = Math.abs((retrieved - preciseValues[i]) / preciseValues[i]);
+
+            printVerboseF("Precise value test %d: Set=%.6f Got=%.6f RelError=%.6f%n",
+                    i, preciseValues[i], retrieved, relativeError);
+
+            Assert.assertTrue(
+                    String.format("Precision lost: expected=%.6f, got=%.6f, error=%.6f",
+                            preciseValues[i], retrieved, relativeError),
+                    relativeError < 0.2f);
+        }
+    }
+
+    /** Three blocks filled with 2, 4 and 6 are read back within 0.5. */
+    @Test
+    public void testSequentialBlockUpdates() {
+        int blockSize = GGMLType.Q4_0.getBlockSize();
+        Shape shape = new Shape(blockSize * 3);
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        printVerbose("\nTesting sequential block updates:");
+
+        // Sequential updates with Q4-appropriate values
+        for (int block = 0; block < 3; block++) {
+            float blockValue = (block + 1) * 2.0f;  // Values: 2, 4, 6
+            printVerboseF("\nSetting block %d to %.2f:%n", block, blockValue);
+
+            for (int i = 0; i < blockSize; i++) {
+                int index = block * blockSize + i;
+                tensorQ4.setFloat(index, blockValue);
+                float retrieved = tensorQ4.getFloat(index);
+                printVerboseF("Index %d: Set=%.6f Got=%.6f%n",
+                        index, blockValue, retrieved);
+                Assert.assertEquals("Sequential block update failed",
+                        blockValue, retrieved, 0.5f);
+            }
+        }
+    }
+
+    /** Values written at every quarter of a block are read back within 0.5. */
+    @Test
+    public void testNibbleBoundaryUpdates() {
+        // Test updating values at nibble boundaries
+        int blockSize = GGMLType.Q4_0.getBlockSize();
+        Shape shape = new Shape(blockSize);
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Set values around nibble boundaries
+        float[] values = {1.0f, -1.0f, 2.0f, -2.0f};
+
+        // Test boundaries between nibbles
+        for (int i = 0; i < values.length; i++) {
+            int index = (i * blockSize/4);  // Space out across block
+            tensorQ4.setFloat(index, values[i]);
+            float retrieved = tensorQ4.getFloat(index);
+            printVerboseF("Nibble boundary %d: Set=%.6f Got=%.6f%n",
+                    index, values[i], retrieved);
+            Assert.assertEquals("Value mismatch at nibble boundary",
+                    values[i], retrieved, 0.5f);
+        }
+    }
+
+    /** Alternating 1 and -1 across a block are read back within 0.5. */
+    @Test
+    public void testAlternatingNibblePatterns() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        printVerbose("\nTesting alternating nibble pattern:");
+
+        // Set alternating values across nibble boundaries
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = (i % 2 == 0) ? 1.0f : -1.0f;
+            tensorQ4.setFloat(i, value);
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("Index %d: Set=%.6f Got=%.6f%n",
+                    i, value, retrieved);
+            Assert.assertEquals("Alternating pattern not preserved",
+                    value, retrieved, 0.5f);
+        }
+    }
+
+    /**
+     * Sixteen values spaced 0.5 apart are read back close to their inputs and on a
+     * common quantization grid.
+     */
+    @Test
+    public void testNibblePackingConsistency() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Inputs spaced 0.5 apart; the largest magnitude is 4.0
+        float[] expectedValues = {
+                -4.0f, -3.5f, -3.0f, -2.5f,
+                -2.0f, -1.5f, -1.0f, -0.5f,
+                0.0f, 0.5f, 1.0f, 1.5f,
+                2.0f, 2.5f, 3.0f, 3.5f
+        };
+
+        printVerbose("\nTesting nibble packing consistency:");
+
+        // Set values
+        for (int i = 0; i < expectedValues.length; i++) {
+            tensorQ4.setFloat(i, expectedValues[i]);
+        }
+
+        // Verify quantization
+        for (int i = 0; i < expectedValues.length; i++) {
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("Pattern %2d: Set=%.4f Got=%.4f Diff=%.4f%n",
+                    i, expectedValues[i], retrieved,
+                    Math.abs(expectedValues[i] - retrieved));
+
+            // Allow an error of up to one input spacing
+            float tolerance = 0.5f;
+            Assert.assertTrue(
+                    String.format("Quantization error too large at index %d: expected=%.4f, got=%.4f",
+                            i, expectedValues[i], retrieved),
+                    Math.abs(retrieved - expectedValues[i]) <= tolerance
+            );
+        }
+
+        // The block scale is maxAbs / 7 = 4 / 7, which is wider than the 0.5 input spacing.
+        // Sixteen inputs therefore cannot all land on distinct levels, so neighbouring
+        // values may be equal; every gap must still be zero or exactly one grid step.
+        float gridStep = 4.0f / 7.0f;
+        float gridTolerance = 0.01f;  // Covers the float16 rounding of the stored scale
+        printVerbose("\nVerifying nibble boundaries:");
+        for (int i = 0; i < expectedValues.length - 1; i++) {
+            float current = tensorQ4.getFloat(i);
+            float next = tensorQ4.getFloat(i + 1);
+            printVerboseF("Neighbours %d/%d: %.4f %.4f%n", i, i + 1, current, next);
+
+            float gap = next - current;
+            Assert.assertTrue(
+                    String.format("Gap %.4f at index %d is not 0 or one grid step (%.4f)", gap, i, gridStep),
+                    Math.abs(gap) <= gridTolerance || Math.abs(gap - gridStep) <= gridTolerance
+            );
+        }
+    }
+
+    /** Values stepping from -7 towards 7 across a block are read back within 0.5. */
+    @Test
+    public void testGradualValueTransitions() {
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        // Test gradual transitions to check quantization steps
+        float step = 14.0f / shape.getSize();  // Range from -7 to 7
+        for (int i = 0; i < shape.getSize(); i++) {
+            float value = -7.0f + (step * i);
+            tensorQ4.setFloat(i, value);
+            float retrieved = tensorQ4.getFloat(i);
+            printVerboseF("Step %d: Set=%.3f Got=%.3f%n",
+                    i, value, retrieved);
+            Assert.assertEquals("Gradual transition not preserved",
+                    value, retrieved, 0.5f);
+        }
+    }
+
+    /** Pairs of opposite values are read back within 0.5 and with matching magnitudes. */
+    @Test
+    public void testQ4Symmetry() {
+        // Test symmetry of positive and negative values
+        Shape shape = new Shape(GGMLType.Q4_0.getBlockSize());
+        TensorQ4 tensorQ4 = new TensorQ4(shape);
+
+        for (int i = 0; i <= 7; i++) {
+            float positive = i * 1.0f;
+            float negative = -positive;
+
+            tensorQ4.setFloat(i * 2, positive);
+            tensorQ4.setFloat(i * 2 + 1, negative);
+
+            float retrievedPos = tensorQ4.getFloat(i * 2);
+            float retrievedNeg = tensorQ4.getFloat(i * 2 + 1);
+
+            printVerboseF("Symmetry test %d: +%.1f->%.1f, %.1f->%.1f%n",
+                    i, positive, retrievedPos, negative, retrievedNeg);
+
+            Assert.assertEquals("Positive value not preserved", positive, retrievedPos, 0.5f);
+            Assert.assertEquals("Negative value not preserved", negative, retrievedNeg, 0.5f);
+            Assert.assertEquals("Asymmetric quantization",
+                    Math.abs(retrievedPos), Math.abs(retrievedNeg), 0.1f);
+        }
+    }
+}

From 2e99bba1e70a09f68004118fa4342b2e4618bf91 Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Sat, 16 Nov 2024 20:32:30 +0200
Subject: [PATCH 14/15] Add javadocs on key methods

---
 .../tornado/api/types/tensors/TensorQ8.java   | 50 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
index 5bac48f929..ae3cf26530 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/tensors/TensorQ8.java
@@ -24,15 +24,30 @@
 
 public class TensorQ8 extends Tensor {
     private final boolean  DEBUG_TENSOR_Q8 = false;
+    /** Storage for the quantized tensor data including scales and values. */
     private final ByteArray tensorStorage;
+
+    /** Total number of elements in the tensor. */
     private final int numberOfElements;
+
+    /** Shape information for the tensor. */
     private final Shape shape;
+
+    /** Data type of the tensor (QINT8). */
     private final DType dType;
 
+    /** Number of values in each quantization block. */
     private final int blockSize;
-    private final int bytesPerBlock;
 
+    /** Total bytes per block including scale and quantized values. */
+    private final int bytesPerBlock;
 
+    /**
+     * Constructs a new Q8 tensor with the specified shape.
+     * Allocates memory and initializes the tensor storage.
+     *
+     * @param shape The shape of the tensor to create
+     */
     public TensorQ8(Shape shape) {
         super(DType.QINT8, shape);
         this.shape = shape;
@@ -65,6 +80,13 @@ public TensorQ8(Shape shape) {
         this.tensorStorage = new ByteArray(numberOfElements, totalSize);
     }
 
+    /**
+     * Constructs a Q8 tensor using existing memory segment data.
+     * Used for creating a tensor view of pre-existing quantized data.
+     *
+     * @param numberOfElements The number of elements in the tensor
+     * @param memorySegment The memory segment containing the quantized data
+     */
     public TensorQ8(int numberOfElements, MemorySegment memorySegment) {
         super(DType.QINT8, new Shape(numberOfElements));
         this.shape = new Shape(numberOfElements);
@@ -113,6 +135,15 @@ private float[] getBlockValues(int blockIndex) {
         return values;
     }
 
+    /**
+     * Gets a single float value from the tensor at the specified index.
+     * The value is dequantized using the scale factor from its containing block.
+     *
+     * @param index The index of the value to retrieve
+     * @return The dequantized float value
+     * @throws IndexOutOfBoundsException if the index is out of bounds
+     * @throws RuntimeException if there is an error reading the value
+     */
     public float getFloat(int index) {
         if (index < 0 || index >= numberOfElements) {
             throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
@@ -131,6 +162,15 @@ public float getFloat(int index) {
         }
     }
 
+    /**
+     * Sets a float value in the tensor at the specified index.
+     * Updates the entire block's scale factor when any value in the block changes.
+     *
+     * @param index The index where the value should be set
+     * @param value The float value to set
+     * @throws IndexOutOfBoundsException if the index is out of bounds
+     * @throws RuntimeException if there is an error writing the value
+     */
     public void setFloat(int index, float value) {
         if (index < 0 || index >= numberOfElements) {
             throw new IndexOutOfBoundsException("Index " + index + " out of bounds for length " + numberOfElements);
@@ -163,6 +203,14 @@ public void setFloat(int index, float value) {
         }
     }
 
+    /**
+     * Computes the optimal scale factor for a block of values.
+     * The scale is chosen to maximize the use of the INT8 range (-128 to 127).
+     *
+     * @param values The array of float values to compute the scale for
+     * @return The optimal scale factor for quantizing the values
+     */
+
     private float computeOptimalScale(float[] values) {
         float maxAbs = 1e-5f;
         for (float value : values) {

From 9835e4e930932168abb12672c905045d576ef8de Mon Sep 17 00:00:00 2001
From: mikepapadim <mikepapadim@hotmail.com>
Date: Wed, 20 Nov 2024 16:05:08 +0000
Subject: [PATCH 15/15] Fix for not initialized bytes

---
 .../uk/ac/manchester/tornado/api/types/arrays/ByteArray.java     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
index e7eb7f88c7..5b2f2ba6d3 100644
--- a/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
+++ b/tornado-api/src/main/java/uk/ac/manchester/tornado/api/types/arrays/ByteArray.java
@@ -66,6 +66,7 @@ public ByteArray(int numberOfElements) {
     public ByteArray(int numberOfElements, long requiredStorageSize) {
         this.numberOfElements = numberOfElements;
         baseIndex=0;
+        this.segmentByteSize = requiredStorageSize; // same size that is passed to allocate() on the next line
         segment = Arena.ofAuto().allocate(requiredStorageSize, 1);
         segment.setAtIndex(JAVA_INT, 0, numberOfElements);
     }