From 14429c1d82283a8140423304920aec77c05bddce Mon Sep 17 00:00:00 2001
From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
Date: Wed, 6 Nov 2024 10:19:42 +0100
Subject: [PATCH 1/3] stash trash

---
 crates/ratchet-core/src/cpu/reindex.rs       | 94 ++++++++++++++++++-
 crates/ratchet-core/src/cpu/utils.rs         | 73 +++++++++++++-
 .../ratchet-core/src/ops/reindex/permute.rs  | 29 +++++-
 3 files changed, 189 insertions(+), 7 deletions(-)

diff --git a/crates/ratchet-core/src/cpu/reindex.rs b/crates/ratchet-core/src/cpu/reindex.rs
index d3953484..c4cd0394 100644
--- a/crates/ratchet-core/src/cpu/reindex.rs
+++ b/crates/ratchet-core/src/cpu/reindex.rs
@@ -1,16 +1,106 @@
-use super::utils::cpu_store_result;
-use crate::{CPUOperation, DType, OperationError, Reindex, Slice, Strides, Tensor, TensorDType};
+use super::utils::{cpu_store_result, StridedIterator};
+use crate::{
+    CPUOperation, DType, OperationError, Permute, Reindex, Shape, Slice, Strides, Tensor,
+    TensorDType,
+};
 use half::{bf16, f16};
+use ndarray::ShapeBuilder;
+use pyo3::ffi::PyExc_FutureWarning;
+use std::ops::Range;
 
 impl CPUOperation for Reindex {
     fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
         match self {
+            Reindex::Permute(p) => {
+                println!("Permute: {:?}", p);
+                p.apply_cpu(dst)
+            }
             Reindex::Slice(s) => s.apply_cpu(dst),
             _ => todo!(),
         }
     }
 }
 
+impl CPUOperation for Permute {
+    fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
+        match dst.dt() {
+            DType::F32 => apply_permute::<f32>(self, dst),
+            _ => todo!(),
+        }
+    }
+}
+
+fn apply_permute<T: TensorDType>(
+    Permute { src, dims }: &Permute,
+    dst: Tensor,
+) -> Result<Tensor, OperationError> {
+    let result = permute(&src.to_vec::<T>()?, src.shape(), dims);
+    cpu_store_result(&dst, &result);
+    Ok(dst)
+}
+
+fn get_strided_index(idx: usize, num_dims: usize, dims: &[usize], strides: &[isize]) -> usize {
+    let mut idx = idx; // 2
+    let mut strided_i: usize = 0;
+    println!("strides: {strides:?}");
+    println!("dims: {dims:?}");
+    print!("{idx} -> ");
+    for d in 0..num_dims {
+        let dim_idx = num_dims - 1 - d;
+        strided_i += (idx % dims[dim_idx]) * strides[dim_idx] as usize;
+        idx /= dims[dim_idx];
+        print!("{idx}|{dim_idx}|{strided_i}, ");
+    }
+    print!("\n");
+    return strided_i;
+}
+
+pub(crate) fn permute<T: TensorDType>(src: &[T], shape: &Shape, dims: &[usize]) -> Vec<T> {
+    /*
+    // simplify shape
+    // 1. remove dimensions with size 0..1
+    // 2. consecutive dimensions can be merged
+    // 3. remove dimensions that are not in the permutation
+    let mut ranges: Vec<Range<usize>> = vec![];
+    let mut dims_s: Vec<usize> = vec![];
+
+    let mut start = 0;
+    let mut end = 0;
+    for i in 0..dims.len() - 1 {
+        if dims[i] + 1 == dims[i + 1] {
+            end = i;
+        } else {
+            ranges.push(start..end);
+            start = end;
+        }
+    }
+    ranges.push(start..);
+
+    println!("ranges: {:?}", ranges);
+    println!("dims_s: {:?}", dims_s);
+
+    if ranges.len() <= 1 {
+        return src.to_vec();
+    }
+
+    */
+    let mut dst = vec![T::zero(); shape.numel()];
+
+    let strides = Strides::from(shape).to_vec();
+
+    let mut p_shape = vec![0; shape.rank()];
+    for i in 0..shape.rank() {
+        p_shape[dims[i]] = shape[i];
+    }
+
+    for i in 0..src.len() {
+        let strided_idx = get_strided_index(i, dims.len(), p_shape.as_slice(), &strides);
+        println!("{i} -> {strided_idx}");
+        dst[i] = src[strided_idx];
+    }
+    dst
+}
+
 impl CPUOperation for Slice {
     fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
         match dst.dt() {
diff --git a/crates/ratchet-core/src/cpu/utils.rs b/crates/ratchet-core/src/cpu/utils.rs
index e61facde..a730681b 100644
--- a/crates/ratchet-core/src/cpu/utils.rs
+++ b/crates/ratchet-core/src/cpu/utils.rs
@@ -1,6 +1,77 @@
-use crate::{CPUBuffer, Storage, Tensor};
+use crate::{CPUBuffer, Shape, Storage, Strides, Tensor};
 use bytemuck::NoUninit;
 
+pub struct StridedIterator<'a> {
+    shape: &'a Shape,
+    strides: &'a Strides,
+    next_index: Option<usize>,
+    multi_index: Vec<usize>,
+}
+
+impl<'a> StridedIterator<'a> {
+    pub fn new(shape: &'a Shape, strides: &'a Strides, start_offset: usize) -> Self {
+        Self {
+            shape,
+            strides,
+            next_index: if shape.numel() == 0 {
+                None
+            } else {
+                Some(start_offset)
+            },
+            multi_index: vec![0; shape.len()],
+        }
+    }
+}
+
+impl<'a> Iterator for StridedIterator<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let storage_index = match self.next_index {
+            None => return None,
+            Some(storage_index) => storage_index,
+        };
+        let mut updated = false;
+        let mut next_storage_index = storage_index;
+        for ((multi_i, max_i), stride_i) in self
+            .multi_index
+            .iter_mut()
+            .zip(self.shape.iter())
+            .zip(self.strides.iter())
+            .rev()
+        {
+            let next_i = *multi_i + 1;
+            if next_i < *max_i {
+                *multi_i = next_i;
+                updated = true;
+                next_storage_index += *stride_i as usize;
+                break;
+            } else {
+                next_storage_index -= *multi_i * *stride_i as usize;
+                *multi_i = 0
+            }
+        }
+        self.next_index = if updated {
+            Some(next_storage_index)
+        } else {
+            None
+        };
+        Some(storage_index)
+    }
+}
+
+impl<'a> From<(&'a Shape, &'a Strides)> for StridedIterator<'a> {
+    fn from((shape, strides): (&'a Shape, &'a Strides)) -> Self {
+        StridedIterator::new(shape, strides, 0)
+    }
+}
+
+impl<'a> From<(&'a Shape, &'a Strides, usize)> for StridedIterator<'a> {
+    fn from((shape, strides, offset): (&'a Shape, &'a Strides, usize)) -> Self {
+        StridedIterator::new(shape, strides, offset)
+    }
+}
+
 pub fn cpu_store_result<T: NoUninit>(dst: &Tensor, data: &[T]) {
     dst.update_storage(Storage::CPU(CPUBuffer::from_slice(data, dst.shape())));
 }
diff --git a/crates/ratchet-core/src/ops/reindex/permute.rs b/crates/ratchet-core/src/ops/reindex/permute.rs
index b032eb8c..13d8a764 100644
--- a/crates/ratchet-core/src/ops/reindex/permute.rs
+++ b/crates/ratchet-core/src/ops/reindex/permute.rs
@@ -111,9 +111,8 @@ def permute(a):
         run_py_prg(prg.to_string(), &[a], &[], a.dt())
     }
 
-    fn run_reindex_trial(prob: PermuteProblem) -> anyhow::Result<()> {
+    fn run_reindex_trial(prob: PermuteProblem, device: Device) -> anyhow::Result<()> {
         let PermuteProblem { op } = prob;
-        let device = Device::request_device(DeviceRequest::GPU).unwrap();
 
         let a = op.src.clone();
         let a_gpu = a.to(&device)?;
@@ -125,7 +124,29 @@ def permute(a):
     }
 
     #[proptest(cases = 16)]
-    fn test_permute(prob: PermuteProblem) {
-        run_reindex_trial(prob).unwrap();
+    fn test_permute_gpu(prob: PermuteProblem) {
+        let device = Device::request_device(DeviceRequest::GPU).unwrap();
+        run_reindex_trial(prob, device).unwrap();
+    }
+
+    //#[proptest(cases = 16)]
+    //fn test_permute_cpu(prob: PermuteProblem) {
+    #[test]
+    fn test_permute_cpu() {
+        let device = Device::request_device(DeviceRequest::CPU).unwrap();
+        let t = Tensor::from_data(
+            &[
+                0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+            ],
+            Shape::from(vec![2usize, 4, 2, 1]),
+            device.clone(),
+        );
+        let prob = PermuteProblem {
+            op: Permute {
+                src: t,
+                dims: vec![3, 1, 2, 0],
+            },
+        };
+        run_reindex_trial(prob, device).unwrap();
     }
 }

From 7d6311b0a5483643e9124fe1f0c24e3b4f0a5950 Mon Sep 17 00:00:00 2001
From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:45:06 -0400
Subject: [PATCH 2/3] chore: add permute test

---
 .../ratchet-core/src/ops/reindex/permute.rs  | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/crates/ratchet-core/src/ops/reindex/permute.rs b/crates/ratchet-core/src/ops/reindex/permute.rs
index 13d8a764..01f0aeee 100644
--- a/crates/ratchet-core/src/ops/reindex/permute.rs
+++ b/crates/ratchet-core/src/ops/reindex/permute.rs
@@ -129,24 +129,9 @@ def permute(a):
         run_reindex_trial(prob, device).unwrap();
     }
 
-    //#[proptest(cases = 16)]
-    //fn test_permute_cpu(prob: PermuteProblem) {
-    #[test]
-    fn test_permute_cpu() {
+    #[proptest(cases = 16)]
+    fn test_permute_cpu(prob: PermuteProblem) {
         let device = Device::request_device(DeviceRequest::CPU).unwrap();
-        let t = Tensor::from_data(
-            &[
-                0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
-            ],
-            Shape::from(vec![2usize, 4, 2, 1]),
-            device.clone(),
-        );
-        let prob = PermuteProblem {
-            op: Permute {
-                src: t,
-                dims: vec![3, 1, 2, 0],
-            },
-        };
         run_reindex_trial(prob, device).unwrap();
     }
 }

From a4932d5d4fd8800eea9635687435284628ed6ec2 Mon Sep 17 00:00:00 2001
From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:46:59 -0400
Subject: [PATCH 3/3] chore: restructure reindex ops

---
 crates/ratchet-core/src/cpu/reindex.rs | 242 +++++++++++++------------
 1 file changed, 122 insertions(+), 120 deletions(-)

diff --git a/crates/ratchet-core/src/cpu/reindex.rs b/crates/ratchet-core/src/cpu/reindex.rs
index 94903a3c..fbb378ea 100644
--- a/crates/ratchet-core/src/cpu/reindex.rs
+++ b/crates/ratchet-core/src/cpu/reindex.rs
@@ -21,124 +21,6 @@ impl CPUOperation for Reindex {
     }
 }
 
-impl CPUOperation for Broadcast {
-    fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
-        match dst.dt() {
-            DType::F32 => apply_broadcast::<f32>(self, dst),
-            DType::BF16 => apply_broadcast::<bf16>(self, dst),
-            DType::F16 => apply_broadcast::<f16>(self, dst),
-            DType::I32 => apply_broadcast::<i32>(self, dst),
-            DType::U32 => apply_broadcast::<u32>(self, dst),
-            _ => todo!(),
-        }
-    }
-}
-
-fn apply_broadcast<T: TensorDType>(b: &Broadcast, dst: Tensor) -> Result<Tensor, OperationError> {
-    let result = broadcast(&b.src.to_vec::<T>()?, b.src.shape(), b.to());
-    cpu_store_result(&dst, &result);
-    Ok(dst)
-}
-
-pub(crate) fn broadcast<T: TensorDType>(src: &[T], src_shape: &Shape, dst_shape: &Shape) -> Vec<T> {
-    let mut result = vec![T::zero(); dst_shape.numel()];
-
-    if src_shape.is_scalar() {
-        // Life is simple
-        result.fill(src[0]);
-    } else if src_shape.is_vector() {
-        // If from is a vector and the first dimension is the broadcasting dimension
-        if src_shape[0] > 1 && src_shape[0] == dst_shape[0] {
-            let chunk_size = result.len() / src_shape.numel();
-
-            (0..result.len())
-                .step_by(chunk_size)
-                .enumerate()
-                .for_each(|(i, chunk)| {
-                    result[chunk..chunk + chunk_size].fill(src[i]);
-                });
-        } else {
-            generic_broadcast(src, &mut result, src_shape, dst_shape)
-        }
-    } else {
-        generic_broadcast(src, &mut result, src_shape, dst_shape)
-    }
-
-    result
-}
-
-#[inline]
-fn offset_to_ndindex(offset: usize, strides: [usize; 4]) -> [usize; 4] {
-    let mut indices = [0; 4];
-    let mut remaining = offset;
-
-    let idx = remaining / strides[0];
-    indices[0] = idx;
-    remaining -= idx * strides[0];
-
-    let idx = remaining / strides[1];
-    indices[1] = idx;
-    remaining -= idx * strides[1];
-
-    let idx = remaining / strides[2];
-    indices[2] = idx;
-    remaining -= idx * strides[2];
-
-    indices[3] = remaining;
-    indices
-}
-
-#[inline]
-fn nd_index_to_offset(ndindex: [usize; 4], strides: [usize; 4]) -> usize {
-    ndindex[0] * strides[0]
-        + ndindex[1] * strides[1]
-        + ndindex[2] * strides[2]
-        + ndindex[3] * strides[3]
-}
-
-// TODO: Optimize.
-// This generic implementation is almost a direct copy from the gpu impl,
-// and can definitely be way more performant.
-fn generic_broadcast<T: TensorDType>(
-    src: &[T],
-    result: &mut [T],
-    src_shape: &Shape,
-    dst_shape: &Shape,
-) {
-    // We now know that these will always be len 4, same as gpu impl.
-    let src_shape = &Shape::promote(src_shape.clone(), 4);
-    let dst_shape = &Shape::promote(dst_shape.clone(), 4);
-
-    let src_strides = &Strides::from(src_shape);
-    let dst_strides = &Strides::from(dst_shape);
-
-    let src_shape: [usize; 4] = src_shape.try_into().unwrap();
-    let src_strides: [usize; 4] = src_strides.try_into().unwrap();
-    let dst_strides: [usize; 4] = dst_strides.try_into().unwrap();
-
-    fn select(a: [usize; 4], b: [usize; 4], t: [bool; 4]) -> [usize; 4] {
-        let mut result = [0; 4];
-        result[0] = if t[0] { a[0] } else { b[0] };
-        result[1] = if t[1] { a[1] } else { b[1] };
-        result[2] = if t[2] { a[2] } else { b[2] };
-        result[3] = if t[3] { a[3] } else { b[3] };
-        result
-    }
-
-    let shape_onedim_lookup: [bool; 4] = [
-        src_shape[0] != 1,
-        src_shape[1] != 1,
-        src_shape[2] != 1,
-        src_shape[3] != 1,
-    ];
-    for i in 0..result.len() {
-        let dst_index = offset_to_ndindex(i, dst_strides);
-        let src_index = select(dst_index, [0; 4], shape_onedim_lookup);
-        let src_offset = nd_index_to_offset(src_index, src_strides);
-        result[i] = src[src_offset]
-    }
-}
-
 impl CPUOperation for Permute {
     fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
         match dst.dt() {
@@ -160,6 +42,9 @@ fn apply_permute<T: TensorDType>(p: &Permute, dst: Tensor) -> Result<Tensor, Op
     Ok(dst)
 }
 
+// TODO: Optimize.
+// This generic implementation is almost a direct copy from the gpu impl,
+// and can definitely be way more performant.
 fn permute<T: TensorDType>(
     src: &[T],
     src_shape: &Shape,
@@ -189,8 +74,7 @@ fn permute<T: TensorDType>(
         let src_offset = nd_index_to_offset(src_index, src_strides);
         result[i] = src[src_offset]
     }
-
-    return result;
+    result
 }
 
 impl CPUOperation for Slice {
     fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
         match dst.dt() {
@@ -249,3 +133,121 @@ pub(crate) fn slice(
 
     dst
 }
+
+impl CPUOperation for Broadcast {
+    fn apply_cpu(&self, dst: Tensor) -> Result<Tensor, OperationError> {
+        match dst.dt() {
+            DType::F32 => apply_broadcast::<f32>(self, dst),
+            DType::BF16 => apply_broadcast::<bf16>(self, dst),
+            DType::F16 => apply_broadcast::<f16>(self, dst),
+            DType::I32 => apply_broadcast::<i32>(self, dst),
+            DType::U32 => apply_broadcast::<u32>(self, dst),
+            _ => todo!(),
+        }
+    }
+}
+
+fn apply_broadcast<T: TensorDType>(b: &Broadcast, dst: Tensor) -> Result<Tensor, OperationError> {
+    let result = broadcast(&b.src.to_vec::<T>()?, b.src.shape(), b.to());
+    cpu_store_result(&dst, &result);
+    Ok(dst)
+}
+
+pub(crate) fn broadcast<T: TensorDType>(src: &[T], src_shape: &Shape, dst_shape: &Shape) -> Vec<T> {
+    let mut result = vec![T::zero(); dst_shape.numel()];
+
+    if src_shape.is_scalar() {
+        // Life is simple
+        result.fill(src[0]);
+    } else if src_shape.is_vector() {
+        // If from is a vector and the first dimension is the broadcasting dimension
+        if src_shape[0] > 1 && src_shape[0] == dst_shape[0] {
+            let chunk_size = result.len() / src_shape.numel();
+
+            (0..result.len())
+                .step_by(chunk_size)
+                .enumerate()
+                .for_each(|(i, chunk)| {
+                    result[chunk..chunk + chunk_size].fill(src[i]);
+                });
+        } else {
+            generic_broadcast(src, &mut result, src_shape, dst_shape)
+        }
+    } else {
+        generic_broadcast(src, &mut result, src_shape, dst_shape)
+    }
+
+    result
+}
+
+// TODO: Optimize.
+// This generic implementation is almost a direct copy from the gpu impl,
+// and can definitely be way more performant.
+fn generic_broadcast<T: TensorDType>(
+    src: &[T],
+    result: &mut [T],
+    src_shape: &Shape,
+    dst_shape: &Shape,
+) {
+    // We now know that these will always be len 4, same as gpu impl.
+    let src_shape = &Shape::promote(src_shape.clone(), 4);
+    let dst_shape = &Shape::promote(dst_shape.clone(), 4);
+
+    let src_strides = &Strides::from(src_shape);
+    let dst_strides = &Strides::from(dst_shape);
+
+    let src_shape: [usize; 4] = src_shape.try_into().unwrap();
+    let src_strides: [usize; 4] = src_strides.try_into().unwrap();
+    let dst_strides: [usize; 4] = dst_strides.try_into().unwrap();
+
+    fn select(a: [usize; 4], b: [usize; 4], t: [bool; 4]) -> [usize; 4] {
+        let mut result = [0; 4];
+        result[0] = if t[0] { a[0] } else { b[0] };
+        result[1] = if t[1] { a[1] } else { b[1] };
+        result[2] = if t[2] { a[2] } else { b[2] };
+        result[3] = if t[3] { a[3] } else { b[3] };
+        result
+    }
+
+    let shape_onedim_lookup: [bool; 4] = [
+        src_shape[0] != 1,
+        src_shape[1] != 1,
+        src_shape[2] != 1,
+        src_shape[3] != 1,
+    ];
+    for i in 0..result.len() {
+        let dst_index = offset_to_ndindex(i, dst_strides);
+        let src_index = select(dst_index, [0; 4], shape_onedim_lookup);
+        let src_offset = nd_index_to_offset(src_index, src_strides);
+        result[i] = src[src_offset]
+    }
+}
+
+#[inline]
+fn offset_to_ndindex(offset: usize, strides: [usize; 4]) -> [usize; 4] {
+    let mut indices = [0; 4];
+    let mut remaining = offset;
+
+    let idx = remaining / strides[0];
+    indices[0] = idx;
+    remaining -= idx * strides[0];
+
+    let idx = remaining / strides[1];
+    indices[1] = idx;
+    remaining -= idx * strides[1];
+
+    let idx = remaining / strides[2];
+    indices[2] = idx;
+    remaining -= idx * strides[2];
+
+    indices[3] = remaining;
+    indices
+}
+
+#[inline]
+fn nd_index_to_offset(ndindex: [usize; 4], strides: [usize; 4]) -> usize {
+    ndindex[0] * strides[0]
+        + ndindex[1] * strides[1]
+        + ndindex[2] * strides[2]
+        + ndindex[3] * strides[3]
+}
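
Editor's note for readers following the series: the sketch below restates, outside the patches, the offset -> nd-index -> offset round trip that PATCH 3's `permute` performs, on plain slices so it runs without ratchet's Tensor/Shape/Strides types. It is a minimal sketch, not the library's implementation: `permute4` and `strides_of` are hypothetical helper names, the `dims` convention (output axis `i` draws from input axis `dims[i]`) is assumed from the test in PATCH 1, and the parts of `permute` that fall outside the visible hunks are approximated.

// Row-major strides for a fixed rank-4 shape: stride[i] = product of shape[i+1..].
fn strides_of(shape: &[usize; 4]) -> [usize; 4] {
    let mut s = [1usize; 4];
    for i in (0..3).rev() {
        s[i] = s[i + 1] * shape[i + 1];
    }
    s
}

// Gather-style permute: for each destination offset, recover the 4-D index
// (offset_to_ndindex in the patch), route each axis back through `dims`, and
// re-linearize against the source strides (nd_index_to_offset in the patch).
fn permute4<T: Copy + Default>(src: &[T], shape: [usize; 4], dims: [usize; 4]) -> Vec<T> {
    let src_strides = strides_of(&shape);
    // Shape of the permuted output: dst_shape[i] = shape[dims[i]].
    let mut dst_shape = [0usize; 4];
    for i in 0..4 {
        dst_shape[i] = shape[dims[i]];
    }
    let dst_strides = strides_of(&dst_shape);

    let mut dst = vec![T::default(); src.len()];
    for (i, slot) in dst.iter_mut().enumerate() {
        // Decompose the destination offset into a 4-D index.
        let mut rem = i;
        let mut dst_idx = [0usize; 4];
        for d in 0..4 {
            dst_idx[d] = rem / dst_strides[d];
            rem %= dst_strides[d];
        }
        // Destination axis d corresponds to source axis dims[d].
        let mut src_offset = 0;
        for d in 0..4 {
            src_offset += dst_idx[d] * src_strides[dims[d]];
        }
        *slot = src[src_offset];
    }
    dst
}

fn main() {
    // Mirrors the temporary CPU test from PATCH 1: a [2, 4, 2, 1] tensor
    // permuted by [3, 1, 2, 0].
    let src: Vec<f32> = (0..16).map(|x| x as f32).collect();
    let out = permute4(&src, [2, 4, 2, 1], [3, 1, 2, 0]);
    assert_eq!(out.len(), 16);
    println!("{out:?}");
}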