diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml index b5e01949fd..5373b58d8d 100644 --- a/linalg/Cargo.toml +++ b/linalg/Cargo.toml @@ -47,6 +47,8 @@ core_affinity.workspace = true no_fp16 = [] default = [] complex = [ "tract-data/complex" ] +# Internal feature for benchmarking matmul kernels +compile_all_kernels = [] [[bench]] bench = false @@ -99,3 +101,8 @@ harness = false [[bench]] name = "intel" harness = false + +[[bench]] +bench = false +name = "kernel_test" +harness = false diff --git a/linalg/benches/intel.rs b/linalg/benches/intel.rs index d98a4ab8c4..c43d1b366a 100644 --- a/linalg/benches/intel.rs +++ b/linalg/benches/intel.rs @@ -6,7 +6,7 @@ use tract_linalg::mmm::OutputStoreKer; fn ruin_cache() { // return; - let _a = (0..1000000).collect::>(); + let _a = std::hint::black_box((0..10000000).collect::>()); } pub fn reference(mr: usize, k: usize, nr: usize) -> Vec @@ -63,7 +63,7 @@ fn bench_to_nanos< FusedSpec::AddMatMul { k, a: kernel.a_packed(4, k).wrap(&a.view()), - b: kernel.b_packed(4, k).wrap(&b.view()).unwrap(), + b: kernel.b_packed(4, k).wrap(&b.view()), }, // FusedSpec::AddUnicast(kernel.c_view(1, 0).wrap(&c.view_mut())), FusedSpec::Store(kernel.c_view(1, 0).wrap(&c.view_mut())), diff --git a/linalg/benches/kernel_test.rs b/linalg/benches/kernel_test.rs new file mode 100644 index 0000000000..14d45297d1 --- /dev/null +++ b/linalg/benches/kernel_test.rs @@ -0,0 +1,84 @@ +use criterion::*; + +mod utils; +use tract_data::prelude::DatumType; +use tract_linalg::mmm::MatMatMul; +use tract_linalg::mmm::MatMatMulKer; +use utils::*; + +pub fn mat_mat_mm( + be: &mut Bencher, + &(mm, dt, m, k, n, cold): &(&dyn MatMatMul, DatumType, usize, usize, usize, bool), +) { + mat_mat_with_mm(be, mm, &(dt, m, k, n, cold)); +} + +fn cold_and_hot(c: &mut Criterion, mm: &dyn MatMatMul, m: usize, k: usize, n: usize) { + let mut group = c.benchmark_group(format!("{}", mm.kernel_name())); + group.throughput(Throughput::Elements((m * k * n) as u64)); + let id = 
format!("{m}x{k}x{n}"); + group.bench_with_input( + BenchmarkId::new("f32/cold", &id), + &(mm, DatumType::F32, m, k, n, false), + mat_mat_mm, + ); + // group.bench_with_input( + // BenchmarkId::new("f32/hot", &id), + // &(mm, DatumType::F32, m, k, n, true), + // mat_mat_mm, + // ); +} + +fn mm(be: &mut Criterion, mm: impl AsRef, n: usize) { + // for m in (0..1024).step_by(128).skip(1) { + cold_and_hot(be, mm.as_ref(), 1024, 1000, n); + // } +} + +fn all(c: &mut Criterion) { + use tract_linalg::x86_64_fma::mmm::*; + macro_rules! benches_for_n { + ($c:expr ; $n:expr ; $m:expr) => ( + paste::paste! { + mm($c, []::mmm(), $n); + } + ); + ($c:expr ; $x:expr ; $m1:expr, $($y:expr),+) => ( + benches_for_n!($c ; $x ; $m1); + benches_for_n!($c ; $x ; $($y),+); + ); + } + + benches_for_n!(c; 1 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240); + benches_for_n!(c; 2 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160); + benches_for_n!(c; 3 ; 16, 32, 48, 64, 80, 96, 112); + benches_for_n!(c; 4 ; 16, 32, 48, 64, 80, 96); + benches_for_n!(c; 5 ; 16, 32, 48, 64, 80); + benches_for_n!(c; 6 ; 16, 32, 48, 64); + benches_for_n!(c; 7 ; 16, 32, 48); + benches_for_n!(c; 8 ; 16, 32, 48); + benches_for_n!(c; 9 ; 16, 32, 48); + benches_for_n!(c; 10 ; 16, 32); + benches_for_n!(c; 11 ; 16, 32); + benches_for_n!(c; 12 ; 16, 32); + benches_for_n!(c; 13 ; 16, 32); + benches_for_n!(c; 14 ; 16, 32); + benches_for_n!(c; 15 ; 16); + benches_for_n!(c; 16 ; 16); + benches_for_n!(c; 17 ; 16); + benches_for_n!(c; 18 ; 16); + benches_for_n!(c; 19 ; 16); + benches_for_n!(c; 20 ; 16); + benches_for_n!(c; 21 ; 16); + benches_for_n!(c; 22 ; 16); + benches_for_n!(c; 23 ; 16); + benches_for_n!(c; 24 ; 16); + benches_for_n!(c; 25 ; 16); + benches_for_n!(c; 26 ; 16); + benches_for_n!(c; 27 ; 16); + benches_for_n!(c; 28 ; 16); + benches_for_n!(c; 29 ; 16); +} + +criterion_group!(benches, all); +criterion_main!(benches); diff --git a/linalg/benches/utils.rs b/linalg/benches/utils.rs index 
321c8140b1..bb3574760b 100644 --- a/linalg/benches/utils.rs +++ b/linalg/benches/utils.rs @@ -28,7 +28,8 @@ pub fn packed_vec(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) { } pub fn ruin_cache() { - let _a = (0..1000000).collect::>(); + // the collect gets optimized out by llvm without black_box + let _a = std::hint::black_box((0..10000000).collect::>()); } #[allow(clippy::too_many_arguments)] diff --git a/linalg/build.rs b/linalg/build.rs index 55c526c381..bc59f0258e 100644 --- a/linalg/build.rs +++ b/linalg/build.rs @@ -72,6 +72,11 @@ impl ConfigForHalf { } } +struct GenerateKernelsSpec { + sizes: Vec<(usize, usize)>, + file: path::PathBuf, +} + fn main() { let target = var("TARGET"); let arch = var("CARGO_CFG_TARGET_ARCH"); @@ -83,8 +88,51 @@ fn main() { match arch.as_ref() { "x86_64" => { - let mut files = preprocess_files("x86_64/fma", &[], &suffix, false); - files.extend(preprocess_files("x86_64/avx512", &[], &suffix, false)); + let mut files = preprocess_files("x86_64/fma", &[], &suffix, false, None); + + let avx512_kernels: Vec<_> = if cfg!(feature = "compile_all_kernels") { + // limits of the max M size of the kernels in avx512; index is n-1 + let avx512_kernels_max = [ + 240, 160, 112, 96, 80, 64, 48, 48, 48, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + ]; + avx512_kernels_max + .iter() + .enumerate() + .flat_map(|(n_min_1, &max)| { + (16..=max).step_by(16).map(move |m| (m, n_min_1 + 1)) + }) + .collect() + } else { + vec![ + (16, 1), + (96, 1), + (96, 2), + (80, 3), + (64, 4), + (32, 5), + (32, 6), + (32, 7), + (32, 8), + (32, 9), + (32, 10), + (32, 11), + (32, 12), + (32, 13), + (32, 14), + ] + }; + + files.extend(preprocess_files( + "x86_64/avx512", + &[], + &suffix, + false, + Some(GenerateKernelsSpec { + sizes: avx512_kernels, + file: "x86_64/avx512/avx512_mmm_f32.tmpliq".into(), + }), + )); if os == "windows" { if use_masm() { @@ -136,7 +184,7 @@ fn main() { } } "arm" | "armv7" => { - let files 
= preprocess_files("arm32/armvfpv2", &[], &suffix, false); + let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false, None); cc::Build::new() .files(files) .flag("-marm") @@ -148,6 +196,7 @@ fn main() { &[("core", vec!["cortexa7", "cortexa9", "generic"])], &suffix, false, + None, ); cc::Build::new() .files(files) @@ -162,11 +211,12 @@ fn main() { &[("core", vec!["a53", "a55", "gen"])], &suffix, false, + None, ); cc::Build::new().files(files).static_flag(true).compile("arm64simd"); if os == "macos" { // aarch64 darwin => M1 - let files = preprocess_files("arm64/apple_amx", &[], &suffix, false); + let files = preprocess_files("arm64/apple_amx", &[], &suffix, false, None); cc::Build::new().files(files).static_flag(true).compile("appleamx"); } if std::env::var("CARGO_FEATURE_NO_FP16").is_err() { @@ -177,6 +227,7 @@ fn main() { &[("core", vec!["a55", "gen"])], &suffix, config.needs_pragma, + None, ); config.cc().files(files).static_flag(true).compile("arm64fp16") } @@ -192,9 +243,24 @@ fn preprocess_files( variants: &[Variant], suffix: &str, needs_pragma: bool, + generate_kernels_spec: Option, ) -> Vec { let out_dir = path::PathBuf::from(var("OUT_DIR")); let mut files = vec![]; + + if let Some(spec) = generate_kernels_spec { + let tmpl_file = spec.file.file_stem().unwrap().to_str().unwrap(); + for (m, n) in spec.sizes { + let globals = vec![ + ("mr", liquid::model::Value::scalar(format!("{m}"))), + ("nr", liquid::model::Value::scalar(format!("{n}"))), + ]; + let file = out_dir.join(format!("{tmpl_file}_{m}x{n}.S")); + println!("{}", file.display()); + preprocess_file(&spec.file, &file, &globals, suffix, needs_pragma); + files.push(file); + } + } let dir_entries = { let mut dir_entries: Vec = input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect(); @@ -214,7 +280,7 @@ fn preprocess_files( for variable in variants { let key = variable.0; let value = variable.1[id % variable.1.len()]; - globals.push((key, value)); + globals.push((key, 
liquid::model::Value::scalar(value))); tmpl_file = tmpl_file.replace(key, value); id /= variable.1.len(); } @@ -239,7 +305,7 @@ fn strip_comments(s: String, msvc: bool) -> String { fn preprocess_file( template: impl AsRef, output: impl AsRef, - variants: &[(&'static str, &'static str)], + added_globals: &[(&'static str, liquid::model::Value)], suffix: &str, needs_pragma: bool, ) { @@ -277,8 +343,8 @@ fn preprocess_file( "jump_table": jump_table(), "align": align, }); - for (k, v) in variants { - globals.insert(k.to_string().into(), liquid::model::Value::scalar(*v)); + for (k, v) in added_globals { + globals.insert(k.to_string().into(), v.clone()); } let partials = load_partials(template.as_ref().parent().unwrap(), msvc); let mut parser = liquid::ParserBuilder::with_stdlib() diff --git a/linalg/src/x86_64_fma.rs b/linalg/src/x86_64_fma.rs index 4e9e4dcaa3..3d6f54d908 100644 --- a/linalg/src/x86_64_fma.rs +++ b/linalg/src/x86_64_fma.rs @@ -1,3 +1,7 @@ +use std::cmp::Ordering; + +use tract_data::internal::num_integer::Integer; + use crate::frame::element_wise::ElementWiseKer; use crate::frame::mmm::kernel::MatMatMulKer; use crate::Ops; @@ -96,14 +100,71 @@ fn plug_fma(ops: &mut Ops) { fn plug_avx512f(ops: &mut Ops) { ops.mmv_f32 = Box::new(|m, _k| match m { Some(m) if m < 31 => mmm::avx512_mmm_f32_16x1::mmm(), - _ => mmm::avx512_mmm_f32_128x1::mmm(), + _ => mmm::avx512_mmm_f32_96x1::mmm(), }); - ops.mmm_f32 = Box::new(|_, _, n| match n { - Some(1) => unreachable!("should've been mmv"), - Some(2) => mmm::avx512_mmm_f32_80x2::mmm(), - Some(n) if n % 4 == 0 && n % 3 != 0 => mmm::avx512_mmm_f32_48x4::mmm(), - _ => mmm::avx512_mmm_f32_64x3::mmm(), + ops.mmm_f32 = Box::new(|_, _, n| { + if n.is_none() { + return mmm::avx512_mmm_f32_32x12::mmm(); + } + let mut n = n.unwrap(); + + if n > 14 { + // throughputs are mesured using the kernel_throughput.py script + let scaling_baseline = 98.0; + let kernel_throughputs = [ + (2, 18.0 / scaling_baseline), + (3, 28.0 / 
scaling_baseline), + (4, 36.5 / scaling_baseline), + (5, 44.0 / scaling_baseline), + (6, 49.0 / scaling_baseline), + (7, 58.0 / scaling_baseline), + (8, 65.0 / scaling_baseline), + (9, 72.5 / scaling_baseline), + (10, 82.0 / scaling_baseline), + (11, 84.0 / scaling_baseline), + (12, 88.5 / scaling_baseline), + (13, 95.0 / scaling_baseline), + (14, 98.0 / scaling_baseline), + ]; + + let throughputs = kernel_throughputs.map(|(kernel_width, thrpt): (usize, f32)| { + let n_tiles = Integer::div_ceil(&n, &kernel_width); + + let n_elem_total = n_tiles * kernel_width; + let n_elem_on_border_tile = n_elem_total - n; + let wasted_ratio = n_elem_on_border_tile as f32 / n_elem_total as f32; + + let final_thrpt = thrpt * (1.0 - wasted_ratio); + + (kernel_width, final_thrpt) + }); + + let best_ker = *throughputs + .iter() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) + .map(|(ker_width, _)| ker_width) + .unwrap(); + + n = best_ker; + } + + match n { + 2 => mmm::avx512_mmm_f32_96x2::mmm(), + 3 => mmm::avx512_mmm_f32_80x3::mmm(), + 4 => mmm::avx512_mmm_f32_64x4::mmm(), + 5 => mmm::avx512_mmm_f32_32x5::mmm(), + 6 => mmm::avx512_mmm_f32_32x6::mmm(), + 7 => mmm::avx512_mmm_f32_32x7::mmm(), + 8 => mmm::avx512_mmm_f32_32x8::mmm(), + 9 => mmm::avx512_mmm_f32_32x9::mmm(), + 10 => mmm::avx512_mmm_f32_32x10::mmm(), + 11 => mmm::avx512_mmm_f32_32x11::mmm(), + 12 => mmm::avx512_mmm_f32_32x12::mmm(), + 13 => mmm::avx512_mmm_f32_32x13::mmm(), + 14 => mmm::avx512_mmm_f32_32x14::mmm(), + _ => unreachable!("not a valid index"), + } }); log::info!("mmm_f32, mmv_f32: x86_64/avx512f activated"); } diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs index ded1e27248..bbc6ac5cbb 100644 --- a/linalg/src/x86_64_fma/mmm.rs +++ b/linalg/src/x86_64_fma/mmm.rs @@ -7,14 +7,75 @@ MMMKernel!(f32, fma_mmm_f32_24x4; 24, 4; 32, 4; 0, 0; no_prefetch, is_x86_featur MMMKernel!(f32, fma_mmm_f32_32x3; 32, 3; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); 
MMMKernel!(f32, fma_mmm_f32_40x2; 40, 2; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); MMMKernel!(f32, fma_mmm_f32_64x1; 64, 1; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); -MMMKernel!(f32, avx512_mmm_f32_128x1; 128, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x1; 16, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x12; 16, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x8; 16, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_32x6; 32, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_32x5; 32, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_48x4; 48, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_64x3; 64, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_80x2; 80, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(i32, avx2_mmm_i32_8x8; 8, 8; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx2")); + +#[cfg(not(feature = "compile_all_kernels"))] +mod avx512_best { + use super::*; + MMMKernel!(f32, avx512_mmm_f32_16x1; 16, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_96x1; 96, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_96x2; 96, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_80x3; 80, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_64x4; 64, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x5; 32, 5; 64, 4; 0, 0; no_prefetch, 
is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x6; 32, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x7; 32, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x8; 32, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x9; 32, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x13; 32, 13; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x14; 32, 14; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +} +#[cfg(not(feature = "compile_all_kernels"))] +pub use avx512_best::*; + +#[cfg(feature = "compile_all_kernels")] +mod all_avx512 { + use super::*; + macro_rules! make_kernels_for_n { + ($n:expr ; $m:expr) => ( + paste! 
{ + MMMKernel!(f32, []; $m, $n; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + } + ); + ($n:expr ; $m1:expr, $($y:expr),+) => ( + make_kernels_for_n!($n ; $m1); + make_kernels_for_n!($n ; $($y),+); + ) + } + + make_kernels_for_n!(1 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240); + make_kernels_for_n!(2 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160); + make_kernels_for_n!(3 ; 16, 32, 48, 64, 80, 96, 112); + make_kernels_for_n!(4 ; 16, 32, 48, 64, 80, 96); + make_kernels_for_n!(5 ; 16, 32, 48, 64, 80); + make_kernels_for_n!(6 ; 16, 32, 48, 64); + make_kernels_for_n!(7 ; 16, 32, 48); + make_kernels_for_n!(8 ; 16, 32, 48); + make_kernels_for_n!(9 ; 16, 32, 48); + make_kernels_for_n!(10 ; 16, 32); + make_kernels_for_n!(11 ; 16, 32); + make_kernels_for_n!(12 ; 16, 32); + make_kernels_for_n!(13 ; 16, 32); + make_kernels_for_n!(14 ; 16, 32); + make_kernels_for_n!(15 ; 16); + make_kernels_for_n!(16 ; 16); + make_kernels_for_n!(17 ; 16); + make_kernels_for_n!(18 ; 16); + make_kernels_for_n!(19 ; 16); + make_kernels_for_n!(20 ; 16); + make_kernels_for_n!(21 ; 16); + make_kernels_for_n!(22 ; 16); + make_kernels_for_n!(23 ; 16); + make_kernels_for_n!(24 ; 16); + make_kernels_for_n!(25 ; 16); + make_kernels_for_n!(26 ; 16); + make_kernels_for_n!(27 ; 16); + make_kernels_for_n!(28 ; 16); + make_kernels_for_n!(29 ; 16); +} +#[cfg(feature = "compile_all_kernels")] +pub use all_avx512::*; diff --git a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 857f7821c7..0000000000 --- a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,59 +0,0 @@ - // Tile size: 10x1 - // Accumulators: 0-9 - // Col regs: 10-19 - // Row regs: 20, 21 - - vbroadcastss zmm20, dword ptr [rcx] - - vmovaps zmm10, [rax + 0] - vmovaps zmm11, [rax + 64] - vmovaps zmm12, [rax + 128] - vmovaps zmm13, [rax + 192] - vmovaps 
zmm14, [rax + 256] - - vfmadd231ps zmm0, zmm10, zmm20 - vfmadd231ps zmm1, zmm11, zmm20 - vfmadd231ps zmm2, zmm12, zmm20 - vfmadd231ps zmm3, zmm13, zmm20 - vfmadd231ps zmm4, zmm14, zmm20 - - vmovaps zmm15, [rax + 320] - vmovaps zmm16, [rax + 384] - vmovaps zmm17, [rax + 448] - vmovaps zmm18, [rax + 512] - vmovaps zmm19, [rax + 576] - - vfmadd231ps zmm5, zmm10, zmm20 - vfmadd231ps zmm6, zmm11, zmm20 - vfmadd231ps zmm7, zmm12, zmm20 - vfmadd231ps zmm8, zmm13, zmm20 - vfmadd231ps zmm9, zmm14, zmm20 - - vbroadcastss zmm21, dword ptr [rcx + 4] - - vmovaps zmm10, [rax + 640] - vmovaps zmm11, [rax + 704] - vmovaps zmm12, [rax + 768] - vmovaps zmm13, [rax + 832] - vmovaps zmm14, [rax + 896] - - vfmadd231ps zmm0, zmm10, zmm21 - vfmadd231ps zmm1, zmm11, zmm21 - vfmadd231ps zmm2, zmm12, zmm21 - vfmadd231ps zmm3, zmm13, zmm21 - vfmadd231ps zmm4, zmm14, zmm21 - - vmovaps zmm15, [rax + 960] - vmovaps zmm16, [rax + 1024] - vmovaps zmm17, [rax + 1088] - vmovaps zmm18, [rax + 1152] - vmovaps zmm19, [rax + 1216] - - vfmadd231ps zmm5, zmm10, zmm21 - vfmadd231ps zmm6, zmm11, zmm21 - vfmadd231ps zmm7, zmm12, zmm21 - vfmadd231ps zmm8, zmm13, zmm21 - vfmadd231ps zmm9, zmm14, zmm21 - - add rcx, 8 - add rax, 1280 diff --git a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 76aaae5bff..0000000000 --- a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,33 +0,0 @@ - // Tile size: 10x1 - // Accumulators: 0-9 - // Col regs: 10-19 - // Row regs: 20 - - vbroadcastss zmm20, dword ptr [rcx] - - vmovaps zmm10, [rax + 0] - vmovaps zmm11, [rax + 64] - vmovaps zmm12, [rax + 128] - vmovaps zmm13, [rax + 192] - vmovaps zmm14, [rax + 256] - - vfmadd231ps zmm0, zmm10, zmm20 - vfmadd231ps zmm1, zmm11, zmm20 - vfmadd231ps zmm2, zmm12, zmm20 - vfmadd231ps zmm3, zmm13, zmm20 - vfmadd231ps zmm4, zmm14, zmm20 - - vmovaps zmm15, [rax + 320] - vmovaps zmm16, [rax + 384] - vmovaps 
zmm17, [rax + 448] - vmovaps zmm18, [rax + 512] - vmovaps zmm19, [rax + 576] - - vfmadd231ps zmm5, zmm10, zmm20 - vfmadd231ps zmm6, zmm11, zmm20 - vfmadd231ps zmm7, zmm12, zmm20 - vfmadd231ps zmm8, zmm13, zmm20 - vfmadd231ps zmm9, zmm14, zmm20 - - add rcx, 4 - add rax, 320 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index ba4e6232c0..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,7 +0,0 @@ - vbroadcastss zmm15, dword ptr [rcx] - - vmovups zmm8, [rax] - vfmadd231ps zmm0, zmm15, zmm8 - - add rcx, 4 - add rax, 64 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli deleted file mode 100644 index 4a1c310834..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli +++ /dev/null @@ -1,68 +0,0 @@ - vmovups zmm31, [rcx] - // vbroadcastss zmm17, [rcx + 4 * 0] - // vbroadcastss zmm18, [rcx + 4 * 1] - // vbroadcastss zmm19, [rcx + 4 * 2] - // vbroadcastss zmm20, [rcx + 4 * 3] - // vbroadcastss zmm21, [rcx + 4 * 4] - // vbroadcastss zmm22, [rcx + 4 * 5] - // vbroadcastss zmm23, [rcx + 4 * 6] - // vbroadcastss zmm24, [rcx + 4 * 7] - // vbroadcastss zmm25, [rcx + 4 * 8] - // vbroadcastss zmm26, [rcx + 4 * 9] - // vbroadcastss zmm27, [rcx + 4 * 10] - // vbroadcastss zmm28, [rcx + 4 * 11] - // vbroadcastss zmm29, [rcx + 4 * 12] - // vbroadcastss zmm30, [rcx + 4 * 13] - // vbroadcastss zmm31, [rcx + 4 * 14] - - vbroadcastss zmm16, xmm31 - valignd zmm17, zmm31, zmm31, 1 - vbroadcastss zmm17, xmm17 - valignd zmm18, zmm31, zmm31, 2 - vbroadcastss zmm18, xmm18 - valignd zmm19, zmm31, zmm31, 3 - vbroadcastss zmm19, xmm19 - valignd zmm20, zmm31, zmm31, 4 - vbroadcastss zmm20, xmm20 - valignd zmm21, zmm31, zmm31, 5 - vbroadcastss zmm21, xmm21 - valignd zmm22, zmm31, zmm31, 6 - vbroadcastss zmm22, xmm22 - valignd zmm23, 
zmm31, zmm31, 7 - vbroadcastss zmm23, xmm23 - valignd zmm24, zmm31, zmm31, 8 - vbroadcastss zmm24, xmm24 - valignd zmm25, zmm31, zmm31, 9 - vbroadcastss zmm25, xmm25 - valignd zmm26, zmm31, zmm31, 10 - vbroadcastss zmm26, xmm26 - valignd zmm27, zmm31, zmm31, 11 - vbroadcastss zmm27, xmm27 - valignd zmm28, zmm31, zmm31, 12 - vbroadcastss zmm28, xmm28 - valignd zmm29, zmm31, zmm31, 13 - vbroadcastss zmm29, xmm29 - valignd zmm30, zmm31, zmm31, 14 - vbroadcastss zmm30, xmm30 - valignd zmm31, zmm31, zmm31, 15 - vbroadcastss zmm31, xmm31 - - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - vfmadd231ps zmm4, zmm20, [rax + 256] - vfmadd231ps zmm5, zmm21, [rax + 320] - vfmadd231ps zmm6, zmm22, [rax + 384] - vfmadd231ps zmm7, zmm23, [rax + 448] - vfmadd231ps zmm8, zmm24, [rax + 512] - vfmadd231ps zmm9, zmm25, [rax + 576] - vfmadd231ps zmm10, zmm26, [rax + 640] - vfmadd231ps zmm11, zmm27, [rax + 704] - vfmadd231ps zmm12, zmm28, [rax + 768] - vfmadd231ps zmm13, zmm29, [rax + 832] - vfmadd231ps zmm14, zmm30, [rax + 896] - vfmadd231ps zmm15, zmm31, [rax + 960] - - add rcx, 64 - add rax, 1024 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli deleted file mode 100644 index 103be7015b..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli +++ /dev/null @@ -1,24 +0,0 @@ - // slow - vbroadcastss xmm16, dword ptr [rcx] - vbroadcastss xmm17, dword ptr [rcx + 4] - vbroadcastss xmm18, dword ptr [rcx + 8] - vbroadcastss xmm19, dword ptr [rcx + 12] - - // fast - vmovups xmm31, [rcx] - vbroadcastss zmm16, xmm31 - valignd xmm17, xmm31, xmm31, 1 - vbroadcastss zmm17, xmm17 - valignd xmm18, xmm31, xmm31, 2 - vbroadcastss zmm18, xmm18 - valignd xmm19, xmm31, xmm31, 3 - vbroadcastss zmm19, xmm19 - - // commmon - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - 
vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - - add rcx, 16 - add rax, 256 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli deleted file mode 100644 index d6cb277f89..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli +++ /dev/null @@ -1,29 +0,0 @@ - vmovups ymm31, [rcx] - - vbroadcastss zmm16, xmm31 - valignd ymm17, ymm31, ymm31, 1 - vbroadcastss zmm17, xmm17 - valignd ymm18, ymm31, ymm31, 2 - vbroadcastss zmm18, xmm18 - valignd ymm19, ymm31, ymm31, 3 - vbroadcastss zmm19, xmm19 - valignd ymm20, ymm31, ymm31, 4 - vbroadcastss zmm20, xmm20 - valignd ymm21, ymm31, ymm31, 5 - vbroadcastss zmm21, xmm21 - valignd ymm22, ymm31, ymm31, 6 - vbroadcastss zmm22, xmm22 - valignd ymm23, ymm31, ymm31, 7 - vbroadcastss zmm23, xmm23 - - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - vfmadd231ps zmm4, zmm20, [rax + 256] - vfmadd231ps zmm5, zmm21, [rax + 320] - vfmadd231ps zmm6, zmm22, [rax + 384] - vfmadd231ps zmm7, zmm23, [rax + 448] - - add rcx, 32 - add rax, 512 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli deleted file mode 100644 index 8c9bf905b3..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli +++ /dev/null @@ -1,11 +0,0 @@ - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm8, [rax + 0] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm16, dword ptr [rcx + 4] - vmovaps zmm9, [rax + 64] - vfmadd231ps zmm1, zmm16, zmm9 - - add rcx, 8 - add rax, 128 diff --git a/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 4ffab3bd4e..0000000000 --- a/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ 
-1,45 +0,0 @@ - // Tile size: 1x12 - // Accumulators: 0-11 - // Col regs: zmm14 - // Row regs: zmm15 - - vmovaps zmm15, [rax] - - vbroadcastss zmm14, dword ptr [rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8 * 4] - vfmadd231ps zmm8, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 9 * 4] - vfmadd231ps zmm9, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 10 * 4] - vfmadd231ps zmm10, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 11 * 4] - vfmadd231ps zmm11, zmm15, zmm14 - - add rcx, 48 - add rax, 64 \ No newline at end of file diff --git a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 118d312c82..0000000000 --- a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,53 +0,0 @@ - // Accumulators: 0-9 - // Columns: 15-16 - // Rows: 10-14 - vbroadcastss zmm10, dword ptr [rcx] - vbroadcastss zmm11, dword ptr [rcx + 4] - vbroadcastss zmm12, dword ptr [rcx + 8] - vbroadcastss zmm13, dword ptr [rcx + 12] - vbroadcastss zmm14, dword ptr [rcx + 16] - - vmovaps zmm15, [rax] - vmovaps zmm16, [rax + 64] - - vfmadd231ps zmm0, zmm15, zmm10 - vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - 
vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - vbroadcastss zmm10, dword ptr [rcx + 20] - vbroadcastss zmm11, dword ptr [rcx + 24] - vbroadcastss zmm12, dword ptr [rcx + 28] - vbroadcastss zmm13, dword ptr [rcx + 32] - vbroadcastss zmm14, dword ptr [rcx + 36] - - vmovaps zmm15, [rax + 128] - vmovaps zmm16, [rax + 192] - - vfmadd231ps zmm0, zmm15, zmm10 - vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - add rcx, 40 - add rax, 256 diff --git a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index e017834d25..0000000000 --- a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,30 +0,0 @@ - // Accumulators: 0-9 - // Columns: 15 - // Rows: 10-14 - - vbroadcastss zmm10, dword ptr [rcx] - vbroadcastss zmm11, dword ptr [rcx + 4] - vbroadcastss zmm12, dword ptr [rcx + 8] - vbroadcastss zmm13, dword ptr [rcx + 12] - vbroadcastss zmm14, dword ptr [rcx + 16] - - vmovaps zmm15, [rax] - vmovaps zmm16, [rax + 64] - - vfmadd231ps zmm0, zmm15, zmm10 - vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - add rcx, 20 - add rax, 128 diff --git a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 9d6c940a94..0000000000 --- 
a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,71 +0,0 @@ - // Tile size: 2x6 - // Accumulators: 0-11 - // Col regs: zmm14-15 - // Row regs: zmm12-13 - - vbroadcastss zmm14, dword ptr [rcx] - vmovaps zmm12, [rax] - vmovaps zmm13, [rax + 64] - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx+24] - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - // Iteration two - vmovaps zmm12, [rax + 128] - vmovaps zmm13, [rax + 192] - vbroadcastss zmm15, dword ptr [rcx + 24 + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 24 + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24 + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 24 + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24 + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 256 - add rcx, 48 diff --git a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 31f861b105..0000000000 --- a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,39 
+0,0 @@ - // Tile size: 2x6 - // Accumulators: 0-11 - // Col regs: zmm14-15 - // Row regs: zmm12-13 - - // Load ordered by earliest use for first 2x2 block - vbroadcastss zmm14, dword ptr [rcx] - vmovaps zmm12, [rax] - vmovaps zmm13, [rax + 64] - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 128 - add rcx, 24 diff --git a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index c36b7f6b6a..0000000000 --- a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,63 +0,0 @@ - // Tile size: 3x4 - // Accumulators: 0-11 - // Col regs: zmm12-14 - // Row regs: zmm15 - - vmovaps zmm12, [rax] - vmovaps zmm13, [rax+64] - vmovaps zmm14, [rax+128] - - vbroadcastss zmm15, dword ptr [rcx + 0] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 8] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - vmovaps zmm12, [rax + 192] - 
vmovaps zmm13, [rax + 256] - vmovaps zmm14, [rax + 320] - - vbroadcastss zmm15, dword ptr [rcx + 16] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 28] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - add rax, 384 - add rcx, 32 diff --git a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index a8b1c3221a..0000000000 --- a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,35 +0,0 @@ - // Tile size: 3x4 - // Accumulators: 0-11 - // Col regs: zmm12-14 - // Row regs: zmm15 - - vmovaps zmm12, [rax] - vmovaps zmm13, [rax+64] - vmovaps zmm14, [rax+128] - - vbroadcastss zmm15, dword ptr [rcx + 0] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 8] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - add rax, 192 - add rcx, 16 diff --git a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index fe661b7fa2..0000000000 --- 
a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,69 +0,0 @@ - // Tile size: 4x3 - // Accumulators: 0-11 - // Col regs: zmm12 - // Row regs: zmm13-15 - - // Load col of A - vmovaps zmm12, [rax] - - // Fill 3 cols of B - vbroadcastss zmm13, dword ptr [rcx + 0] - vbroadcastss zmm14, dword ptr [rcx + 4] - vbroadcastss zmm15, dword ptr [rcx + 8] - - // N.B. Stepping cols in inner loop - vfmadd231ps zmm0, zmm12, zmm13 - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax+64] - - vfmadd231ps zmm1, zmm12, zmm13 - vfmadd231ps zmm5, zmm12, zmm14 - vfmadd231ps zmm9, zmm12, zmm15 - - vmovaps zmm12, [rax+128] - - vfmadd231ps zmm2, zmm12, zmm13 - vfmadd231ps zmm6, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax+192] - - vfmadd231ps zmm3, zmm12, zmm13 - vfmadd231ps zmm7, zmm12, zmm14 - vfmadd231ps zmm11, zmm12, zmm15 - - // Load col of A, switching col! - vmovaps zmm13, [rax + 256] - - // Fill 3 cols of B - vbroadcastss zmm14, dword ptr [rcx + 12] - vbroadcastss zmm15, dword ptr [rcx + 16] - vbroadcastss zmm12, dword ptr [rcx + 20] - - // N.B. 
Stepping cols in inner loop - vfmadd231ps zmm0, zmm13, zmm14 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm8, zmm13, zmm12 - - vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm5, zmm13, zmm15 - vfmadd231ps zmm9, zmm13, zmm12 - - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm2, zmm13, zmm14 - vfmadd231ps zmm6, zmm13, zmm15 - vfmadd231ps zmm10, zmm13, zmm12 - - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm11, zmm13, zmm12 - - add rcx, 24 - add rax, 512 diff --git a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 0e71a747e4..0000000000 --- a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,38 +0,0 @@ - // Tile size: 4x3 - // Accumulators: 0-11 - // Col regs: zmm12 - // Row regs: zmm13-15 - - // Load col of A - vmovaps zmm12, [rax] - - // Fill 3 cols of B - vbroadcastss zmm13, dword ptr [rcx + 0] - vbroadcastss zmm14, dword ptr [rcx + 4] - vbroadcastss zmm15, dword ptr [rcx + 8] - - // N.B. 
Stepping cols in inner loop - vfmadd231ps zmm0, zmm12, zmm13 - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax+64] - - vfmadd231ps zmm1, zmm12, zmm13 - vfmadd231ps zmm5, zmm12, zmm14 - vfmadd231ps zmm9, zmm12, zmm15 - - vmovaps zmm12, [rax+128] - - vfmadd231ps zmm2, zmm12, zmm13 - vfmadd231ps zmm6, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax+192] - - vfmadd231ps zmm3, zmm12, zmm13 - vfmadd231ps zmm7, zmm12, zmm14 - vfmadd231ps zmm11, zmm12, zmm15 - - add rcx, 12 - add rax, 256 diff --git a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 6a5b887b8b..0000000000 --- a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,63 +0,0 @@ - // Tile size: 5x2 - // Accumulators: 0-9 - // Col regs: zmm10-13 - // Row regs: zmm14-15 - - vmovaps zmm10, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm11, [rax + 64] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm10, zmm14 - vfmadd231ps zmm5, zmm10, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm11, zmm14 - vfmadd231ps zmm6, zmm11, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm7, zmm12, zmm15 - - vmovaps zmm10, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm8, zmm13, zmm15 - - vmovaps zmm11, [rax + 320] - - vfmadd231ps zmm4, zmm10, zmm14 - vfmadd231ps zmm9, zmm10, zmm15 - - vbroadcastss zmm14, dword ptr [rcx + 8] - vbroadcastss zmm15, dword ptr [rcx + 12] - - vmovaps zmm12, [rax + 384] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm11, zmm14 - vfmadd231ps zmm5, zmm11, zmm15 - - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm1, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm10, [rax + 512] - - vfmadd231ps zmm2, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps 
zmm11, [rax + 576] - - vfmadd231ps zmm3, zmm10, zmm14 - vfmadd231ps zmm8, zmm10, zmm15 - - vfmadd231ps zmm4, zmm11, zmm14 - vfmadd231ps zmm9, zmm11, zmm15 - - add rax, 640 - add rcx, 16 diff --git a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 73ef89b588..0000000000 --- a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,34 +0,0 @@ - // Tile size: 5x2 - // Accumulators: 0-9 - // Col regs: zmm10-14 - // Row regs: zmm15-16 - - vmovaps zmm10, [rax] - vbroadcastss zmm15, dword ptr [rcx + 0] - vbroadcastss zmm16, dword ptr [rcx + 4] - vmovaps zmm11, [rax + 64] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm10, zmm15 - vfmadd231ps zmm5, zmm10, zmm16 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm11, zmm15 - vfmadd231ps zmm6, zmm11, zmm16 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm7, zmm12, zmm16 - - vmovaps zmm14, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm15 - vfmadd231ps zmm8, zmm13, zmm16 - - vfmadd231ps zmm4, zmm14, zmm15 - vfmadd231ps zmm9, zmm14, zmm16 - - add rax, 320 - add rcx, 8 diff --git a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 8c77044339..0000000000 --- a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,25 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - - vbroadcastss zmm15, dword ptr [rcx] - vfmadd231ps zmm0, zmm15, [rax] - vfmadd231ps zmm1, zmm15, [rax + 64] - vfmadd231ps zmm2, zmm15, [rax + 128] - vfmadd231ps zmm3, zmm15, [rax + 192] - vfmadd231ps zmm4, zmm15, [rax + 256] - vfmadd231ps zmm5, zmm15, [rax + 320] - - vbroadcastss zmm14, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm14, [rax + 384] - vfmadd231ps zmm1, zmm14, [rax + 448] - vfmadd231ps zmm2, zmm14, [rax 
+ 512] - vfmadd231ps zmm3, zmm14, [rax + 576] - vfmadd231ps zmm4, zmm14, [rax + 640] - vfmadd231ps zmm5, zmm14, [rax + 704] - - add rax, 768 - add rcx, 8 diff --git a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index a34c40fee4..0000000000 --- a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,29 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovups zmm10, [rax] - vmulps zmm10, zmm10, zmm15 - vaddps zmm0, zmm0, zmm10 - vmovups zmm11, [rax + 64] - vmulps zmm11, zmm11, zmm15 - vaddps zmm1, zmm1, zmm11 - vmovups zmm12, [rax + 128] - vmulps zmm12, zmm12, zmm15 - vaddps zmm2, zmm2, zmm12 - vmovups zmm13, [rax + 192] - vmulps zmm13, zmm13, zmm15 - vaddps zmm3, zmm3, zmm13 - vmovups zmm14, [rax + 256] - vmulps zmm14, zmm14, zmm15 - vaddps zmm4, zmm4, zmm14 - vmovups zmm15, [rax + 320] - vmulps zmm15, zmm15, zmm15 - vaddps zmm5, zmm5, zmm15 - - - add rcx, 4 - add rax, 384 diff --git a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 58ed8f4331..0000000000 --- a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,70 +0,0 @@ - // Tile size: 6x2 - // Accumulators: 0-9 - // Col regs: zmm10-13 - // Row regs: zmm14-15 - - vmovaps zmm12, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm13, [rax + 64] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - 
vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax + 384] - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 512] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 576] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 640] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - vmovaps zmm13, [rax + 704] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 768 - add rcx, 16 diff --git a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 0fa5fa8e45..0000000000 --- a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,38 +0,0 @@ - // Tile size: 6x2 - // Accumulators: 0-11 - // Col regs: 12-13 - // Row regs: 14-15 - - vmovaps zmm12, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm13, [rax + 64] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - add rcx, 8 - add rax, 384 diff --git 
a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index e23d79d2d5..0000000000 --- a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,40 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm7, [rax + 0] - vmovaps zmm8, [rax + 64] - vmovaps zmm9, [rax + 128] - vmovaps zmm10, [rax + 192] - vmovaps zmm11, [rax + 256] - vmovaps zmm12, [rax + 320] - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 - - vbroadcastss zmm16, dword ptr [rcx + 4] - - vmovaps zmm7, [rax + 448 + 0] - vmovaps zmm8, [rax + 448 + 64] - vmovaps zmm9, [rax + 448 + 128] - vmovaps zmm10, [rax + 448 + 192] - vmovaps zmm11, [rax + 448 + 256] - vmovaps zmm12, [rax + 448 + 320] - vmovaps zmm13, [rax + 448 + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 diff --git a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 889cb34e9b..0000000000 --- a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,21 +0,0 @@ - // Tile size: 7x1 - // Accumulators: 0-6 - // Col regs: 6-13 - // Row regs: 15 - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm7, [rax + 0] - vmovaps zmm8, [rax + 64] - vmovaps zmm9, [rax + 128] - vmovaps zmm10, [rax + 192] - vmovaps zmm11, [rax + 256] - vmovaps zmm12, [rax + 320] - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - 
vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 diff --git a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 96d0d9863d..0000000000 --- a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,30 +0,0 @@ - // Tile size: 8x1 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vbroadcastss zmm17, dword ptr [rcx] - - - vfmadd231ps zmm0, zmm17, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm17, [rax + 128] - vfmadd231ps zmm3, zmm17, [rax + 192] - vfmadd231ps zmm4, zmm17, [rax + 256] - vfmadd231ps zmm5, zmm17, [rax + 320] - vfmadd231ps zmm6, zmm17, [rax + 384] - vfmadd231ps zmm7, zmm17, [rax + 448] - - vbroadcastss zmm16, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm16, [rax + 0 + 512] - vfmadd231ps zmm1, zmm16, [rax + 64 + 512] - vfmadd231ps zmm2, zmm16, [rax + 128 + 512] - vfmadd231ps zmm3, zmm16, [rax + 192 + 512] - vfmadd231ps zmm4, zmm16, [rax + 256 + 512] - vfmadd231ps zmm5, zmm16, [rax + 320 + 512] - vfmadd231ps zmm6, zmm16, [rax + 384 + 512] - vfmadd231ps zmm7, zmm16, [rax + 448 + 512] - - add rcx, 8 - add rax, 1024 diff --git a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 38d57ce66d..0000000000 --- a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,25 +0,0 @@ - // Tile size: 8x1 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm8, [rax + 0] - vfmadd231ps zmm0, zmm15, zmm8 - vmovaps zmm9, [rax + 64] - vfmadd231ps zmm1, zmm15, zmm9 - vmovaps zmm10, [rax + 128] - vfmadd231ps zmm2, zmm15, zmm10 - vmovaps zmm11, [rax + 192] - vfmadd231ps zmm3, 
zmm15, zmm11 - vmovaps zmm12, [rax + 256] - vfmadd231ps zmm4, zmm15, zmm12 - vmovaps zmm13, [rax + 320] - vfmadd231ps zmm5, zmm15, zmm13 - vmovaps zmm14, [rax + 384] - vfmadd231ps zmm6, zmm15, zmm14 - vmovaps zmm8, [rax + 448] - vfmadd231ps zmm7, zmm15, zmm8 - add rcx, 4 - add rax, 512 diff --git a/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 772651ce8f..0000000000 --- a/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,42 +0,0 @@ - // Tile size: 8x2 - // Accumulators: 0-15 - // Col regs: 16-23 - // Row regs: 24-25 - - vmovaps zmm16, [rax + 0] - vbroadcastss zmm24, dword ptr [rcx + 0] - vbroadcastss zmm25, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm16, zmm24 - vfmadd231ps zmm8, zmm16, zmm25 - - vmovaps zmm17, [rax + 64] - vfmadd231ps zmm1, zmm17, zmm24 - vfmadd231ps zmm9, zmm17, zmm25 - - vmovaps zmm18, [rax + 128] - vfmadd231ps zmm2, zmm18, zmm24 - vfmadd231ps zmm10, zmm18, zmm25 - - vmovaps zmm19, [rax + 192] - vfmadd231ps zmm3, zmm19, zmm24 - vfmadd231ps zmm11, zmm19, zmm25 - - vmovaps zmm20, [rax + 256] - vfmadd231ps zmm4, zmm20, zmm24 - vfmadd231ps zmm12, zmm20, zmm25 - - vmovaps zmm21, [rax + 320] - vfmadd231ps zmm5, zmm21, zmm24 - vfmadd231ps zmm13, zmm21, zmm25 - - vmovaps zmm22, [rax + 384] - vfmadd231ps zmm6, zmm22, zmm24 - vfmadd231ps zmm14, zmm22, zmm25 - - vmovaps zmm23, [rax + 448] - vfmadd231ps zmm7, zmm23, zmm24 - vfmadd231ps zmm15, zmm23, zmm25 - - add rax, 512 - add rcx, 8 diff --git a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 1400fdf0da..0000000000 --- a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,61 +0,0 @@ - // Tile size: 1x8 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - - vmovaps zmm15, [rax] - - vbroadcastss zmm8, dword ptr 
[rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - - vmovaps zmm15, [rax+64] - - vbroadcastss zmm8, dword ptr [rcx + 8 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 9 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 10 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 11 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 12 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 13 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 14 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 15 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - add rcx, 64 - add rax, 128 \ No newline at end of file diff --git a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index c08151c2ac..0000000000 --- a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,33 +0,0 @@ - // Tile size: 1x8 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vmovaps zmm15, [rax] - - vbroadcastss zmm8, dword ptr [rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss 
zmm11, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - add rcx, 32 - add rax, 64 diff --git a/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq new file mode 100644 index 0000000000..1ed4b1e9db --- /dev/null +++ b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq @@ -0,0 +1,41 @@ +{% comment %} +Generate the code for a full AVX512 f32 kernel. +--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +// The kernel will operate on mr times nr elements at once, +// by laying them out in the zmm registers. +// +// As an example, mmm f32 32 x 12 will be laid out this way: +// zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 zmm12 zmm14 zmm16 zmm18 zmm20 zmm22 +// zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 zmm13 zmm15 zmm17 zmm19 zmm21 zmm23 +// +// The scratch registers are currently: +// - zmm31 and zmm30 in every case +// - every zmm register from `mr_arch * nr` to the end +// +// This means you always have at least mr_arch registers scratch registers +// plus zmm31 and zmm30 available. +// +// More scratch registers may be added later if we decide to limit the +// range for data registers, as we don't really need the biggest kernel sizes. +// +// The list of possible kernel sizes is thus defined by every mr,nr combinations +// that match `mr_arch * nr + mr_arch + 2 <= 32`. 
+ +{% assign kernel_name = mr | append:"x" | append:nr %} + +{% include "preamble.tmpliq" size:kernel_name, suffix:suffix, G:G, arch:"avx512" %} + +{% include "f32_add_mat_mul.tmpliq" mr:mr, nr:nr %} +{% include "f32_scalars.tmpliq" mr:mr, nr:nr %} +{% include "f32_per_rows.tmpliq" mr:mr, nr:nr %} +{% include "f32_per_cols.tmpliq" mr:mr, nr:nr %} +{% include "f32_store_clear.tmpliq" mr:mr, nr:nr %} +{% include "f32_add_row_col_products.tmpliq" mr:mr, nr:nr %} +{% include "f32_add_unicast.tmpliq" mr:mr, nr:nr %} + +{% include "postamble.tmpliq" size:kernel_name, suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl deleted file mode 100644 index 195e764f8a..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl +++ /dev/null @@ -1,95 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 128 x 1 - - zmm0 - zmm1 - ... - zmm7 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"128x1", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "8x1/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:7 %} -{% include "f32_per_rows.tmpliq" mr:128, from:0, to:7 %} -{% include "f32_per_cols.tmpliq" mr:128, from:0, to:7 %} - 
-{{L}}add_unicast: - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - {% for row in (0..7) %} - vaddps zmm{{row}}, zmm{{row}}, [ r10 + {{row|times:64}} ] - {% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - -{% for i in (0..7) %} - vmovups zmm12, [rax + {{i|times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - test r8, 63 - jnz {{L}}store_unaligned - - {% for row in (0..7) %} - vmovaps [r8 + {{row|times:64}}], zmm{{row}} - {% endfor %} - - jmp {{L}}non_linear_loop - - -{{L}}store_unaligned: - {% for row in (0..7) %} - vmovups [r8 + {{row|times:64}}], zmm{{row}} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"128x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl deleted file mode 100644 index 4e46ca8b4c..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl +++ /dev/null @@ -1,134 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 1 - - zmm0 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x1", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - - cmp rbx, 8 - jl 
{{L}}main_loop_packed_packed_tail - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "1x1/packed_packed_loop1/unroll-4.tmpli" %} - - sub rbx, 4 - cmp rbx, 4 - jge {{L}}main_loop_packed_packed - - {% for r in (1..3) %} - vaddps zmm0, zmm0, zmm{{r}} - {% endfor %} - - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "1x1/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:0 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:0 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:0 %} - -{{L}}add_unicast: - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - cmp rsi, 4 - jne {{L}}add_unicast_generic - - vaddps zmm0, zmm0, [r10] - - jmp {{L}}non_linear_loop - -{{L}}add_unicast_generic: - mov r8, [0] -// mov eax, 0 -// {% for i in (0..3) %} -// pinsrd xmm14, eax, {{i}} -// add eax, esi -// {% endfor %} -// {% for i in (0..3) %} -// pinsrd xmm15, eax, {{i}} -// add eax, esi -// {% endfor %} -// -// vperm2f128 zmm14, zmm14, zmm15, 32 // zmm14 <- xmm14::xmm15 -// -// {% for i in (0..7) %} -// vpcmpeqd zmm15, zmm15, zmm15 -// vgatherdps zmm12, [ r10 + zmm14 ], zmm15 -// -// vaddps zmm{{i}}, zmm{{i}}, zmm12 -// lea r10, [ r10 + rsi * 8 ] -// {% endfor %} -// - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - -{% for i in (0..0) %} - vmovups zmm12, [rax + {{i|times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - cmp rsi, 4 - jne {{L}}crash - - test r8, 63 - jnz {{L}}store_unaligned - - vmovaps [r8], zmm0 - jmp {{L}}non_linear_loop - -{{L}}store_unaligned: - vmovups [r8], zmm0 - jmp {{L}}non_linear_loop - -{{L}}crash: - mov r10, [0] -{% include "postamble.tmpliq" 
size:"16x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl deleted file mode 100644 index 159f689233..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl +++ /dev/null @@ -1,164 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 12 - - zmm0 zmm1 ... zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x12", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "1x12/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, 
ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..11) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i}}, zmm{{i}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - -{% for i in (0..11) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - mov r8, [rdi + 8] // c ptr - - // tops of cols - lea r8, [ r8 + 4 * rbx ] - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 4}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - mov r8, [rdi + 8] // c ptr - - // tops of cols - lea r8, [ r8 + 8 * rbx ] - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 8}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi 
- {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"16x12", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl deleted file mode 100644 index e45835880c..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl +++ /dev/null @@ -1,142 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 8 - - zmm0 zmm1 ... zmm8 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x8", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - - cmp rbx, 2 - jl {{L}}main_loop_packed_packed_tail - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "8x8/packed_packed_loop1/avx-512-unroll.tmpli" %} - - sub rbx, 2 - cmp rbx, 2 - jge {{L}}main_loop_packed_packed - - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "8x8/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:7 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:7 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:7 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd 
xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..7) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i}}, zmm{{i}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - -{% for i in (0..7) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r12, [ r8 + 4 * rbx ] - lea r11, [ r10 + rbx ] - lea r13, [ r12 + rbx ] - lea r14, [ r12 + 2 * rbx ] - lea r15, [ r13 + 2 * rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..7) %} - vextractf32x4 xmm{{r | plus: 8}}, zmm{{r}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..7) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 8}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"16x8", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl deleted file mode 100644 index 9d5c9f6b34..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl +++ /dev/null @@ -1,143 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 32 x 5: - - zmm0 zmm2 zmm4 zmm6 zmm8 - zmm1 
zmm3 zmm5 zmm7 zmm9 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"32x5", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "2x5/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:9 %} -{% include "f32_per_rows.tmpliq" mr:32, from:0, to:9 %} -{% include "f32_per_cols.tmpliq" mr:32, from:0, to:9 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..4) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - -{% for i in (0..4) %} - kxnorw k1,k1,k1 - vgatherdps 
zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus: 1}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - -{% for i in (0..4) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - lea r12, [ r10 + 2 * rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..4) %} - vextractf32x4 xmm{{r | plus: 11}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..4) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 11}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"32x5", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl deleted file mode 100644 index d0f1c84f12..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl +++ /dev/null @@ -1,160 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 32 x 6: - - zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 - zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, 
and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"32x6", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "2x6/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:32, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:32, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..5) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 -{% endfor %} - - mov r10, [rdi + 8] - imul esi, 16 - vpbroadcastd zmm15, esi - vpaddd zmm14, zmm14, zmm15 - -{% for i in (0..5) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus: 1}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - -{% for i in (0..5) %} - vbroadcastss 
zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - // tops of cols - mov r8, r11 - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 3 | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"32x6", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl deleted file mode 100644 index b0410ee1ee..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl +++ /dev/null @@ -1,147 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 48 x 4: - - zmm0 zmm3 zmm6 zmm9 - zmm1 zmm4 zmm7 zmm10 - zmm2 zmm5 zmm8 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: 
RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"48x4", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "3x4/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:48, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:48, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..3) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 3}}, zmm{{i | times: 3}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..2) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..3) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 3 | plus: j}}, zmm{{i | times: 3 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ 
rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - vmovups zmm15, zmmword ptr [rax+128] - -{% for i in (0..3) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 3}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 3 | plus: 1}}, zmm13, zmm14 - vfmadd231ps zmm{{i | times: 3 | plus: 2}}, zmm15, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..2) %} - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 3 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"48x4", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl deleted file mode 100644 index 0016c845b1..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl +++ /dev/null @@ -1,148 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 64 x 3: - - zmm0 zmm4 zmm8 - zmm1 zmm5 zmm9 - zmm2 zmm6 zmm10 - zmm3 zmm7 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" 
size:"64x3", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "4x3/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:64, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:64, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4}}, zmm{{i | times: 4}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..3) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4 | plus: j}}, zmm{{i | times: 4 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm13, dword ptr [rbx] - vbroadcastss zmm14, dword ptr [rbx+4] - vbroadcastss zmm15, dword ptr [rbx+8] - -{% for i in (0..3) %} - vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] - 
vfmadd231ps zmm{{i}}, zmm12, zmm13 - vfmadd231ps zmm{{i | plus: 4}}, zmm12, zmm14 - vfmadd231ps zmm{{i | plus: 8}}, zmm12, zmm15 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..3) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl deleted file mode 100644 index 2d4bc2c5a0..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl +++ /dev/null @@ -1,147 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 80 x 2: - - zmm0 zmm5 - zmm1 zmm6 - zmm2 zmm7 - zmm3 zmm8 - zmm4 zmm9 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"80x2", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - 
{% include "5x2/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:9 %} -{% include "f32_per_rows.tmpliq" mr:80, from:0, to:9 %} -{% include "f32_per_cols.tmpliq" mr:80, from:0, to:9 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..1) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 5}}, zmm{{i | times: 5}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..4) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..1) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 5 | plus: j}}, zmm{{i | times: 5 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - vbroadcastss zmm15, dword ptr [rbx+4] - -{% for i in (0..4) %} - vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 - vfmadd231ps zmm{{i | plus: 5}}, zmm12, zmm15 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ 
r10 + rbx ] - - {% for word in (0..4) %} - {% for quarter in (0..3) %} - {% for r in (0..1) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 5 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..1) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"80x2", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq new file mode 100644 index 0000000000..32835a0783 --- /dev/null +++ b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq @@ -0,0 +1,103 @@ +{% comment %} +Generate the code for the add_mat_mul instruction. +--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +{{L}}add_mat_mul: + mov rcx, [rdi + 24] // B + mov rax, [rdi + 16] // A + + mov rbx, [rdi + 8] // k + test rcx, rcx + jz {{L}}non_linear_loop + +// the main loop will access A 16 elements at a time +// and B 1 element at a time +// it may be unrolled on a few elements of K + +{% assign arch_mr = mr | divided_by:16 %} +{% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %} + +{% assign nr_min_1 = nr | minus:1 %} + +// total bytes of the tile on the m axis +{% assign m_total_bytes = mr | times:4 %} +// total bytes of the tile on the n axis +{% assign n_total_bytes = nr | times:4 %} + +// first register to be used for row +{% assign row_reg = arch_mr | times:nr %} +// the column register +{% assign col_reg = row_reg | plus:arch_mr | plus:1 %} + +{% assign prefetch_dist = 2 %} + +// we limit the number of prefetches +// because otherwise we end up producing too many prefetches at once +// and, we fill the Line Fill Buffer, which is the cpu's buffer for +// outstanding fetch request for L1 - which by the way has a size of +// 10 requests on 
most intel cpus +// +// filling up the LFB is actually a very big deal, because subsequent prefetches +// will block until there is space in the LFB +{% assign prefetches_to_issue_min_1 = arch_mr | at_most:2 | minus:1 %} + +// how many unrolls on k should we produce +{% assign unroll_count = 4 %} +{% assign unroll_count_min_1 = unroll_count | minus:1 %} + +// this is the dispatch part +{{L}}main_loop_packed_packed: + +// hardcoded 8 unrolls + + cmp rbx, 1 + jb {{L}}non_linear_loop + je {{L}}main_loop_packed_packed_1 + cmp rbx, 3 + jb {{L}}main_loop_packed_packed_2 + je {{L}}main_loop_packed_packed_3 +{% comment %} + cmp rbx, 5 + jb {{L}}main_loop_packed_packed_4 + je {{L}}main_loop_packed_packed_5 + cmp rbx, 7 + jb {{L}}main_loop_packed_packed_6 + je {{L}}main_loop_packed_packed_7 +{% endcomment %} + +{% for unroll in (0..unroll_count_min_1) %} + + {% assign n_items_on_k = unroll_count | minus:unroll %} + + {{L}}main_loop_packed_packed_{{n_items_on_k}}: + + {% assign unroll_min_1 = n_items_on_k | minus:1 %} + {% for cur_unroll_count in (0..unroll_min_1) %} + + {% for i in (0..prefetches_to_issue_min_1) %} + prefetcht0 [rax + {{i | times:64}} + {{m_total_bytes | times:prefetch_dist}} + {{cur_unroll_count | times:m_total_bytes}}] + {% endfor %} + + {% for i in (0..arch_mr_min_1) %} + vmovaps zmm{{row_reg | plus:i}}, [rax + {{i | times:64}} + {{cur_unroll_count | times:m_total_bytes}}] + {% endfor %} + + {% for i in (0..nr_min_1) %} + vbroadcastss zmm{{col_reg}}, dword ptr [rcx + {{i | times:4}} + {{cur_unroll_count | times:n_total_bytes}}] + + {% for j in (0..arch_mr_min_1) %} + vfmadd231ps zmm{{i | times:arch_mr | plus:j}}, zmm{{row_reg | plus:j}}, zmm{{col_reg}} + {% endfor %} + {% endfor %} + {% endfor %} + + add rax, {{m_total_bytes | times:n_items_on_k}} + add rcx, {{n_total_bytes | times:n_items_on_k}} + sub rbx, {{n_items_on_k}} + + jmp {{L}}main_loop_packed_packed +{% endfor %} diff --git a/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq 
b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq new file mode 100644 index 0000000000..f331c98330 --- /dev/null +++ b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq @@ -0,0 +1,31 @@ +{% comment %} +Generate the code for the store instruction. +--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} + +{{L}}add_row_col_products: + mov rax, [rdi + 8] + mov rbx, [rdi + 16] + +// name of the first scratch reg +{% assign scratch = mr_arch | times: nr %} + +{% for j in (0..mr_arch_min_1) %} + vmovups zmm{{scratch | plus:j}}, zmmword ptr [rax + {{j | times:64}}] +{% endfor %} + +{% for i in (0..nr_min_1) %} + vbroadcastss zmm31, dword ptr [rbx + {{i | times:4}}] + {% for j in (0..mr_arch_min_1) %} + vfmadd231ps zmm{{mr_arch | times:i | plus:j}}, zmm{{scratch | plus:j}}, zmm31 + {% endfor %} +{% endfor %} + + jmp {{L}}non_linear_loop \ No newline at end of file diff --git a/linalg/x86_64/avx512/f32_add_unicast.tmpliq b/linalg/x86_64/avx512/f32_add_unicast.tmpliq new file mode 100644 index 0000000000..ff8bab37e5 --- /dev/null +++ b/linalg/x86_64/avx512/f32_add_unicast.tmpliq @@ -0,0 +1,67 @@ +{% comment %} +Generate the code for the add_unicast instruction. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%}{%endcapture%} + +{{L}}add_unicast: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // gather operation + + vpbroadcastd zmm30, esi + vpmulld zmm30, zmm30, zmmword ptr [{{offset}} {{L}}numbers_seq_add_unicast] + + // zmm30 is now a sequence of 0,esi,esi*2,esi*3...esi*15 + +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} + + // r10 is cur col + imul rsi, 16 // stride for 16 elems + +{% for col in (0..nr_min_1) %} + mov r9, r10 // cur row + + {% for row in (0..mr_arch_min_1) %} + kxnorw k1,k1,k1 // set writemask to ones + vgatherdps zmm31{k1}, [r9 + zmm30] + vaddps zmm{{col | times:mr_arch | plus:row}}, zmm{{col | times:mr_arch | plus:row}}, zmm31 + + {% if row != mr_arch_min_1 %} + add r9, rsi + {% endif %} + {% endfor %} + + {% if col != nr_min_1 %} + add r10, rbx + {% endif %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}numbers_seq_add_unicast: + {{long}} 0 + {{long}} 1 + {{long}} 2 + {{long}} 3 + {{long}} 4 + {{long}} 5 + {{long}} 6 + {{long}} 7 + {{long}} 8 + {{long}} 9 + {{long}} 10 + {{long}} 11 + {{long}} 12 + {{long}} 13 + {{long}} 14 + {{long}} 15 diff --git a/linalg/x86_64/avx512/f32_per_cols.tmpliq b/linalg/x86_64/avx512/f32_per_cols.tmpliq index 6d4097d416..076543f5e2 100644 --- a/linalg/x86_64/avx512/f32_per_cols.tmpliq +++ b/linalg/x86_64/avx512/f32_per_cols.tmpliq @@ -1,8 +1,14 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the per-col instructions for f32. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} diff --git a/linalg/x86_64/avx512/f32_per_rows.tmpliq b/linalg/x86_64/avx512/f32_per_rows.tmpliq index b20fcbbbbc..b84a189694 100644 --- a/linalg/x86_64/avx512/f32_per_rows.tmpliq +++ b/linalg/x86_64/avx512/f32_per_rows.tmpliq @@ -1,8 +1,14 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the per-row instructions for f32. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} diff --git a/linalg/x86_64/avx512/f32_scalars.tmpliq b/linalg/x86_64/avx512/f32_scalars.tmpliq index d6a4a24fd9..1f2e498f2a 100644 --- a/linalg/x86_64/avx512/f32_scalars.tmpliq +++ b/linalg/x86_64/avx512/f32_scalars.tmpliq @@ -1,11 +1,17 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the scalar instructions for f32. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} {{L}}q_scale: {{L}}q_shl: diff --git a/linalg/x86_64/avx512/f32_store_clear.tmpliq b/linalg/x86_64/avx512/f32_store_clear.tmpliq new file mode 100644 index 0000000000..50b099532d --- /dev/null +++ b/linalg/x86_64/avx512/f32_store_clear.tmpliq @@ -0,0 +1,83 @@ +{% comment %} +Generate the code for the store and clear instructions. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%}{%endcapture%} + +{% assign arch_mr = mr | divided_by:16 %} +{% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %} +{% assign nr_min_1 = nr | minus:1 %} + +{{L}}store: + + mov r10, [rdi + 8] // c ptr + mov rsi, [rdi + 16] // row stride + mov rbx, [rdi + 24] // col stride + + // scatter operation + + vpbroadcastd zmm31, esi + vpmulld zmm31, zmm31, zmmword ptr [{{offset}} {{L}}numbers_seq_store] + + // zmm31 is now a sequence of 0,esi,esi*2,esi*3...esi*15 + +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} + + // r10 is cur col + imul rsi, 16 // stride for 16 elems + +{% for col in (0..nr_min_1) %} + mov r9, r10 // cur row + + {% for row in (0..mr_arch_min_1) %} + kxnorw k1,k1,k1 // set writemask to ones + vscatterdps [r9 + zmm31]{k1}, zmm{{col | times:mr_arch | plus:row}} + + {% if row != mr_arch_min_1 %} + add r9, rsi + {% endif %} + {% endfor %} + + {% if col != nr_min_1 %} + add r10, rbx + {% endif %} +{% endfor %} + + jmp {{L}}non_linear_loop + +{{L}}numbers_seq_store: + {{long}} 0 + {{long}} 1 + {{long}} 2 + {{long}} 3 + {{long}} 4 + {{long}} 5 + {{long}} 6 + {{long}} 7 + {{long}} 8 + {{long}} 9 + {{long}} 10 + {{long}} 11 + {{long}} 12 + {{long}} 13 + {{long}} 14 + {{long}} 15 + +{% assign last_reg = mr | divided_by:16 | times:nr | minus:1 %} + +{{L}}clear: + vzeroall + {% if last_reg >= 16 %} + // turns out vzeroall only zeroes zmm0 to zmm15 + {% for regcol in (16..last_reg) %} + vmovups zmm{{regcol}}, zmm0 + {% endfor %} + {% endif %} + + jmp {{L}}non_linear_loop diff --git a/linalg/x86_64/avx512/postamble.tmpliq b/linalg/x86_64/avx512/postamble.tmpliq index ff3071a71a..6482e66ddf 100644 --- a/linalg/x86_64/avx512/postamble.tmpliq +++ b/linalg/x86_64/avx512/postamble.tmpliq @@ -1,3 
+1,25 @@ +{% comment %} +// vim: set syntax=asm : + +/* mmm f32 32 x 12: + + zmm0 zmm2 ... zmm22 + zmm1 zmm3 ... zmm23 + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + {{L}}return: ldmxcsr [rsp + 4] add rsp, 8 diff --git a/linalg/x86_64/avx512/preamble.tmpliq b/linalg/x86_64/avx512/preamble.tmpliq index 3ed2f7c309..10e2437562 100644 --- a/linalg/x86_64/avx512/preamble.tmpliq +++ b/linalg/x86_64/avx512/preamble.tmpliq @@ -1,3 +1,20 @@ +{% comment %} +// vim: set syntax=asm : + +System V ABI: + args: rdi, rsi, rdx, rcx, r8, r9 + preserve: rbx, rsp, rbp, r12, r13, r14, r15 + scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 + return: rax (+rdx) + +Windows ABI: + args: RCX, RDX, R8, R9 + preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 + scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 + return: rax (+rdx) +*/ +{% endcomment %} + {% if msvc %} _text segment diff --git a/linalg/x86_64/avx512/zmm_per_col.tmpliq b/linalg/x86_64/avx512/zmm_per_col.tmpliq index 16c9d32eb7..7f638482a0 100644 --- a/linalg/x86_64/avx512/zmm_per_col.tmpliq +++ b/linalg/x86_64/avx512/zmm_per_col.tmpliq @@ -1,23 +1,35 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for a per-col instruction. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements + op - the avx512 instruction + flipped - boolean to flip the order + label - the asm label to emit +{% endcomment %} {{L}}{{label}}: - mov rax, [ rdi + 8 ] + mov rax, [rdi + 8] -{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%} -{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%} +{% assign mr_over_16 = mr | divided_by:16 %} +{% assign mr_over_16_min_1 = mr | divided_by:16 | minus:1 %} -{%capture tmp%}{{to | plus: 1 }}{%endcapture%} +{% assign from = 0 %} +{% assign to = mr_over_16 | times:nr | minus:1 %} -{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_16}}{%endcapture%} -{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_16|minus:1}}{%endcapture%} +{% assign tmp = to | plus:1 %} +{% assign cols = to | plus:1 | minus:from | divided_by:mr_over_16 %} + +{% assign cols_min_1 = to | plus:1 | minus:from | divided_by:mr_over_16 | minus:1 %} // {{to|minus:from|plus:1}} cols:{{cols}} {% for right in (0..cols_min_1) %} - vbroadcastss zmm{{tmp}}, dword ptr [ rax ] + vbroadcastss zmm{{tmp}}, dword ptr [rax] add rax, 4 {% for down in (0..mr_over_16_min_1) %} - {%capture acc%}{{mr_over_16|times:right|plus:from|plus:down}}{%endcapture%} + {% assign acc = mr_over_16 | times:right | plus:from | plus:down %} {% if flipped %} {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{tmp}} {% else %} diff --git a/linalg/x86_64/avx512/zmm_per_row.tmpliq b/linalg/x86_64/avx512/zmm_per_row.tmpliq index f9da1b35f7..1041a1a96d 100644 --- a/linalg/x86_64/avx512/zmm_per_row.tmpliq +++ b/linalg/x86_64/avx512/zmm_per_row.tmpliq @@ -1,22 +1,34 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for a per-row instruction. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements + op - the avx512 instruction + flipped - boolean to flip the order + label - the asm label to emit +{% endcomment %} {{L}}{{label}}: - mov rax, [ rdi + 8 ] + mov rax, [rdi + 8] -{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%} -{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%} +{% assign mr_over_16 = mr | divided_by:16 %} +{% assign mr_over_16_min_1 = mr | divided_by:16 | minus:1 %} + +{% assign from = 0 %} +{% assign to = mr_over_16 | times:nr | minus:1 %} {% for ix in (0..mr_over_16_min_1) %} - vmovups zmm{{to | plus: 1 | plus: ix}}, [rax + {{ix | times: 64}}] + vmovups zmm{{to | plus:1 | plus:ix}}, [rax + {{ix | times:64}}] {% endfor %} {% if flipped %} {% for acc in (from..to) %} - {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }} + {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{acc | modulo:mr_over_16 | plus:to | plus:1}} {% endfor %} {% else %} {% for acc in (from..to) %} - {{op}} zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}, zmm{{acc}} + {{op}} zmm{{acc}}, zmm{{acc | modulo:mr_over_16 | plus:to | plus:1}}, zmm{{acc}} {% endfor %} {% endif %} diff --git a/linalg/x86_64/avx512/zmm_scalar.tmpliq b/linalg/x86_64/avx512/zmm_scalar.tmpliq index 43373c9d82..7c0aedacc3 100644 --- a/linalg/x86_64/avx512/zmm_scalar.tmpliq +++ b/linalg/x86_64/avx512/zmm_scalar.tmpliq @@ -1,14 +1,30 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for a scalar instruction. 
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements + op - the avx512 instruction + flipped - boolean to flip the order + label - the asm label to emit +{% endcomment %} + +{% assign mr_over_16 = mr | divided_by:16 %} + +{% assign from = 0 %} +{% assign to = mr_over_16 | times:nr | minus:1 %} + +// from={{from}} to={{to}} {{L}}{{label}}: - vbroadcastss zmm12, dword ptr [rdi + 8] + vbroadcastss zmm31, dword ptr [rdi + 8] {% if flipped %} {% for reg in (from..to) %} - {{op}} zmm{{reg}}, zmm{{reg}}, zmm12 + {{op}} zmm{{reg}}, zmm{{reg}}, zmm31 {% endfor %} {% else %} {% for reg in (from..to) %} - {{op}} zmm{{reg}}, zmm12, zmm{{reg}} + {{op}} zmm{{reg}}, zmm31, zmm{{reg}} {% endfor %} {% endif %} diff --git a/linalg/x86_64/kernel_throughput.py b/linalg/x86_64/kernel_throughput.py new file mode 100755 index 0000000000..416cc303ee --- /dev/null +++ b/linalg/x86_64/kernel_throughput.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +""" +Display the kernel throughputs as a dataframe and a csv file. + +Usage: +1. First, run the benchmarks using `cargo bench -p tract-linalg --features compile_all_kernels --bench kernel_test`. +2. Then run this file in the project root: `python3 linalg/x86_64/kernel_throughput.py`. + +The results are in Gelem/s. 
+""" + +import os +import re +import json +import os.path as path +import pandas as pd + +criterion = './target/criterion' +results = os.listdir(criterion) + +mat_common_dims = '1024x1000' + +df = pd.DataFrame(index=range(16, 256+16, 16), columns=range(1, 33), dtype='float') +for r in results: + ma = re.match(r"avx512_mmm_f32_(\d+)x(\d+)", r) + if not ma: + continue + + m = int(ma.group(1)) + n = int(ma.group(2)) + + path_ = path.join(criterion, r, "f32_cold", f"{mat_common_dims}x{n}") + benchmark = path.join(path_, "base/benchmark.json") + with open(benchmark) as f: + benchmark = json.load(f) + + sample = path.join(path_, "base/sample.json") + with open(sample) as f: + sample = json.load(f) + + elements = benchmark["throughput"]["Elements"] + time_per_iter = sum(sample["times"]) / sum(sample["iters"]) + + df.loc[m, n] = round(1 / (time_per_iter / elements), 2) + print(df.loc[m, n]) + +pd.set_option('display.max_columns', None) +print(df) +df.to_csv("result.csv")