From 5b0ab0d1ce2650361b1639f8de70fffd7d75689a Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 4 May 2020 19:27:44 +0200 Subject: [PATCH] unroll product loop --- sbp/src/operators.rs | 225 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 6 deletions(-) diff --git a/sbp/src/operators.rs b/sbp/src/operators.rs index f572b0c..237a87f 100644 --- a/sbp/src/operators.rs +++ b/sbp/src/operators.rs @@ -440,6 +440,221 @@ fn diff_op_col_simd( } } +#[inline(always)] +/// Manual unrolling for some common sizes for vectorisation +/// +/// best used when the lengths of u and v are known at +/// compile time +fn product(u: &[Float], v: &[Float]) -> Float { + use packed_simd::*; + #[cfg(feature = "f32")] + type SimdT = f32x2; + #[cfg(not(feature = "f32"))] + type SimdT = f64x2; + + assert_eq!(u.len(), v.len()); + match u.len() { + 0 => 0.0, + 1 => u[0] * v[0], + 2 => (SimdT::from_slice_unaligned(&u[0..]) * SimdT::from_slice_unaligned(&v[0..])).sum(), + 3 => { + (SimdT::from_slice_unaligned(&u[0..]) * SimdT::from_slice_unaligned(&v[0..])).sum() + + u[2] * v[2] + } + 4 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + p.sum() + } + 5 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + p.sum() + u[4] * v[4] + } + 6 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + p.sum() + } + 7 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + p.sum() + u[6] * v[6] + } + 8 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + p.sum() + } + 9 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + p.sum() + u[8] * v[8] + } + 10 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let u4 = SimdT::from_slice_unaligned(&u[8..]); + let v4 = SimdT::from_slice_unaligned(&v[8..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + let p = u4.mul_adde(v4, p); + p.sum() + } + 11 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let u4 = SimdT::from_slice_unaligned(&u[8..]); + let v4 = SimdT::from_slice_unaligned(&v[8..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + let p = u4.mul_adde(v4, p); + p.sum() + u[10] * v[10] + } + 12 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let u4 = SimdT::from_slice_unaligned(&u[8..]); + let v4 = SimdT::from_slice_unaligned(&v[8..]); + + let u5 = SimdT::from_slice_unaligned(&u[10..]); + let v5 = SimdT::from_slice_unaligned(&v[10..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + let p = u4.mul_adde(v4, p); + let p = u5.mul_adde(v5, p); + p.sum() + } + 13 => { + let u0 = SimdT::from_slice_unaligned(&u[0..]); + let v0 = SimdT::from_slice_unaligned(&v[0..]); + + let u1 = SimdT::from_slice_unaligned(&u[2..]); + let v1 = SimdT::from_slice_unaligned(&v[2..]); + + let u2 = SimdT::from_slice_unaligned(&u[4..]); + let v2 = SimdT::from_slice_unaligned(&v[4..]); + + let u3 = SimdT::from_slice_unaligned(&u[6..]); + let v3 = SimdT::from_slice_unaligned(&v[6..]); + + let u4 = SimdT::from_slice_unaligned(&u[8..]); + let v4 = SimdT::from_slice_unaligned(&v[8..]); + + let u5 = SimdT::from_slice_unaligned(&u[10..]); + let v5 = SimdT::from_slice_unaligned(&v[10..]); + + let p = u0 * v0; + let p = u1.mul_adde(v1, p); + let p = u2.mul_adde(v2, p); + let p = u3.mul_adde(v3, p); + let p = u4.mul_adde(v4, p); + let p = u5.mul_adde(v5, p); + p.sum() + u[12] * v[12] + } + _ => u.iter().zip(v.iter()).map(|(u, v)| u * v).sum::(), + } +} + #[inline(always)] fn diff_op_row( block: &'static [&'static [Float]], @@ -469,13 +684,11 @@ fn diff_op_row( { let prev = prev.as_slice().unwrap(); let fut = fut.as_slice_mut().unwrap(); + assert_eq!(prev.len(), fut.len()); + assert!(prev.len() >= 2 * block.len()); for (bl, f) in block.iter().zip(fut.iter_mut()) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); + let diff = product(bl, &prev[..bl.len()]); *f = diff * idx; } @@ -489,7 +702,7 @@ fn diff_op_row( .zip(fut.iter_mut().skip(block.len())) .take(nx - 2 * block.len()) { - let diff = diag.iter().zip(window).map(|(&x, &y)| x * y).sum::(); + let diff = product(diag, window); *f = diff * idx; }