Document what compiler is doing for diffxi
parent 76f5291131
commit a33e1d37ba
@@ -102,7 +102,10 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
    prev: &[Float],
    fut: &mut [Float],
) {
    use std::convert::TryInto;
    #[inline(never)]
    /// This prevents code bloat; both the start and end blocks give
    /// a matrix multiplication with the same matrix sizes
    fn dedup_matmul<const M: usize, const N: usize>(
        c: &mut ColVector<Float, M>,
        a: &Matrix<Float, M, N>,
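The doc comment above explains the reason for #[inline(never)]: the start block and the end block both boil down to a matrix multiplication with the same matrix sizes, so routing both through one non-inlined helper lets the compiler emit that kernel once instead of duplicating it at each call site. Below is a minimal standalone sketch of the same pattern; block_matmul and the plain-array types are hypothetical stand-ins, not the crate's Matrix/ColVector API.

type Float = f64;

// Marked #[inline(never)] so both call sites below share one monomorphized
// copy of the kernel instead of inlining (and duplicating) it twice.
#[inline(never)]
fn block_matmul<const M: usize, const N: usize>(
    out: &mut [Float; M],
    a: &[[Float; N]; M],
    b: &[Float; N],
) {
    for (o, row) in out.iter_mut().zip(a.iter()) {
        *o = row.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    }
}

fn main() {
    // The same 3x2 block matrix is applied at the start and at the end,
    // so only one copy of block_matmul::<3, 2> ends up in the binary.
    let block = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
    let mut start_out = [0.0; 3];
    let mut end_out = [0.0; 3];
    block_matmul(&mut start_out, &block, &[1.0, 1.0]);
    block_matmul(&mut end_out, &block, &[2.0, -1.0]);
    println!("{:?} {:?}", start_out, end_out);
}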
@@ -129,7 +132,6 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
    let (futb1, fut) = fut.split_at_mut(M);
    let (fut, futb2) = fut.split_at_mut(nx - 2 * M);

    use std::convert::TryInto;
    {
        let prev = ColVector::<_, N>::map_to_col(prev.array_windows::<N>().next().unwrap());
        let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap());
@@ -138,19 +140,39 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
        *fut *= idx;
    }

    // The window needs to be aligned to the diagonal elements,
    // based on the block size
    let window_elems_to_skip = M - ((D - 1) / 2);

    for (window, f) in prev[window_elems_to_skip..]
        .array_windows::<D>()
        .zip(fut.array_chunks_mut::<1>())
    {
        let fut = ColVector::<_, 1>::map_to_col_mut(f);
        let prev = ColVector::<_, D>::map_to_col(window);
        // The window needs to be aligned to the diagonal elements,
        // based on the block size
        let window_elems_to_skip = M - ((D - 1) / 2);

        fut.matmul_float_into(&matrix.diag, prev);
        *fut *= idx;
    // The compiler is pretty clever right here. It is able to
    // inline the entire computation into a very tight loop.
    // It seems the "optimal" way is to broadcast each element of diag
    // into separate SIMD vectors.
    // Then an unroll of SIMD::lanes items from fut and prev:
    // f[0] = diag[0] * in[0] + diag[1] * in[1] + diag[2] * in[2] + ...
    // f[1] = diag[0] * in[1] + diag[1] * in[2] + diag[2] * in[3] + ...
    //        \-- fma's along the vertical axis, then horizontal
    //
    // The resulting inner loop performs:
    // one mul, D-1 fma,
    // one mul (by idx),
    // one store,
    // two integer adds (input and output offsets),
    // one test + jump
    //
    // The compiler is clever enough to combine two such unrollings and merge
    // these computations to prevent stalling
    for (window, f) in prev[window_elems_to_skip..]
        .array_windows::<D>()
        .zip(fut.array_chunks_mut::<1>())
    {
        let fut = ColVector::<_, 1>::map_to_col_mut(f);
        let prev = ColVector::<_, D>::map_to_col(window);

        fut.matmul_float_into(&matrix.diag, prev);
        *fut *= idx;
    }
    }

    {
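To make the comments in this hunk concrete: the interior loop forms, for each output element, the dot product of the D diagonal stencil coefficients with a sliding window of prev, scales it by idx, and the window start is offset by M - ((D - 1) / 2) so it lines up with the diagonal relative to the block rows. The sketch below redoes that computation with plain slices and a hypothetical diff_interior helper, not the crate's Matrix/ColVector types, making the one-mul-plus-(D-1)-fma structure per output explicit.

type Float = f64;

// Interior stencil: fut[i] = idx * sum_k diag[k] * prev[skip + i + k].
// Per output this is one multiply, D - 1 fused multiply-adds, and one
// multiply by idx, matching the operation count described in the comment.
fn diff_interior<const D: usize>(
    prev: &[Float],
    fut: &mut [Float],
    diag: &[Float; D],
    idx: Float,
    block_size: usize, // M in the original code
) {
    // Align the window to the diagonal elements based on the block size.
    let window_elems_to_skip = block_size - ((D - 1) / 2);
    for (window, f) in prev[window_elems_to_skip..]
        .windows(D)
        .zip(fut.iter_mut())
    {
        let mut t = diag[0] * window[0];
        for k in 1..D {
            t = Float::mul_add(diag[k], window[k], t);
        }
        *f = t * idx;
    }
}

fn main() {
    // Second-order central difference stencil as a stand-in diagonal.
    let diag = [-0.5, 0.0, 0.5];
    let prev: Vec<Float> = (0..10).map(|i| i as f64).collect();
    let mut fut = vec![0.0; 4];
    diff_interior(&prev, &mut fut, &diag, 1.0, 2);
    // Each interior derivative of f(x) = x comes out as 1.0.
    assert!(fut.iter().all(|&v| (v - 1.0).abs() < 1e-12));
}

With D = 3 this is exactly the per-output cost the comment describes: one multiply, D - 1 fused multiply-adds, one multiply by idx, and one store; broadcasting diag into SIMD registers and pairing two such unrollings to avoid stalls is then left to the compiler.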
@@ -123,10 +123,15 @@ impl<const M: usize, const P: usize> Matrix<Float, M, P> {
    ) {
        for i in 0..M {
            for j in 0..P {
                let mut t = 0.0;
                for k in 0..N {
                // Slightly cheaper to do first computation separately
                // rather than store zero and issue all ops as fma
                let mut t = if N == 0 {
                    0.0
                } else {
                    lhs[(i, 0)] * rhs[(0, j)]
                };
                for k in 1..N {
                    t = Float::mul_add(lhs[(i, k)], rhs[(k, j)], t);
                    // t = t + lhs[(i, k)] * rhs[(k, j)];
                }
                self[(i, j)] = t;
            }
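The new matmul body peels the k = 0 term out of each dot product so the accumulator starts from a real product instead of a stored zero, and issues every remaining term as a fused multiply-add. Here is a standalone sketch of the same pattern on plain nested arrays; the free matmul function is a hypothetical stand-in for the Matrix method above.

type Float = f64;

// out = lhs * rhs for row-major fixed-size arrays, with the first term of
// each dot product computed as a plain multiply and the rest as mul_add (fma).
fn matmul<const M: usize, const N: usize, const P: usize>(
    out: &mut [[Float; P]; M],
    lhs: &[[Float; N]; M],
    rhs: &[[Float; P]; N],
) {
    for i in 0..M {
        for j in 0..P {
            // Peel k = 0 so the accumulator is seeded with a real product.
            let mut t = if N == 0 { 0.0 } else { lhs[i][0] * rhs[0][j] };
            for k in 1..N {
                t = Float::mul_add(lhs[i][k], rhs[k][j], t);
            }
            out[i][j] = t;
        }
    }
}

fn main() {
    let a = [[1.0, 2.0], [3.0, 4.0]];
    let b = [[5.0, 6.0], [7.0, 8.0]];
    let mut c = [[0.0; 2]; 2];
    matmul(&mut c, &a, &b);
    assert_eq!(c, [[19.0, 22.0], [43.0, 50.0]]);
}

Seeding the accumulator with lhs[i][0] * rhs[0][j] avoids materializing the zero, which the source comment describes as slightly cheaper than storing zero and issuing every term as an fma.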