From c104082ac0430c7cced18306647ee55c1701a06f Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 19:18:34 +0100 Subject: [PATCH 01/35] add matrix type --- sbp/src/operators/algos.rs | 122 +++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a8dc30b..75fcae6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,5 +1,127 @@ use super::*; +pub(crate) mod constmatrix { + /// A row-major matrix + #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + pub struct Matrix { + data: [[T; N]; M], + } + + impl Default for Matrix { + fn default() -> Self { + use std::mem::MaybeUninit; + let mut d: [[MaybeUninit; N]; M] = unsafe { MaybeUninit::uninit().assume_init() }; + + for row in d.iter_mut() { + for item in row.iter_mut() { + *item = MaybeUninit::new(T::default()); + } + } + + let data = unsafe { std::mem::transmute_copy::<_, [[T; N]; M]>(&d) }; + Self { data } + } + } + + impl core::ops::Index<(usize, usize)> for Matrix { + type Output = T; + #[inline(always)] + fn index(&self, (i, j): (usize, usize)) -> &Self::Output { + &self.data[i][j] + } + } + + impl core::ops::IndexMut<(usize, usize)> for Matrix { + #[inline(always)] + fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { + &mut self.data[i][j] + } + } + + impl core::ops::Index for Matrix { + type Output = [T; N]; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.data[i] + } + } + + impl core::ops::IndexMut for Matrix { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.data[i] + } + } + + impl Matrix { + pub const fn new(data: [[T; N]; M]) -> Self { + Self { data } + } + pub const fn nrows(&self) -> usize { + M + } + pub const fn ncols(&self) -> usize { + N + } + pub fn matmul(&self, other: &Matrix) -> Matrix + where + T: Default + core::ops::AddAssign, + for<'f> &'f T: std::ops::Mul, + { + let mut out = Matrix::default(); + self.matmul_into(other, &mut out); + out + } + pub fn matmul_into( + &self, + other: &Matrix, + out: &mut Matrix, + ) where + T: Default + core::ops::AddAssign, + for<'f> &'f T: std::ops::Mul, + { + *out = Default::default(); + for i in 0..M { + for j in 0..P { + for k in 0..N { + out[(i, j)] += &self[(i, k)] * &other[(k, j)]; + } + } + } + } + pub fn iter_rows( + &self, + ) -> impl ExactSizeIterator + DoubleEndedIterator { + (0..M).map(move |i| &self[i]) + } + } + + #[cfg(test)] + mod tests { + use super::{super::*, *}; + #[test] + fn construct_copy_type() { + let _m0 = Matrix::::default(); + let _m1: Matrix = Matrix::default(); + + let _m2 = Matrix::new([[1, 2], [3, 4]]); + } + #[test] + fn construct_non_copy() { + let _m = Matrix::::default(); + } + + #[test] + fn matmul() { + let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); + + let m3 = m1.matmul(&m2); + assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); + } + } +} + #[inline(always)] pub(crate) fn diff_op_1d( block: &[&[Float]], From 94e8fb5b7cccff90aeff6916d9fc9df6dc8a540b Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 20:59:11 +0100 Subject: [PATCH 02/35] checkpoint --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 183 +++++++++++++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index e8badea..5b3eaa2 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,4 +1,5 @@ 
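// Illustrative sketch (not part of the patches): intended use of the const-generic
// `Matrix<T, M, N>` from PATCH 01 above. The dimensions are type parameters, so a
// shape mismatch in `matmul` is a compile-time error rather than a runtime panic.
// Relies only on the bounds PATCH 01 states (`T: Default + AddAssign` and
// `&T: Mul<&T, Output = T>`), which `f64` satisfies; the function name is illustrative.
fn matmul_shape_check() {
    let a = Matrix::new([[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]]); // Matrix<f64, 2, 3>
    let b = Matrix::new([[1.0_f64, 0.0], [0.0, 1.0], [1.0, 1.0]]); // Matrix<f64, 3, 2>
    let c = a.matmul(&b); // (2x3)*(3x2): inner dimensions must agree, result is 2x2
    assert_eq!(c, Matrix::new([[4.0, 5.0], [10.0, 11.0]]));
    // `a.matmul(&a)` would not type-check: a 2x3 matrix cannot be multiplied by a 2x3 matrix.
}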
#![feature(core_intrinsics)] +#![feature(array_windows)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 75fcae6..93657e3 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -6,6 +6,8 @@ pub(crate) mod constmatrix { pub struct Matrix { data: [[T; N]; M], } + pub type RowVector = Matrix; + pub type ColVector = Matrix; impl Default for Matrix { fn default() -> Self { @@ -89,11 +91,61 @@ pub(crate) mod constmatrix { } } } + pub fn iter(&self) -> impl ExactSizeIterator + DoubleEndedIterator { + (0..N * M).map(move |x| { + let i = x / N; + let j = x % N; + &self[(i, j)] + }) + } + pub fn iter_mut(&mut self) -> impl Iterator { + self.data.iter_mut().flatten() + } pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { (0..M).map(move |i| &self[i]) } + + pub fn flip(&self) -> Self + where + T: Default + Clone, + { + let mut v = Self::default(); + for i in 0..M { + for j in 0..N { + v[(i, j)] = self[(N - 1 - i, M - 1 - j)].clone() + } + } + v + } + } + + impl ColVector { + pub fn map_to_col(slice: &[T; N]) -> &ColVector { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } + } + + impl RowVector { + pub fn map_to_row(slice: &[T; N]) -> &Self { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } + } + + impl core::ops::MulAssign<&T> for Matrix + where + for<'f> T: core::ops::MulAssign<&'f T>, + { + fn mul_assign(&mut self, other: &T) { + self.iter_mut().for_each(|x| *x *= other) + } } #[cfg(test)] @@ -122,6 +174,137 @@ pub(crate) mod constmatrix { } } +pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; + +#[inline(always)] +pub(crate) fn diff_op_1d_matrix( + block: &Matrix, + diag: &RowVector, + symmetry: Symmetry, + optype: OperatorType, + prev: ArrayView1, + mut fut: ArrayViewMut1, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[0]; + assert!(nx >= 2 * M); + assert!(nx >= N); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + for (bl, f) in block.iter_rows().zip(&mut fut) { + let diff = bl + .iter() + .zip(prev.iter()) + .map(|(x, y)| x * y) + .sum::(); + *f = diff * idx; + } + + // The window needs to be aligned to the diagonal elements, + // based on the block size + let window_elems_to_skip = M - ((D - 1) / 2); + + for (window, f) in prev + .windows(D) + .into_iter() + .skip(window_elems_to_skip) + .zip(fut.iter_mut().skip(M)) + .take(nx - 2 * M) + { + let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); + *f = diff * idx; + } + + for (bl, f) in block.iter_rows().zip(fut.iter_mut().rev()) { + let diff = bl + .iter() + .zip(prev.iter().rev()) + .map(|(x, y)| x * y) + .sum::(); + + *f = idx + * if symmetry == Symmetry::Symmetric { + diff + } else { + -diff + }; + } +} + +#[inline(always)] +pub(crate) fn diff_op_1d_slice_matrix( + block: &Matrix, + diag: &RowVector, + symmetry: Symmetry, + optype: OperatorType, + prev: &[Float], + fut: &mut [Float], +) { + assert_eq!(prev.len(), fut.len()); + let nx = prev.len(); + assert!(nx >= 2 * M); + assert!(nx >= N); + let prev = &prev[..nx]; + let fut = &mut fut[..nx]; + + let dx = if optype == 
OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + use std::convert::TryInto; + { + let prev = ColVector::<_, N>::map_to_col((&prev[0..N]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); + + block.matmul_into(prev, fut); + *fut *= &idx; + } + + // The window needs to be aligned to the diagonal elements, + // based on the block size + let window_elems_to_skip = M - ((D - 1) / 2); + + for (window, f) in prev + .array_windows::() + .skip(window_elems_to_skip) + .zip(fut.iter_mut().skip(M)) + .take(nx - 2 * M) + { + let fut = ColVector::::map_to_col_mut(unsafe { + std::mem::transmute::<&mut Float, &mut [Float; 1]>(f) + }); + let prev = ColVector::<_, D>::map_to_col(window); + + diag.matmul_into(prev, fut); + *fut *= &idx; + } + + let flipped = { + let mut flipped = block.flip(); + if symmetry != Symmetry::Symmetric { + flipped *= &-1.0; + } + flipped + }; + + { + let prev = ColVector::<_, N>::map_to_col((&prev[nx - N..]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); + + flipped.matmul_into(prev, fut); + *fut *= &idx; + } +} + #[inline(always)] pub(crate) fn diff_op_1d( block: &[&[Float]], From 30c563c19d95cc56b096f7deb2804fb15d70ea3f Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 21:27:25 +0100 Subject: [PATCH 03/35] working d1 SBP4 --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 12 ++++++------ sbp/src/operators/traditional4.rs | 30 +++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 5b3eaa2..1d09854 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,5 +1,6 @@ #![feature(core_intrinsics)] #![feature(array_windows)] +#![feature(array_chunks)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 93657e3..3fd83bf 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -114,7 +114,7 @@ pub(crate) mod constmatrix { let mut v = Self::default(); for i in 0..M { for j in 0..N { - v[(i, j)] = self[(N - 1 - i, M - 1 - j)].clone() + v[(i, j)] = self[(M - 1 - i, N - 1 - j)].clone() } } v @@ -276,12 +276,11 @@ pub(crate) fn diff_op_1d_slice_matrix() .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(M)) + .zip(fut.array_chunks_mut::<1>().skip(M)) .take(nx - 2 * M) { - let fut = ColVector::::map_to_col_mut(unsafe { - std::mem::transmute::<&mut Float, &mut [Float; 1]>(f) - }); + // impl From here? 
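// `f` is now the `&mut [Float; 1]` yielded by `array_chunks_mut::<1>()`, so the
// `&mut Float -> &mut [Float; 1]` transmute removed above is no longer needed:
// `map_to_col_mut` reinterprets the one-element chunk as a 1x1 column vector, and the
// diagonal stencil goes through the same `matmul_into` path as the block rows.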
+ let fut = ColVector::::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); diag.matmul_into(prev, fut); @@ -297,7 +296,8 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col((&prev[nx - N..]).try_into().unwrap()); + let prev = prev.array_windows::().last().unwrap(); + let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); flipped.matmul_into(prev, fut); diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 649c43c..c29944d 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -10,10 +10,14 @@ impl SBP4 { const HBLOCK: &'static [Float] = &[ 17.0 / 48.0, 59.0 / 48.0, 43.0 / 48.0, 49.0 / 48.0, ]; + #[rustfmt::skip] const DIAG: &'static [Float] = &[ 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0, ]; + + const DIAG_MATRIX: super::RowVector = + super::RowVector::new([[1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0]]); #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0], @@ -21,6 +25,13 @@ impl SBP4 { &[4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], &[3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]; + #[rustfmt::skip] + const BLOCK_MATRIX: super::Matrix = super::Matrix::new([ + [-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0, 0.0, 0.0], + [-1.0/2.0, 0.0, 1.0/2.0, 0.0, 0.0, 0.0], + [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], + [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] + ]); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -71,6 +82,22 @@ impl SbpOperator1d for SBP4 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &SBP4::BLOCK_MATRIX, + &SBP4::DIAG_MATRIX, + super::Symmetry::AntiSymmetric, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for SBP4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -81,7 +108,8 @@ impl SbpOperator2d for SBP4 { match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { - diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + diff_op_row_local(prev, fut) } ([1, _], [1, _]) => { diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); From 36293e75e61ee2834fd009555974afc103844f82 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 21:59:13 +0100 Subject: [PATCH 04/35] 10% instr reduction with fast_ intr --- sbp/src/operators/algos.rs | 53 ++++++++++++++++++++++--------- sbp/src/operators/traditional4.rs | 4 ++- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 3fd83bf..26686e0 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -59,9 +59,11 @@ pub(crate) mod constmatrix { pub const fn new(data: [[T; N]; M]) -> Self { Self { data } } + #[inline] pub const fn nrows(&self) -> usize { M } + #[inline] pub const fn ncols(&self) -> usize { N } @@ -82,12 +84,13 @@ pub(crate) mod constmatrix { T: Default + core::ops::AddAssign, for<'f> &'f T: std::ops::Mul, { - *out = Default::default(); for i in 0..M { for j in 0..P { + let mut t = T::default(); for k in 0..N { - out[(i, j)] += &self[(i, k)] * 
&other[(k, j)]; + t += &self[(i, k)] * &other[(k, j)]; } + out[(i, j)] = t; } } } @@ -148,6 +151,34 @@ pub(crate) mod constmatrix { } } + use super::Float; + impl Matrix { + pub fn matmul_fast( + &self, + other: &Matrix, + ) -> Matrix { + let mut out = Matrix::default(); + self.matmul_into_fast(other, &mut out); + out + } + pub fn matmul_into_fast( + &self, + other: &Matrix, + out: &mut Matrix, + ) { + use core::intrinsics::{fadd_fast, fmul_fast}; + for i in 0..M { + for j in 0..P { + let mut t = 0.0; + for k in 0..N { + t = unsafe { fadd_fast(t, fmul_fast(self[(i, k)], other[(k, j)])) } + } + out[(i, j)] = t; + } + } + } + } + #[cfg(test)] mod tests { use super::{super::*, *}; @@ -240,8 +271,8 @@ pub(crate) fn diff_op_1d_matrix( #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, + endblock: &Matrix, diag: &RowVector, - symmetry: Symmetry, optype: OperatorType, prev: &[Float], fut: &mut [Float], @@ -262,10 +293,10 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col((&prev[0..N]).try_into().unwrap()); + let prev = ColVector::<_, N>::map_to_col(prev.array_windows::().nth(0).unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); - block.matmul_into(prev, fut); + block.matmul_into_fast(prev, fut); *fut *= &idx; } @@ -283,24 +314,16 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into(prev, fut); + diag.matmul_into_fast(prev, fut); *fut *= &idx; } - let flipped = { - let mut flipped = block.flip(); - if symmetry != Symmetry::Symmetric { - flipped *= &-1.0; - } - flipped - }; - { let prev = prev.array_windows::().last().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); - flipped.matmul_into(prev, fut); + endblock.matmul_into_fast(prev, fut); *fut *= &idx; } } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index c29944d..41f3c03 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -83,14 +83,16 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); + flipmatrix *= &-1.0; for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) { super::diff_op_1d_slice_matrix( &SBP4::BLOCK_MATRIX, + &flipmatrix, &SBP4::DIAG_MATRIX, - super::Symmetry::AntiSymmetric, super::OperatorType::Normal, p.as_slice().unwrap(), f.as_slice_mut().unwrap(), From bcda26a51297e437eab0cf0fa998407fe6583384 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 22:31:54 +0100 Subject: [PATCH 05/35] 15% reductions in instr count --- sbp/src/operators/algos.rs | 129 +++++++++++++++++++++++++++---------- 1 file changed, 96 insertions(+), 33 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 26686e0..9717a98 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -151,34 +151,6 @@ pub(crate) mod constmatrix { } } - use super::Float; - impl Matrix { - pub fn matmul_fast( - &self, - other: &Matrix, - ) -> Matrix { - let mut out = Matrix::default(); - self.matmul_into_fast(other, &mut out); - out - } - pub fn matmul_into_fast( - &self, - other: &Matrix, - out: &mut Matrix, - ) { - use core::intrinsics::{fadd_fast, fmul_fast}; - for i in 0..M { - for j in 0..P { - let mut t = 0.0; - for k in 0..N { - t = unsafe { 
fadd_fast(t, fmul_fast(self[(i, k)], other[(k, j)])) } - } - out[(i, j)] = t; - } - } - } - } - #[cfg(test)] mod tests { use super::{super::*, *}; @@ -268,6 +240,90 @@ pub(crate) fn diff_op_1d_matrix( } } +mod fastfloat { + use super::*; + #[repr(transparent)] + #[derive(Debug, PartialEq, Default)] + pub(crate) struct FastFloat(Float); + + use core::intrinsics::{fadd_fast, fmul_fast}; + + impl core::ops::Mul for FastFloat { + type Output = Self; + fn mul(self, o: Self) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o.0)) } + } + } + + impl core::ops::MulAssign for FastFloat { + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::MulAssign<&FastFloat> for FastFloat { + fn mul_assign(&mut self, o: &FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::MulAssign for &mut FastFloat { + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + impl core::ops::MulAssign<&FastFloat> for &mut FastFloat { + fn mul_assign(&mut self, o: &FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::AddAssign for FastFloat { + fn add_assign(&mut self, o: Self) { + unsafe { + self.0 = fadd_fast(self.0, o.0); + } + } + } + + impl core::ops::Mul for &FastFloat { + type Output = FastFloat; + fn mul(self, o: Self) -> Self::Output { + unsafe { FastFloat(fmul_fast(self.0, o.0)) } + } + } + + impl core::ops::MulAssign<&Float> for FastFloat { + fn mul_assign(&mut self, o: &Float) { + unsafe { + self.0 = fmul_fast(self.0, *o); + } + } + } + impl core::ops::MulAssign<&Float> for &mut FastFloat { + fn mul_assign(&mut self, o: &Float) { + unsafe { + self.0 = fmul_fast(self.0, *o); + } + } + } + + impl From for FastFloat { + fn from(f: Float) -> Self { + Self(f) + } + } +} +use fastfloat::FastFloat; + #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, @@ -277,6 +333,13 @@ pub(crate) fn diff_op_1d_slice_matrix>(block) }; + let endblock = unsafe { transmute::<_, &Matrix>(endblock) }; + let diag = unsafe { transmute::<_, &RowVector>(diag) }; + let prev = unsafe { transmute::<_, &[FastFloat]>(prev) }; + let fut = unsafe { transmute::<_, &mut [FastFloat]>(fut) }; + assert_eq!(prev.len(), fut.len()); let nx = prev.len(); assert!(nx >= 2 * M); @@ -289,14 +352,14 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().nth(0).unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); - block.matmul_into_fast(prev, fut); + block.matmul_into(prev, fut); *fut *= &idx; } @@ -311,10 +374,10 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); + let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into_fast(prev, fut); + diag.matmul_into(prev, fut); *fut *= &idx; } @@ -323,7 +386,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); - endblock.matmul_into_fast(prev, fut); + endblock.matmul_into(prev, fut); *fut *= &idx; } } From db552af4ff85a43bc938713fb68c98342bd71f5e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 23:20:52 +0100 Subject: [PATCH 06/35] add blockend with weird caveat --- sbp/src/operators/traditional4.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 41f3c03..4e5abad 100644 
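// Illustrative sketch (not part of the patches): the `&[Float] -> &[FastFloat]` and
// matrix transmutes introduced in PATCH 05 above are layout-sound only because
// `FastFloat` is `#[repr(transparent)]` over `Float`. A hypothetical helper such as
// the one below (name and placement are illustrative) could localise that argument
// instead of repeating raw `transmute`s at each call site:
impl FastFloat {
    /// Reinterpret a `Float` slice; size and alignment match by `repr(transparent)`.
    pub(crate) fn slice_from_floats(s: &[Float]) -> &[FastFloat] {
        // SAFETY: FastFloat is a transparent newtype over Float.
        unsafe { core::slice::from_raw_parts(s.as_ptr().cast(), s.len()) }
    }
}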
--- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -32,6 +32,13 @@ impl SBP4 { [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: super::Matrix = super::Matrix::new([ + [4.0/49.0, -32.0/49.0, 0.0, 59.0/98.0, 0.0, -3.0/98.0 ], + [0.0, 4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], + [0.0, 0.0, 0.0, -1.0/2.0, 0.0, 1.0/2.0], + [0.0, 0.0, 3.0/34.0, 4.0/17.0, -59.0/34.0, 24.0/17.0], + ]); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -83,15 +90,18 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + // Magic two lines that prevents or enables optimisation + // (doubles instructions when not included) let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); flipmatrix *= &-1.0; + for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) { super::diff_op_1d_slice_matrix( &SBP4::BLOCK_MATRIX, - &flipmatrix, + &SBP4::BLOCKEND_MATRIX, &SBP4::DIAG_MATRIX, super::OperatorType::Normal, p.as_slice().unwrap(), From 4c2daf59334f3a44311f84e26a8e48a5c9302a07 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 00:08:31 +0100 Subject: [PATCH 07/35] minor thingys --- sbp/src/operators/algos.rs | 14 ++++++-------- sbp/src/operators/traditional4.rs | 13 ++++++++++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 9717a98..917d9e0 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -3,6 +3,7 @@ use super::*; pub(crate) mod constmatrix { /// A row-major matrix #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + #[repr(transparent)] pub struct Matrix { data: [[T; N]; M], } @@ -94,12 +95,8 @@ pub(crate) mod constmatrix { } } } - pub fn iter(&self) -> impl ExactSizeIterator + DoubleEndedIterator { - (0..N * M).map(move |x| { - let i = x / N; - let j = x % N; - &self[(i, j)] - }) + pub fn iter(&self) -> impl Iterator { + self.data.iter().flatten() } pub fn iter_mut(&mut self) -> impl Iterator { self.data.iter_mut().flatten() @@ -352,7 +349,8 @@ pub(crate) fn diff_op_1d_slice_matrix().last().unwrap(); + let prev = prev.array_windows::().next_back().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 4e5abad..352a92b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -92,7 +92,7 @@ impl SbpOperator1d for SBP4 { fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { // Magic two lines that prevents or enables optimisation // (doubles instructions when not included) - let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); + let mut flipmatrix = SBP4::BLOCK_MATRIX; flipmatrix *= &-1.0; for (p, mut f) in prev @@ -228,3 +228,14 @@ fn test_trad4() { 1e-1, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); + flipped_inverted *= &-1.0; + + assert!(flipped_inverted + .iter() + .zip(SBP4::BLOCKEND_MATRIX.iter()) + .all(|(x, y)| (x - y).abs() < 1e-3)) +} From 3d34f2e7a0feec6d985ef3a7ceeaa9735380cf77 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 00:14:56 +0100 Subject: [PATCH 08/35] add fast-float feature --- sbp/Cargo.toml | 1 + sbp/src/operators/algos.rs | 21 
+++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index a1ba176..2dd2b22 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -16,6 +16,7 @@ serde = { version = "1.0.115", optional = true, default-features = false, featur [features] # Use f32 as precision, default is f64 f32 = [] +fast-float = [] sparse = ["sprs"] serde1 = ["serde", "ndarray/serde"] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 917d9e0..88c92e7 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -319,6 +319,7 @@ mod fastfloat { } } } +#[cfg(feature = "fast-float")] use fastfloat::FastFloat; #[inline(always)] @@ -330,12 +331,19 @@ pub(crate) fn diff_op_1d_slice_matrix>(block) }; - let endblock = unsafe { transmute::<_, &Matrix>(endblock) }; - let diag = unsafe { transmute::<_, &RowVector>(diag) }; - let prev = unsafe { transmute::<_, &[FastFloat]>(prev) }; - let fut = unsafe { transmute::<_, &mut [FastFloat]>(fut) }; + #[cfg(feature = "fast-float")] + let (block, endblock, diag, prev, fut) = { + use std::mem::transmute; + unsafe { + ( + transmute::<_, &Matrix>(block), + transmute::<_, &Matrix>(endblock), + transmute::<_, &RowVector>(diag), + transmute::<_, &[FastFloat]>(prev), + transmute::<_, &mut [FastFloat]>(fut), + ) + } + }; assert_eq!(prev.len(), fut.len()); let nx = prev.len(); @@ -350,6 +358,7 @@ pub(crate) fn diff_op_1d_slice_matrix Date: Fri, 29 Jan 2021 08:28:55 +0100 Subject: [PATCH 09/35] add Matrix approach for SBP8 --- sbp/src/operators/traditional8.rs | 39 +++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 6947022..14d0d18 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -1,4 +1,4 @@ -use super::{diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d}; +use super::{diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -15,6 +15,10 @@ impl SBP8 { 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 ]; #[rustfmt::skip] + const DIAG_MATRIX: RowVector = Matrix::new([[ + 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 + ]]); + #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02], &[-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03], @@ -25,6 +29,17 @@ impl SBP8 { &[2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03], &[5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, 
-1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]; + #[rustfmt::skip] + const BLOCK_MATRIX: Matrix = Matrix::new([ + [-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02, 0.0, 0.0, 0.0, 0.0], + [-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03, 0.0, 0.0, 0.0, 0.0], + [3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02, 0.0, 0.0, 0.0, 0.0], + [1.28088632226564e-01, -4.19172130036008e-01, -5.57707021445779e-02, 0.00000000000000e+00, 1.24714160903055e-01, 2.81285212519100e-01, -3.94470423942641e-02, -1.96981310738430e-02, 0.0, 0.0, 0.0, 0.0], + [-1.82119472519009e-02, 9.09986646154550e-02, -8.51090570277506e-02, -5.43362886365301e-01, 0.00000000000000e+00, 6.37392455438558e-01, -1.02950081118829e-01, 2.98964956216039e-02, -8.65364391190110e-03, 0.0, 0.0, 0.0], + [-7.93147196245203e-02, 2.22875323171502e-01, -2.07999824391436e-02, -3.95611167748401e-01, -2.05756876210586e-01, 0.00000000000000e+00, 5.45876519966127e-01, -9.42727926638298e-02, 2.97971812952850e-02, -2.79348574643297e-03, 0.0, 0.0], + [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], + [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], + ]); } impl SbpOperator1d for SBP8 { @@ -59,6 +74,25 @@ impl SbpOperator1d for SBP8 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + let mut flipmatrix = SBP8::BLOCK_MATRIX; + flipmatrix *= &-1.0; + + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &SBP8::BLOCK_MATRIX, + &flipmatrix, + &SBP8::DIAG_MATRIX, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for SBP8 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -69,7 +103,8 @@ impl SbpOperator2d for SBP8 { match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { - diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + diff_op_row_local(prev, fut); } ([1, _], [1, _]) => { diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); From a7660281c83af9b4d1ad6db0df097369f238a95a Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 08:32:26 +0100 Subject: [PATCH 10/35] add inline to remove magic fix --- sbp/src/operators/algos.rs | 10 ++++++++-- sbp/src/operators/traditional4.rs | 5 ----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 88c92e7..115b322 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -60,11 +60,11 @@ pub(crate) mod constmatrix { pub const fn new(data: [[T; N]; 
M]) -> Self { Self { data } } - #[inline] + #[inline(always)] pub const fn nrows(&self) -> usize { M } - #[inline] + #[inline(always)] pub const fn ncols(&self) -> usize { N } @@ -95,12 +95,15 @@ pub(crate) mod constmatrix { } } } + #[inline(always)] pub fn iter(&self) -> impl Iterator { self.data.iter().flatten() } + #[inline(always)] pub fn iter_mut(&mut self) -> impl Iterator { self.data.iter_mut().flatten() } + #[inline(always)] pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { @@ -122,9 +125,11 @@ pub(crate) mod constmatrix { } impl ColVector { + #[inline(always)] pub fn map_to_col(slice: &[T; N]) -> &ColVector { unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } } + #[inline(always)] pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } } @@ -143,6 +148,7 @@ pub(crate) mod constmatrix { where for<'f> T: core::ops::MulAssign<&'f T>, { + #[inline(always)] fn mul_assign(&mut self, other: &T) { self.iter_mut().for_each(|x| *x *= other) } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 352a92b..37519e2 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -90,11 +90,6 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - // Magic two lines that prevents or enables optimisation - // (doubles instructions when not included) - let mut flipmatrix = SBP4::BLOCK_MATRIX; - flipmatrix *= &-1.0; - for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) From f7c238f6a7d0bb5489687b2e08cd9f81a7dcf1ae Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 08:51:26 +0100 Subject: [PATCH 11/35] add blockend SBP8 --- sbp/src/operators/traditional8.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 14d0d18..4b431cd 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -40,6 +40,17 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: Matrix = Matrix::new([ + [0.0035384016230155199, -0.037742950645498902, 0.19815049088886899, -0.79260196355547696, -0.0, 0.75166824372712304, -0.11941174319355199, 0.012224423573111201, -0.035091936128702303, 0.0142810403117803, 0.010833666120933699, -0.0058476727216045096], + [-0.0, 0.0038681313433548601, -0.041260067662451799, 0.21661535522787201, -0.821713248844682, -0.0, 0.75587358166357999, -0.046018121340651898, -0.076822925360096897, -0.050113507756358401, 0.087129564256063705, -0.027558761526617698], + [-0.0, -0.0, 0.0027934857464329702, -0.029797181295285, 0.094272792663829805, -0.54587651996612696, -0.0, 0.205756876210586, 0.39561116774840099, 0.020799982439143602, -0.22287532317150199, 0.079314719624520302], + [-0.0, -0.0, -0.0, 0.0086536439119011006, -0.029896495621603899, 0.102950081118829, 
-0.63739245543855805, -0.0, 0.54336288636530095, 0.085109057027750595, -0.090998664615454999, 0.01821194725190089], + [-0.0, -0.0, -0.0, -0.0, 0.019698131073842998, 0.039447042394264102, -0.28128521251909999, -0.124714160903055, -0.0, 0.055770702144577897, 0.41917213003600801, -0.128088632226564], + [-0.0, -0.0, -0.0, -0.0, -0.055988255885229599, 0.179720579323281, -0.10329058280084499, -0.13643348652854601, -0.389516189693211, -0.0, 0.54432974445498405, -0.038821808870425301], + [-0.0, -0.0, -0.0, -0.0, -0.0071669648308011497, -0.052726783847581303, 0.18675940343293501, 0.024615176293723499, -0.49400862680798402, -0.091851192507295606, -0.0, 0.43437898826698501], + [-0.0, -0.0, -0.0, -0.0, 0.020015058331576099, 0.086285816263333495, -0.34386522738887298, -0.0254881486107905, 0.78102816812674902, 0.033893192260150001, -2.2474124634140402, 1.6955436044319001], + ]); } impl SbpOperator1d for SBP8 { @@ -75,8 +86,10 @@ impl SbpOperator1d for SBP8 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + /* let mut flipmatrix = SBP8::BLOCK_MATRIX; flipmatrix *= &-1.0; + */ for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) @@ -84,7 +97,8 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi { super::diff_op_1d_slice_matrix( &SBP8::BLOCK_MATRIX, - &flipmatrix, + &SBP8::BLOCKEND_MATRIX, + //&flipmatrix, &SBP8::DIAG_MATRIX, super::OperatorType::Normal, p.as_slice().unwrap(), @@ -189,3 +203,14 @@ fn test_trad8() { 1e-4, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); + flipped_inverted *= &-1.0; + + assert!(flipped_inverted + .iter() + .zip(SBP8::BLOCKEND_MATRIX.iter()) + .all(|(x, y)| (x - y).abs() < 1e-12)) +} From 3c7cc4605a13652ed3330c9608e9d374aa1bd7d0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 16:42:49 +0100 Subject: [PATCH 12/35] simplify traits --- sbp/Cargo.toml | 1 - sbp/src/operators/algos.rs | 82 +++++++++---------------------- sbp/src/operators/traditional4.rs | 2 +- sbp/src/operators/traditional8.rs | 2 +- 4 files changed, 26 insertions(+), 61 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index 2dd2b22..9b7834f 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -10,7 +10,6 @@ approx = "0.3.2" packed_simd = { version = "0.3.3", package = "packed_simd_2" } rayon = { version = "1.3.0", optional = true } sprs = { version = "0.9.0", optional = true, default-features = false } -num-traits = "0.2.11" serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } [features] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 115b322..bd8c4cb 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,6 +1,7 @@ use super::*; pub(crate) mod constmatrix { + #![allow(unused)] /// A row-major matrix #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] #[repr(transparent)] @@ -70,8 +71,7 @@ pub(crate) mod constmatrix { } pub fn matmul(&self, other: &Matrix) -> Matrix where - T: Default + core::ops::AddAssign, - for<'f> &'f T: std::ops::Mul, + T: Copy + Default + core::ops::Add + core::ops::Mul, { let mut out = Matrix::default(); self.matmul_into(other, &mut out); @@ -82,14 +82,13 @@ pub(crate) mod constmatrix { other: &Matrix, out: &mut Matrix, ) where - T: Default + core::ops::AddAssign, - for<'f> &'f T: std::ops::Mul, + T: Copy + Default + core::ops::Add + core::ops::Mul, { for i in 0..M { for j in 0..P { let mut t = T::default(); for k in 0..N { - t += &self[(i, k)] * &other[(k, j)]; 
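// The accumulator switches from `AddAssign` over references to by-value
// `Copy + Add + Mul`; `FastFloat` only needs the by-value `Add`/`Mul` impls for this,
// so the reference-based and `AddAssign` impls are dropped from the fastfloat module
// in this same patch.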
+ t = t + self[(i, k)] * other[(k, j)]; } out[(i, j)] = t; } @@ -144,12 +143,12 @@ pub(crate) mod constmatrix { } } - impl core::ops::MulAssign<&T> for Matrix + impl core::ops::MulAssign for Matrix where - for<'f> T: core::ops::MulAssign<&'f T>, + T: Copy + core::ops::MulAssign, { #[inline(always)] - fn mul_assign(&mut self, other: &T) { + fn mul_assign(&mut self, other: T) { self.iter_mut().for_each(|x| *x *= other) } } @@ -243,22 +242,33 @@ pub(crate) fn diff_op_1d_matrix( } } +#[cfg(feature = "fast-float")] mod fastfloat { use super::*; #[repr(transparent)] - #[derive(Debug, PartialEq, Default)] + #[derive(Copy, Clone, Debug, PartialEq, Default)] pub(crate) struct FastFloat(Float); use core::intrinsics::{fadd_fast, fmul_fast}; impl core::ops::Mul for FastFloat { type Output = Self; + #[inline(always)] fn mul(self, o: Self) -> Self::Output { unsafe { Self(fmul_fast(self.0, o.0)) } } } + impl core::ops::Add for FastFloat { + type Output = Self; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o.0)) } + } + } + impl core::ops::MulAssign for FastFloat { + #[inline(always)] fn mul_assign(&mut self, o: FastFloat) { unsafe { self.0 = fmul_fast(self.0, o.0); @@ -266,60 +276,16 @@ mod fastfloat { } } - impl core::ops::MulAssign<&FastFloat> for FastFloat { - fn mul_assign(&mut self, o: &FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::MulAssign for &mut FastFloat { - fn mul_assign(&mut self, o: FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - impl core::ops::MulAssign<&FastFloat> for &mut FastFloat { - fn mul_assign(&mut self, o: &FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::AddAssign for FastFloat { - fn add_assign(&mut self, o: Self) { - unsafe { - self.0 = fadd_fast(self.0, o.0); - } - } - } - impl core::ops::Mul for &FastFloat { type Output = FastFloat; + #[inline(always)] fn mul(self, o: Self) -> Self::Output { unsafe { FastFloat(fmul_fast(self.0, o.0)) } } } - impl core::ops::MulAssign<&Float> for FastFloat { - fn mul_assign(&mut self, o: &Float) { - unsafe { - self.0 = fmul_fast(self.0, *o); - } - } - } - impl core::ops::MulAssign<&Float> for &mut FastFloat { - fn mul_assign(&mut self, o: &Float) { - unsafe { - self.0 = fmul_fast(self.0, *o); - } - } - } - impl From for FastFloat { + #[inline(always)] fn from(f: Float) -> Self { Self(f) } @@ -373,7 +339,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); block.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } // The window needs to be aligned to the diagonal elements, @@ -391,7 +357,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(window); diag.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } { @@ -400,7 +366,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); endblock.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 37519e2..af84f0b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -227,7 +227,7 @@ fn test_trad4() { #[test] fn block_equality() { let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); - flipped_inverted *= &-1.0; + flipped_inverted *= -1.0; assert!(flipped_inverted .iter() diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 4b431cd..76cd2e6 100644 --- 
a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -207,7 +207,7 @@ fn test_trad8() { #[test] fn block_equality() { let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); - flipped_inverted *= &-1.0; + flipped_inverted *= -1.0; assert!(flipped_inverted .iter() From c1335574593bf2d1a19bbed4a5c68f46486f417c Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:36:05 +0100 Subject: [PATCH 13/35] use Matrix in SBP diff --- sbp/src/operators/algos.rs | 207 ++++++++++++++++++++++++++---- sbp/src/operators/traditional4.rs | 25 +++- sbp/src/operators/traditional8.rs | 24 +++- 3 files changed, 215 insertions(+), 41 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index bd8c4cb..2b60090 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -3,7 +3,7 @@ use super::*; pub(crate) mod constmatrix { #![allow(unused)] /// A row-major matrix - #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(transparent)] pub struct Matrix { data: [[T; N]; M], @@ -11,19 +11,11 @@ pub(crate) mod constmatrix { pub type RowVector = Matrix; pub type ColVector = Matrix; - impl Default for Matrix { + impl Default for Matrix { fn default() -> Self { - use std::mem::MaybeUninit; - let mut d: [[MaybeUninit; N]; M] = unsafe { MaybeUninit::uninit().assume_init() }; - - for row in d.iter_mut() { - for item in row.iter_mut() { - *item = MaybeUninit::new(T::default()); - } + Self { + data: [[T::default(); N]; M], } - - let data = unsafe { std::mem::transmute_copy::<_, [[T; N]; M]>(&d) }; - Self { data } } } @@ -111,12 +103,12 @@ pub(crate) mod constmatrix { pub fn flip(&self) -> Self where - T: Default + Clone, + T: Default + Copy, { let mut v = Self::default(); for i in 0..M { for j in 0..N { - v[(i, j)] = self[(M - 1 - i, N - 1 - j)].clone() + v[(i, j)] = self[(M - 1 - i, N - 1 - j)] } } v @@ -163,10 +155,6 @@ pub(crate) mod constmatrix { let _m2 = Matrix::new([[1, 2], [3, 4]]); } - #[test] - fn construct_non_copy() { - let _m = Matrix::::default(); - } #[test] fn matmul() { @@ -184,8 +172,8 @@ pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; #[inline(always)] pub(crate) fn diff_op_1d_matrix( block: &Matrix, - diag: &RowVector, - symmetry: Symmetry, + blockend: &Matrix, + diag: &RowVector, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -226,19 +214,14 @@ pub(crate) fn diff_op_1d_matrix( *f = diff * idx; } - for (bl, f) in block.iter_rows().zip(fut.iter_mut().rev()) { + for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { let diff = bl .iter() - .zip(prev.iter().rev()) + .zip(prev.iter()) .map(|(x, y)| x * y) .sum::(); - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; + *f = diff * idx; } } @@ -708,6 +691,174 @@ fn product_fast<'a>( }) } +#[inline(always)] +pub(crate) fn diff_op_col_matrix( + block: &Matrix, + block2: &Matrix, + diag: &RowVector, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + #[cfg(not(feature = "f32"))] + type SimdT = packed_simd::f64x8; + #[cfg(feature = "f32")] + type SimdT = packed_simd::f32x16; + + let ny = prev.shape()[0]; + // How many elements 
that can be simdified + let simdified = SimdT::lanes() * (ny / SimdT::lanes()); + + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let fut_base_ptr = fut.as_mut_ptr(); + let fut_stride = fut.strides()[1]; + let fut_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } + }; + + let prev_base_ptr = prev.as_ptr(); + let prev_stride = prev.strides()[1]; + let prev_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } + }; + + // Not algo necessary, but gives performance increase + assert_eq!(fut_stride, prev_stride); + + // First block + { + for (ifut, &bl) in block.iter_rows().enumerate() { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in bl.iter().enumerate() { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } + + // Diagonal elements + { + for ifut in M..nx - M { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (id, &d) in diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + } + f *= idx; + unsafe { + // puts simd along stride 1, j never goes past end of slice + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + let mut f = 0.0; + for (id, &d) in diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + unsafe { + f += d * *prev_ptr(j, offset); + } + } + unsafe { + *fut_ptr(j, ifut) = idx * f; + } + } + } + } + + // End block + { + for (ifut, &bl) in (nx - M..nx).zip(block2.iter_rows()) { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in (nx - N..nx).zip(bl.iter()) { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in (nx - N..nx).zip(bl.iter()) { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_row( block: &'static [&'static [Float]], diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index af84f0b..57afd35 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -1,4 +1,4 @@ 
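// Illustrative sketch (not part of the patches): the core pattern inside
// `diff_op_col_matrix` above, in isolation. One stencil coefficient row is applied
// across eight adjacent columns per iteration using packed_simd (the crate is
// `packed_simd_2`, renamed to `packed_simd` in sbp/Cargo.toml). The function and
// parameter names here are illustrative only, and `f64x8` assumes the default
// `Float = f64` configuration.
fn stencil_row_simd(coeffs: &[f64], rows: &[&[f64]], j: usize, out: &mut [f64]) {
    use packed_simd::f64x8;
    let mut acc = f64x8::splat(0.0);
    for (&c, row) in coeffs.iter().zip(rows) {
        // unaligned load of 8 consecutive column values from this input row
        let v = f64x8::from_slice_unaligned(&row[j..j + 8]);
        // fused multiply-add per lane: acc += c * v
        acc = v.mul_adde(f64x8::splat(c), acc);
    }
    acc.write_to_slice_unaligned(&mut out[j..j + 8]);
}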
-use super::{diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d}; +use super::{SbpOperator1d, SbpOperator2d}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -55,10 +55,10 @@ impl SBP4 { impl SbpOperator1d for SBP4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -104,6 +104,17 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi ) } } +fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { + let optype = super::OperatorType::Normal; + super::diff_op_col_matrix( + &SBP4::BLOCK_MATRIX, + &SBP4::BLOCKEND_MATRIX, + &SBP4::DIAG_MATRIX, + optype, + prev, + fut, + ) +} impl SbpOperator2d for SBP4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { @@ -112,14 +123,14 @@ impl SbpOperator2d for SBP4 { let symmetry = super::Symmetry::AntiSymmetric; let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); diff_op_row_local(prev, fut) } ([1, _], [1, _]) => { - diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + //diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + diff_op_col_local(prev, fut) } ([_, _], [_, _]) => { // Fallback, work row by row diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 76cd2e6..88631b3 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -55,10 +55,10 @@ impl SBP8 { impl SbpOperator1d for SBP8 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -107,6 +107,18 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi } } +fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { + let optype = super::OperatorType::Normal; + super::diff_op_col_matrix( + &SBP8::BLOCK_MATRIX, + &SBP8::BLOCKEND_MATRIX, + &SBP8::DIAG_MATRIX, + optype, + prev, + fut, + ) +} + impl SbpOperator2d for SBP8 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -114,14 +126,14 @@ impl SbpOperator2d for SBP8 { let symmetry = super::Symmetry::AntiSymmetric; let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); diff_op_row_local(prev, fut); } ([1, _], [1, _]) => { - diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + //diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + diff_op_col_local(prev, fut) } ([_, _], [_, _]) => { // Fallback, work row by row From db94caf2b2495df8855d2a3333b3a30943253cd0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:47:13 +0100 Subject: [PATCH 14/35] change repr of Matrix --- sbp/src/operators/algos.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 2b60090..659ce42 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -4,7 +4,7 
@@ pub(crate) mod constmatrix { #![allow(unused)] /// A row-major matrix #[derive(Debug, Clone, PartialEq, Eq, Hash)] - #[repr(transparent)] + #[repr(C)] pub struct Matrix { data: [[T; N]; M], } @@ -318,7 +318,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().nth(0).unwrap()); + let prev = ColVector::<_, N>::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); block.matmul_into(prev, fut); @@ -335,7 +335,6 @@ pub(crate) fn diff_op_1d_slice_matrix().skip(M)) .take(nx - 2 * M) { - // impl From here? let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); From 00f3ba6a01eeffd76dd815cbb69e70b7d1df91c8 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:59:35 +0100 Subject: [PATCH 15/35] use split_at_mut --- sbp/src/operators/algos.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 659ce42..ad13240 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -316,10 +316,14 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); - let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); block.matmul_into(prev, fut); *fut *= idx; @@ -332,8 +336,7 @@ pub(crate) fn diff_op_1d_slice_matrix() .skip(window_elems_to_skip) - .zip(fut.array_chunks_mut::<1>().skip(M)) - .take(nx - 2 * M) + .zip(fut.array_chunks_mut::<1>()) { let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); @@ -345,7 +348,7 @@ pub(crate) fn diff_op_1d_slice_matrix().next_back().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); - let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); endblock.matmul_into(prev, fut); *fut *= idx; From 481f2d607ed230a04fd78e97bc9255b9f9413d74 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sat, 30 Jan 2021 15:58:03 +0100 Subject: [PATCH 16/35] remove a lot of unsafe, lost perf --- sbp/src/operators/algos.rs | 137 ++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index ad13240..1928f2d 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -700,7 +700,7 @@ pub(crate) fn diff_op_col_matrix diag: &RowVector, optype: OperatorType, prev: ArrayView2, - mut fut: ArrayViewMut2, + fut: ArrayViewMut2, ) { assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[1]; @@ -728,12 +728,7 @@ pub(crate) fn diff_op_col_matrix let half_diag_width = (D - 1) / 2; assert!(half_diag_width <= M); - let fut_base_ptr = fut.as_mut_ptr(); let fut_stride = fut.strides()[1]; - let fut_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } - }; let prev_base_ptr = prev.as_ptr(); let prev_stride = prev.strides()[1]; @@ -745,88 +740,92 @@ pub(crate) fn diff_op_col_matrix // Not algo necessary, but gives performance increase assert_eq!(fut_stride, prev_stride); + use ndarray::Axis; + let (mut fut1, fut) = fut.split_at(Axis(1), M); + let (mut fut, mut fut2) = fut.split_at(Axis(1), nx - 2 * M); + // First block { - for (ifut, &bl) in block.iter_rows().enumerate() { - for j in 
(0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let prev = prev.slice(ndarray::s![.., ..N]); + let (prevb, prevl) = prev.split_at(Axis(0), simdified); + for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(block.iter_rows()) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes()); + + for (fut, prev) in fut.by_ref().zip(prev.by_ref()) { let mut f = SimdT::splat(0.0); - for (iprev, &bl) in bl.iter().enumerate() { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(Axis(1))) { + let prev = prev.to_slice().unwrap(); + let prev = SimdT::from_slice_unaligned(prev); + f = prev.mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; + for (fut, prev) in fut + .into_remainder() + .iter_mut() + .zip(prevl.axis_iter(Axis(0))) + { + let mut f = 0.0; + for (bl, prev) in bl.iter().zip(prev.iter()) { + f += bl * prev; } + *fut = f * idx; } } } // Diagonal elements { - for ifut in M..nx - M { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let window_elems_to_skip = M - ((D - 1) / 2); + let prev = prev.slice(ndarray::s![.., window_elems_to_skip..]); + let prev = prev.windows((ny, D)); + for (mut fut, prev) in fut.axis_iter_mut(Axis(1)).zip(prev) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + + let (prevb, prevl) = prev.split_at(Axis(0), simdified); + let prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes()); + + for (fut, prev) in fut.by_ref().zip(prev) { let mut f = SimdT::splat(0.0); - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + for (&d, prev) in diag.iter().zip(prev.axis_iter(Axis(1))) { + let prev = prev.to_slice().unwrap(); + let prev = SimdT::from_slice_unaligned(prev); + f = prev.mul_adde(SimdT::splat(d), f); } f *= idx; - unsafe { - // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { + + for (fut, prev) in fut + .into_remainder() + .into_iter() + .zip(prevl.axis_iter(Axis(0))) + { let mut f = 0.0; - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); - } - } - unsafe { - *fut_ptr(j, ifut) = idx * f; + for (&d, prev) in diag.iter().zip(prev) { + f += d * prev; } + *fut = idx * f; } } } // End block { - for (ifut, &bl) in (nx - M..nx).zip(block2.iter_rows()) { - for j in (0..simdified).step_by(SimdT::lanes()) { + for (mut fut, &bl) in 
fut2.axis_iter_mut(Axis(1)).zip(block2.iter_rows()) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + + for (fut, j) in fut.by_ref().zip((0..simdified).step_by(SimdT::lanes())) { let index_to_simd = |i| unsafe { // j never moves past end of slice due to step_by and // rounding down @@ -840,21 +839,15 @@ pub(crate) fn diff_op_col_matrix f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { + for (fut, j) in fut.into_remainder().into_iter().zip(simdified..ny) { unsafe { let mut f = 0.0; for (iprev, bl) in (nx - N..nx).zip(bl.iter()) { f += bl * *prev_ptr(j, iprev); } - *fut_ptr(j, ifut) = f * idx; + *fut = f * idx; } } } From b0e1ec62f86b3e19a98cdc066b3288fcd74be8bb Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 13:23:15 +0100 Subject: [PATCH 17/35] change order in matmul_into --- sbp/src/operators/algos.rs | 72 +++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 1928f2d..1681501 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -61,31 +61,6 @@ pub(crate) mod constmatrix { pub const fn ncols(&self) -> usize { N } - pub fn matmul(&self, other: &Matrix) -> Matrix - where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - let mut out = Matrix::default(); - self.matmul_into(other, &mut out); - out - } - pub fn matmul_into( - &self, - other: &Matrix, - out: &mut Matrix, - ) where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - for i in 0..M { - for j in 0..P { - let mut t = T::default(); - for k in 0..N { - t = t + self[(i, k)] * other[(k, j)]; - } - out[(i, j)] = t; - } - } - } #[inline(always)] pub fn iter(&self) -> impl Iterator { self.data.iter().flatten() @@ -135,6 +110,45 @@ pub(crate) mod constmatrix { } } + impl Matrix { + #[inline(always)] + pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) + where + T: Default + Copy + core::ops::Mul + core::ops::Add, + { + for i in 0..M { + for j in 0..P { + let mut t = T::default(); + for k in 0..N { + t = t + lhs[(i, k)] * rhs[(k, j)]; + } + self[(i, j)] = t; + } + } + } + } + + macro_rules! impl_op_mul_mul { + ($lhs:ty, $rhs:ty) => { + impl core::ops::Mul<$rhs> for $lhs + where + T: Copy + Default + core::ops::Add + core::ops::Mul, + { + type Output = Matrix; + fn mul(self, rhs: $rhs) -> Self::Output { + let mut out = Matrix::default(); + out.matmul_into(&self, &rhs); + out + } + } + }; + } + + impl_op_mul_mul! { Matrix, Matrix } + impl_op_mul_mul! { &Matrix, Matrix } + impl_op_mul_mul! { Matrix, &Matrix } + impl_op_mul_mul! 
{ &Matrix, &Matrix } + impl core::ops::MulAssign for Matrix where T: Copy + core::ops::MulAssign, @@ -161,7 +175,7 @@ pub(crate) mod constmatrix { let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); - let m3 = m1.matmul(&m2); + let m3 = m1 * m2; assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); } } @@ -325,7 +339,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); - block.matmul_into(prev, fut); + fut.matmul_into(block, prev); *fut *= idx; } @@ -341,7 +355,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into(prev, fut); + fut.matmul_into(diag, prev); *fut *= idx; } @@ -350,7 +364,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); - endblock.matmul_into(prev, fut); + fut.matmul_into(endblock, prev); *fut *= idx; } } From 45e4d515136f09cc0942205644f85924bf547850 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 13:32:53 +0100 Subject: [PATCH 18/35] remove a closure --- sbp/src/operators/algos.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 1681501..6c579e5 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -73,7 +73,7 @@ pub(crate) mod constmatrix { pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { - (0..M).map(move |i| &self[i]) + self.data.iter() } pub fn flip(&self) -> Self From c660354c3fcd738ec06f575f24116d235f3599fe Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 14:54:15 +0100 Subject: [PATCH 19/35] Matrix for Upwind4 --- sbp/src/operators/algos.rs | 66 +++++++++++++++++++++++++++++++ sbp/src/operators/traditional4.rs | 5 +-- sbp/src/operators/traditional8.rs | 5 +-- sbp/src/operators/upwind4.rs | 60 ++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 19 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 6c579e5..a504b4c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -178,6 +178,71 @@ pub(crate) mod constmatrix { let m3 = m1 * m2; assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); } + #[test] + fn iter() { + let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let mut iter = m.iter(); + assert_eq!(iter.next(), Some(&1)); + assert_eq!(iter.next(), Some(&2)); + assert_eq!(iter.next(), Some(&3)); + assert_eq!(iter.next(), Some(&4)); + assert_eq!(iter.next(), Some(&5)); + assert_eq!(iter.next(), Some(&6)); + assert_eq!(iter.next(), None); + } + } + + mod approx { + use super::Matrix; + use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; + + impl AbsDiffEq for Matrix + where + T: AbsDiffEq, + { + type Epsilon = T::Epsilon; + fn default_epsilon() -> Self::Epsilon { + T::default_epsilon() + } + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) + } + } + impl RelativeEq for Matrix + where + T: RelativeEq, + Self::Epsilon: Copy, + { + fn default_max_relative() -> Self::Epsilon { + T::default_max_relative() + } + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.iter() + .zip(other.iter()) 
+ .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) + } + } + impl UlpsEq for Matrix + where + T: UlpsEq, + Self::Epsilon: Copy, + { + fn default_max_ulps() -> u32 { + T::default_max_ulps() + } + fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) + } + } } } @@ -228,6 +293,7 @@ pub(crate) fn diff_op_1d_matrix( *f = diff * idx; } + let prev = prev.slice(ndarray::s![nx - N..]); for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { let diff = bl .iter() diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 57afd35..db5802f 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -240,8 +240,5 @@ fn block_equality() { let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); flipped_inverted *= -1.0; - assert!(flipped_inverted - .iter() - .zip(SBP4::BLOCKEND_MATRIX.iter()) - .all(|(x, y)| (x - y).abs() < 1e-3)) + approx::assert_ulps_eq!(SBP4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); } diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 88631b3..48891f8 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -221,8 +221,5 @@ fn block_equality() { let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); flipped_inverted *= -1.0; - assert!(flipped_inverted - .iter() - .zip(SBP8::BLOCKEND_MATRIX.iter()) - .all(|(x, y)| (x - y).abs() < 1e-12)) + approx::assert_ulps_eq!(SBP8::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); } diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index 8e577b6..66b9852 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d, UpwindOperator1d, + UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis}; @@ -174,12 +175,30 @@ impl Upwind4 { -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 ]; #[rustfmt::skip] + const DIAG_MATRIX : RowVector = RowVector::new([ + [-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0] + ]); + #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], &[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], &[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], &[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]; + #[rustfmt::skip] + const BLOCK_MATRIX: Matrix = Matrix::new([ + [ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], + [-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], + [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], + [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], + ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: Matrix = Matrix::new([ + [ -6.0 / 149.0, 36.0 / 149.0, -126.0 / 149.0, 0.0, 227.0 / 298.0, -16.0 / 149.0, -3.0 / 298.0], + [ 0.0, -2.0/41.0, 12.0/41.0, -227.0/246.0, 0.0, 69.0/82.0, -20.0/123.0], + [ 0.0, 0.0, -2.0/61.0, 16.0/183.0, -69.0/122.0, 0.0, 187.0/366.0], + [ 0.0, 0.0, 0.0, 3.0 / 98.0, 20.0 / 49.0, -187.0 
/ 98.0, 72.0/49.0], + ]); #[rustfmt::skip] const DISS_BLOCK: &'static [&'static [Float]] = &[ @@ -197,10 +216,10 @@ impl Upwind4 { impl SbpOperator1d for Upwind4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -229,18 +248,29 @@ impl SbpOperator1d for Upwind4 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &Upwind4::BLOCK_MATRIX, + &Upwind4::BLOCKEND_MATRIX, + &Upwind4::DIAG_MATRIX, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for Upwind4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row( - Upwind4::BLOCK, - Upwind4::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - )(prev, fut), + ([_, 1], [_, 1]) => diff_op_row_local(prev, fut), ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { diff_simd_col(prev, fut) } @@ -465,3 +495,11 @@ fn upwind4_test2() { 1e-1, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = Upwind4::BLOCK_MATRIX.flip(); + flipped_inverted *= -1.0; + + approx::assert_ulps_eq!(Upwind4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); +} From c73c6e7407ea045bb19d246e42b61085aa83088e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 15:40:28 +0100 Subject: [PATCH 20/35] diff_op_col_naive_matrix --- sbp/src/operators/algos.rs | 76 +++++++++++++++++++++++++++++++ sbp/src/operators/traditional4.rs | 2 +- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a504b4c..b32425c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -572,6 +572,82 @@ pub(crate) fn diff_op_col_naive( } } +#[inline(always)] +#[allow(unused)] +pub(crate) fn diff_op_col_naive_matrix( + block: &Matrix, + blockend: &Matrix, + diag: &RowVector, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + let ny = prev.shape()[0]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + fut.fill(0.0); + + let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut(( + ndarray::s![.., ..M], + ndarray::s![.., M..nx - M], + ndarray::s![.., nx - M..], + )); + + // First block + for (bl, mut fut) in block.iter_rows().zip(fut0.axis_iter_mut(ndarray::Axis(1))) { + debug_assert_eq!(fut.len(), prev.shape()[0]); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl == 0.0 { + continue; + } + debug_assert_eq!(prev.len(), fut.len()); + fut.scaled_add(idx * bl, &prev); + } + } + + let window_elems_to_skip = M - ((D - 1) / 2); + + // Diagonal entries + for (mut fut, id) in futmid + .axis_iter_mut(ndarray::Axis(1)) + .zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip)) + { + for (&d, id) in 
diag.iter().zip(id.axis_iter(ndarray::Axis(1))) { + if d == 0.0 { + continue; + } + fut.scaled_add(idx * d, &id) + } + } + + // End block + let prev = prev.slice(ndarray::s!(.., nx - N..)); + for (bl, mut fut) in blockend + .iter_rows() + .zip(futn.axis_iter_mut(ndarray::Axis(1))) + { + fut.fill(0.0); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl == 0.0 { + continue; + } + fut.scaled_add(idx * bl, &prev); + } + } +} + #[inline(always)] pub(crate) fn diff_op_col( block: &'static [&'static [Float]], diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index db5802f..f480cda 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -106,7 +106,7 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi } fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { let optype = super::OperatorType::Normal; - super::diff_op_col_matrix( + super::diff_op_col_naive_matrix( &SBP4::BLOCK_MATRIX, &SBP4::BLOCKEND_MATRIX, &SBP4::DIAG_MATRIX, From 1f3aa2c11660698bcd251c5c299fa70d23332fec Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:01:20 +0100 Subject: [PATCH 21/35] make core-intrinsics cfg'ed --- sbp/src/lib.rs | 2 +- sbp/src/operators/algos.rs | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 1d09854..62cb872 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(core_intrinsics)] +#![cfg_attr(feature = "fast-float", feature(core_intrinsics))] #![feature(array_windows)] #![feature(array_chunks)] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index b32425c..11b9923 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -353,6 +353,12 @@ mod fastfloat { Self(f) } } + impl From for Float { + #[inline(always)] + fn from(f: FastFloat) -> Self { + f.0 + } + } } #[cfg(feature = "fast-float")] use fastfloat::FastFloat; @@ -838,14 +844,17 @@ pub(crate) fn diff_op_col_simd( } #[inline(always)] -fn product_fast<'a>( - u: impl Iterator, - v: impl Iterator, -) -> Float { - use std::intrinsics::{fadd_fast, fmul_fast}; - u.zip(v).fold(0.0, |acc, (&u, &v)| unsafe { - // We do not care about the order of multiplication nor addition - fadd_fast(acc, fmul_fast(u, v)) +fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { + u.zip(v).fold(0.0, |acc, (&u, &v)| { + #[cfg(feature = "fast-float")] + unsafe { + // We do not care about the order of multiplication nor addition + (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() + } + #[cfg(not(feature = "fast-float"))] + { + acc + u * v + } }) } @@ -1043,7 +1052,7 @@ pub(crate) fn diff_op_row( assert!(prev.len() >= 2 * block.len()); for (bl, f) in block.iter().zip(fut.iter_mut()) { - let diff = product_fast(bl.iter(), prev[..bl.len()].iter()); + let diff = dotproduct(bl.iter(), prev[..bl.len()].iter()); *f = diff * idx; } @@ -1057,12 +1066,12 @@ pub(crate) fn diff_op_row( .zip(fut.iter_mut().skip(block.len())) .take(nx - 2 * block.len()) { - let diff = product_fast(diag.iter(), window.iter()); + let diff = dotproduct(diag.iter(), window.iter()); *f = diff * idx; } for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = product_fast(bl.iter(), prev.iter().rev()); + let diff = dotproduct(bl.iter(), prev.iter().rev()); *f = idx * if symmetry == Symmetry::Symmetric { From f7f8a7ffff09250142c554104f0d788d78026496 Mon Sep 17 00:00:00 
2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:37:59 +0100 Subject: [PATCH 22/35] make flip const functions --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 85 +++++++++++++++++++++++++------ sbp/src/operators/traditional4.rs | 17 +------ sbp/src/operators/traditional8.rs | 21 +------- sbp/src/operators/upwind4.rs | 17 +------ 5 files changed, 77 insertions(+), 64 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 62cb872..52a1afa 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(feature = "fast-float", feature(core_intrinsics))] #![feature(array_windows)] #![feature(array_chunks)] +#![feature(const_fn_floating_point_arithmetic)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 11b9923..7fe8dd6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -6,7 +6,7 @@ pub(crate) mod constmatrix { #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(C)] pub struct Matrix { - data: [[T; N]; M], + pub data: [[T; N]; M], } pub type RowVector = Matrix; pub type ColVector = Matrix; @@ -75,19 +75,6 @@ pub(crate) mod constmatrix { ) -> impl ExactSizeIterator + DoubleEndedIterator { self.data.iter() } - - pub fn flip(&self) -> Self - where - T: Default + Copy, - { - let mut v = Self::default(); - for i in 0..M { - for j in 0..N { - v[(i, j)] = self[(M - 1 - i, N - 1 - j)] - } - } - v - } } impl ColVector { @@ -244,9 +231,77 @@ pub(crate) mod constmatrix { } } } + + pub(crate) const fn flip_ud( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M / 2 { + let tmp = m.data[i]; + m.data[i] = m.data[M - 1 - i]; + m.data[M - 1 - i] = tmp; + i += 1; + } + m + } + + pub(crate) const fn flip_lr( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N / 2 { + let tmp = m.data[i][j]; + m.data[i][j] = m.data[i][N - 1 - j]; + m.data[i][N - 1 - j] = tmp; + j += 1; + } + i += 1; + } + m + } + + /// Flip all sign bits + pub(crate) const fn flip_sign( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N { + m.data[i][j] = -m.data[i][j]; + j += 1; + } + i += 1; + } + m + } + mod flipping { + use super::*; + + #[test] + fn flip_lr_test() { + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); + } + #[test] + fn flip_ud_test() { + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); + } + } } -pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; +pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; #[inline(always)] pub(crate) fn diff_op_1d_matrix( diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index f480cda..417b799 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -32,13 +32,8 @@ impl SBP4 { [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: super::Matrix = 
super::Matrix::new([ - [4.0/49.0, -32.0/49.0, 0.0, 59.0/98.0, 0.0, -3.0/98.0 ], - [0.0, 4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], - [0.0, 0.0, 0.0, -1.0/2.0, 0.0, 1.0/2.0], - [0.0, 0.0, 3.0/34.0, 4.0/17.0, -59.0/34.0, 24.0/17.0], - ]); + const BLOCKEND_MATRIX: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -234,11 +229,3 @@ fn test_trad4() { 1e-1, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(SBP4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 48891f8..32b392c 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -40,17 +40,8 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: Matrix = Matrix::new([ - [0.0035384016230155199, -0.037742950645498902, 0.19815049088886899, -0.79260196355547696, -0.0, 0.75166824372712304, -0.11941174319355199, 0.012224423573111201, -0.035091936128702303, 0.0142810403117803, 0.010833666120933699, -0.0058476727216045096], - [-0.0, 0.0038681313433548601, -0.041260067662451799, 0.21661535522787201, -0.821713248844682, -0.0, 0.75587358166357999, -0.046018121340651898, -0.076822925360096897, -0.050113507756358401, 0.087129564256063705, -0.027558761526617698], - [-0.0, -0.0, 0.0027934857464329702, -0.029797181295285, 0.094272792663829805, -0.54587651996612696, -0.0, 0.205756876210586, 0.39561116774840099, 0.020799982439143602, -0.22287532317150199, 0.079314719624520302], - [-0.0, -0.0, -0.0, 0.0086536439119011006, -0.029896495621603899, 0.102950081118829, -0.63739245543855805, -0.0, 0.54336288636530095, 0.085109057027750595, -0.090998664615454999, 0.01821194725190089], - [-0.0, -0.0, -0.0, -0.0, 0.019698131073842998, 0.039447042394264102, -0.28128521251909999, -0.124714160903055, -0.0, 0.055770702144577897, 0.41917213003600801, -0.128088632226564], - [-0.0, -0.0, -0.0, -0.0, -0.055988255885229599, 0.179720579323281, -0.10329058280084499, -0.13643348652854601, -0.389516189693211, -0.0, 0.54432974445498405, -0.038821808870425301], - [-0.0, -0.0, -0.0, -0.0, -0.0071669648308011497, -0.052726783847581303, 0.18675940343293501, 0.024615176293723499, -0.49400862680798402, -0.091851192507295606, -0.0, 0.43437898826698501], - [-0.0, -0.0, -0.0, -0.0, 0.020015058331576099, 0.086285816263333495, -0.34386522738887298, -0.0254881486107905, 0.78102816812674902, 0.033893192260150001, -2.2474124634140402, 1.6955436044319001], - ]); + const BLOCKEND_MATRIX: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); } impl SbpOperator1d for SBP8 { @@ -215,11 +206,3 @@ fn test_trad8() { 1e-4, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(SBP8::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} 
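The `block_equality` tests removed above checked at runtime that the end block equals the start block flipped left-right, flipped up-down, and negated; with the const `flip_lr`/`flip_ud`/`flip_sign` functions that relation is now encoded directly in the `BLOCKEND_MATRIX` definitions. Below is a minimal standalone sketch of the same relation, not part of the patch: it uses plain nested `f64` arrays instead of the crate's `Matrix` type, and the function names merely mirror the ones introduced here.

// Standalone illustration (hypothetical helpers, not the crate's API):
// reversing columns, then rows, then signs turns the start block of an
// SBP operator into its end block, which is what
// `flip_sign(flip_ud(flip_lr(BLOCK_MATRIX)))` computes at compile time.
fn flip_lr<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    for row in m.iter_mut() {
        row.reverse(); // mirror each row left-right
    }
    m
}

fn flip_ud<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    m.reverse(); // mirror the rows top-bottom
    m
}

fn flip_sign<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    for row in m.iter_mut() {
        for x in row.iter_mut() {
            *x = -*x; // negate every entry
        }
    }
    m
}

fn main() {
    let start = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
    let end = flip_sign(flip_ud(flip_lr(start)));
    assert_eq!(end, [[-6.0, -5.0, -4.0], [-3.0, -2.0, -1.0]]);
}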
diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index 66b9852..d2a4eb4 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -192,13 +192,8 @@ impl Upwind4 { [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: Matrix = Matrix::new([ - [ -6.0 / 149.0, 36.0 / 149.0, -126.0 / 149.0, 0.0, 227.0 / 298.0, -16.0 / 149.0, -3.0 / 298.0], - [ 0.0, -2.0/41.0, 12.0/41.0, -227.0/246.0, 0.0, 69.0/82.0, -20.0/123.0], - [ 0.0, 0.0, -2.0/61.0, 16.0/183.0, -69.0/122.0, 0.0, 187.0/366.0], - [ 0.0, 0.0, 0.0, 3.0 / 98.0, 20.0 / 49.0, -187.0 / 98.0, 72.0/49.0], - ]); + const BLOCKEND_MATRIX: Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); #[rustfmt::skip] const DISS_BLOCK: &'static [&'static [Float]] = &[ @@ -495,11 +490,3 @@ fn upwind4_test2() { 1e-1, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = Upwind4::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(Upwind4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} From 31ac46e3868f35138afb0d4caeff5b9b843c063a Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:59:59 +0100 Subject: [PATCH 23/35] move data structs into separate files --- sbp/src/operators/algos.rs | 367 +------------------------ sbp/src/operators/algos/constmatrix.rs | 298 ++++++++++++++++++++ sbp/src/operators/algos/fastfloat.rs | 53 ++++ 3 files changed, 358 insertions(+), 360 deletions(-) create mode 100644 sbp/src/operators/algos/constmatrix.rs create mode 100644 sbp/src/operators/algos/fastfloat.rs diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 7fe8dd6..53d2c63 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,308 +1,13 @@ use super::*; -pub(crate) mod constmatrix { - #![allow(unused)] - /// A row-major matrix - #[derive(Debug, Clone, PartialEq, Eq, Hash)] - #[repr(C)] - pub struct Matrix { - pub data: [[T; N]; M], - } - pub type RowVector = Matrix; - pub type ColVector = Matrix; - - impl Default for Matrix { - fn default() -> Self { - Self { - data: [[T::default(); N]; M], - } - } - } - - impl core::ops::Index<(usize, usize)> for Matrix { - type Output = T; - #[inline(always)] - fn index(&self, (i, j): (usize, usize)) -> &Self::Output { - &self.data[i][j] - } - } - - impl core::ops::IndexMut<(usize, usize)> for Matrix { - #[inline(always)] - fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { - &mut self.data[i][j] - } - } - - impl core::ops::Index for Matrix { - type Output = [T; N]; - #[inline(always)] - fn index(&self, i: usize) -> &Self::Output { - &self.data[i] - } - } - - impl core::ops::IndexMut for Matrix { - #[inline(always)] - fn index_mut(&mut self, i: usize) -> &mut Self::Output { - &mut self.data[i] - } - } - - impl Matrix { - pub const fn new(data: [[T; N]; M]) -> Self { - Self { data } - } - #[inline(always)] - pub const fn nrows(&self) -> usize { - M - } - #[inline(always)] - pub const fn ncols(&self) -> usize { - N - } - #[inline(always)] - pub fn iter(&self) -> impl Iterator { - self.data.iter().flatten() - } - #[inline(always)] - pub fn iter_mut(&mut self) -> impl Iterator { - self.data.iter_mut().flatten() - } - #[inline(always)] - pub fn iter_rows( - &self, - ) -> impl ExactSizeIterator + DoubleEndedIterator { - self.data.iter() - } - } - - impl ColVector { - 
#[inline(always)] - pub fn map_to_col(slice: &[T; N]) -> &ColVector { - unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } - } - #[inline(always)] - pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { - unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } - } - } - - impl RowVector { - pub fn map_to_row(slice: &[T; N]) -> &Self { - unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } - } - pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { - unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } - } - } - - impl Matrix { - #[inline(always)] - pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) - where - T: Default + Copy + core::ops::Mul + core::ops::Add, - { - for i in 0..M { - for j in 0..P { - let mut t = T::default(); - for k in 0..N { - t = t + lhs[(i, k)] * rhs[(k, j)]; - } - self[(i, j)] = t; - } - } - } - } - - macro_rules! impl_op_mul_mul { - ($lhs:ty, $rhs:ty) => { - impl core::ops::Mul<$rhs> for $lhs - where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - type Output = Matrix; - fn mul(self, rhs: $rhs) -> Self::Output { - let mut out = Matrix::default(); - out.matmul_into(&self, &rhs); - out - } - } - }; - } - - impl_op_mul_mul! { Matrix, Matrix } - impl_op_mul_mul! { &Matrix, Matrix } - impl_op_mul_mul! { Matrix, &Matrix } - impl_op_mul_mul! { &Matrix, &Matrix } - - impl core::ops::MulAssign for Matrix - where - T: Copy + core::ops::MulAssign, - { - #[inline(always)] - fn mul_assign(&mut self, other: T) { - self.iter_mut().for_each(|x| *x *= other) - } - } - - #[cfg(test)] - mod tests { - use super::{super::*, *}; - #[test] - fn construct_copy_type() { - let _m0 = Matrix::::default(); - let _m1: Matrix = Matrix::default(); - - let _m2 = Matrix::new([[1, 2], [3, 4]]); - } - - #[test] - fn matmul() { - let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); - let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); - - let m3 = m1 * m2; - assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); - } - #[test] - fn iter() { - let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); - let mut iter = m.iter(); - assert_eq!(iter.next(), Some(&1)); - assert_eq!(iter.next(), Some(&2)); - assert_eq!(iter.next(), Some(&3)); - assert_eq!(iter.next(), Some(&4)); - assert_eq!(iter.next(), Some(&5)); - assert_eq!(iter.next(), Some(&6)); - assert_eq!(iter.next(), None); - } - } - - mod approx { - use super::Matrix; - use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; - - impl AbsDiffEq for Matrix - where - T: AbsDiffEq, - { - type Epsilon = T::Epsilon; - fn default_epsilon() -> Self::Epsilon { - T::default_epsilon() - } - fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) - } - } - impl RelativeEq for Matrix - where - T: RelativeEq, - Self::Epsilon: Copy, - { - fn default_max_relative() -> Self::Epsilon { - T::default_max_relative() - } - fn relative_eq( - &self, - other: &Self, - epsilon: Self::Epsilon, - max_relative: Self::Epsilon, - ) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) - } - } - impl UlpsEq for Matrix - where - T: UlpsEq, - Self::Epsilon: Copy, - { - fn default_max_ulps() -> u32 { - T::default_max_ulps() - } - fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) - } - } - } - - pub(crate) const fn flip_ud( - mut m: Matrix, - ) -> 
Matrix { - let mut i = 0; - while i < M / 2 { - let tmp = m.data[i]; - m.data[i] = m.data[M - 1 - i]; - m.data[M - 1 - i] = tmp; - i += 1; - } - m - } - - pub(crate) const fn flip_lr( - mut m: Matrix, - ) -> Matrix { - let mut i = 0; - while i < M { - let mut j = 0; - while j < N / 2 { - let tmp = m.data[i][j]; - m.data[i][j] = m.data[i][N - 1 - j]; - m.data[i][N - 1 - j] = tmp; - j += 1; - } - i += 1; - } - m - } - - /// Flip all sign bits - pub(crate) const fn flip_sign( - mut m: Matrix, - ) -> Matrix { - let mut i = 0; - while i < M { - let mut j = 0; - while j < N { - m.data[i][j] = -m.data[i][j]; - j += 1; - } - i += 1; - } - m - } - mod flipping { - use super::*; - - #[test] - fn flip_lr_test() { - let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); - let flipped = flip_lr(m); - assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); - let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); - let flipped = flip_lr(m); - assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); - } - #[test] - fn flip_ud_test() { - let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); - let flipped = flip_ud(m); - assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); - let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); - let flipped = flip_ud(m); - assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); - } - } -} - +pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; +#[cfg(feature = "fast-float")] +mod fastfloat; +#[cfg(feature = "fast-float")] +use fastfloat::FastFloat; + #[inline(always)] pub(crate) fn diff_op_1d_matrix( block: &Matrix, @@ -360,64 +65,6 @@ pub(crate) fn diff_op_1d_matrix( } } -#[cfg(feature = "fast-float")] -mod fastfloat { - use super::*; - #[repr(transparent)] - #[derive(Copy, Clone, Debug, PartialEq, Default)] - pub(crate) struct FastFloat(Float); - - use core::intrinsics::{fadd_fast, fmul_fast}; - - impl core::ops::Mul for FastFloat { - type Output = Self; - #[inline(always)] - fn mul(self, o: Self) -> Self::Output { - unsafe { Self(fmul_fast(self.0, o.0)) } - } - } - - impl core::ops::Add for FastFloat { - type Output = Self; - #[inline(always)] - fn add(self, o: FastFloat) -> Self::Output { - unsafe { Self(fadd_fast(self.0, o.0)) } - } - } - - impl core::ops::MulAssign for FastFloat { - #[inline(always)] - fn mul_assign(&mut self, o: FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::Mul for &FastFloat { - type Output = FastFloat; - #[inline(always)] - fn mul(self, o: Self) -> Self::Output { - unsafe { FastFloat(fmul_fast(self.0, o.0)) } - } - } - - impl From for FastFloat { - #[inline(always)] - fn from(f: Float) -> Self { - Self(f) - } - } - impl From for Float { - #[inline(always)] - fn from(f: FastFloat) -> Self { - f.0 - } - } -} -#[cfg(feature = "fast-float")] -use fastfloat::FastFloat; - #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, @@ -902,7 +549,7 @@ pub(crate) fn diff_op_col_simd( fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { u.zip(v).fold(0.0, |acc, (&u, &v)| { #[cfg(feature = "fast-float")] - unsafe { + { // We do not care about the order of multiplication nor addition (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() } diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs new file mode 100644 index 0000000..6d66fea --- /dev/null +++ b/sbp/src/operators/algos/constmatrix.rs @@ -0,0 +1,298 @@ +#![allow(unused)] +/// A row-major 
matrix +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[repr(C)] +pub struct Matrix { + pub data: [[T; N]; M], +} +pub type RowVector = Matrix; +pub type ColVector = Matrix; + +impl Default for Matrix { + fn default() -> Self { + Self { + data: [[T::default(); N]; M], + } + } +} + +impl core::ops::Index<(usize, usize)> for Matrix { + type Output = T; + #[inline(always)] + fn index(&self, (i, j): (usize, usize)) -> &Self::Output { + &self.data[i][j] + } +} + +impl core::ops::IndexMut<(usize, usize)> for Matrix { + #[inline(always)] + fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { + &mut self.data[i][j] + } +} + +impl core::ops::Index for Matrix { + type Output = [T; N]; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.data[i] + } +} + +impl core::ops::IndexMut for Matrix { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.data[i] + } +} + +impl Matrix { + pub const fn new(data: [[T; N]; M]) -> Self { + Self { data } + } + #[inline(always)] + pub const fn nrows(&self) -> usize { + M + } + #[inline(always)] + pub const fn ncols(&self) -> usize { + N + } + #[inline(always)] + pub fn iter(&self) -> impl Iterator { + self.data.iter().flatten() + } + #[inline(always)] + pub fn iter_mut(&mut self) -> impl Iterator { + self.data.iter_mut().flatten() + } + #[inline(always)] + pub fn iter_rows( + &self, + ) -> impl ExactSizeIterator + DoubleEndedIterator { + self.data.iter() + } +} + +impl ColVector { + #[inline(always)] + pub fn map_to_col(slice: &[T; N]) -> &ColVector { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + #[inline(always)] + pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } +} + +impl RowVector { + pub fn map_to_row(slice: &[T; N]) -> &Self { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } +} + +impl Matrix { + #[inline(always)] + pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) + where + T: Default + Copy + core::ops::Mul + core::ops::Add, + { + for i in 0..M { + for j in 0..P { + let mut t = T::default(); + for k in 0..N { + t = t + lhs[(i, k)] * rhs[(k, j)]; + } + self[(i, j)] = t; + } + } + } +} + +macro_rules! impl_op_mul_mul { + ($lhs:ty, $rhs:ty) => { + impl core::ops::Mul<$rhs> for $lhs + where + T: Copy + Default + core::ops::Add + core::ops::Mul, + { + type Output = Matrix; + fn mul(self, rhs: $rhs) -> Self::Output { + let mut out = Matrix::default(); + out.matmul_into(&self, &rhs); + out + } + } + }; +} + +impl_op_mul_mul! { Matrix, Matrix } +impl_op_mul_mul! { &Matrix, Matrix } +impl_op_mul_mul! { Matrix, &Matrix } +impl_op_mul_mul! 
{ &Matrix, &Matrix } + +impl core::ops::MulAssign for Matrix +where + T: Copy + core::ops::MulAssign, +{ + #[inline(always)] + fn mul_assign(&mut self, other: T) { + self.iter_mut().for_each(|x| *x *= other) + } +} + +#[cfg(test)] +mod tests { + use super::{super::*, *}; + #[test] + fn construct_copy_type() { + let _m0 = Matrix::::default(); + let _m1: Matrix = Matrix::default(); + + let _m2 = Matrix::new([[1, 2], [3, 4]]); + } + + #[test] + fn matmul() { + let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); + + let m3 = m1 * m2; + assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); + } + #[test] + fn iter() { + let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let mut iter = m.iter(); + assert_eq!(iter.next(), Some(&1)); + assert_eq!(iter.next(), Some(&2)); + assert_eq!(iter.next(), Some(&3)); + assert_eq!(iter.next(), Some(&4)); + assert_eq!(iter.next(), Some(&5)); + assert_eq!(iter.next(), Some(&6)); + assert_eq!(iter.next(), None); + } +} + +mod approx { + use super::Matrix; + use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; + + impl AbsDiffEq for Matrix + where + T: AbsDiffEq, + { + type Epsilon = T::Epsilon; + fn default_epsilon() -> Self::Epsilon { + T::default_epsilon() + } + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) + } + } + impl RelativeEq for Matrix + where + T: RelativeEq, + Self::Epsilon: Copy, + { + fn default_max_relative() -> Self::Epsilon { + T::default_max_relative() + } + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) + } + } + impl UlpsEq for Matrix + where + T: UlpsEq, + Self::Epsilon: Copy, + { + fn default_max_ulps() -> u32 { + T::default_max_ulps() + } + fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) + } + } +} + +pub(crate) const fn flip_ud( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M / 2 { + let tmp = m.data[i]; + m.data[i] = m.data[M - 1 - i]; + m.data[M - 1 - i] = tmp; + i += 1; + } + m +} + +pub(crate) const fn flip_lr( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N / 2 { + let tmp = m.data[i][j]; + m.data[i][j] = m.data[i][N - 1 - j]; + m.data[i][N - 1 - j] = tmp; + j += 1; + } + i += 1; + } + m +} + +/// Flip all sign bits +pub(crate) const fn flip_sign( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N { + m.data[i][j] = -m.data[i][j]; + j += 1; + } + i += 1; + } + m +} +mod flipping { + use super::*; + + #[test] + fn flip_lr_test() { + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); + } + #[test] + fn flip_ud_test() { + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); + } +} diff --git 
a/sbp/src/operators/algos/fastfloat.rs b/sbp/src/operators/algos/fastfloat.rs new file mode 100644 index 0000000..40d6633 --- /dev/null +++ b/sbp/src/operators/algos/fastfloat.rs @@ -0,0 +1,53 @@ +use super::*; + +#[repr(transparent)] +#[derive(Copy, Clone, Debug, PartialEq, Default)] +pub(crate) struct FastFloat(Float); + +use core::intrinsics::{fadd_fast, fmul_fast}; + +impl core::ops::Mul for FastFloat { + type Output = Self; + #[inline(always)] + fn mul(self, o: Self) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o.0)) } + } +} + +impl core::ops::Add for FastFloat { + type Output = Self; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o.0)) } + } +} + +impl core::ops::MulAssign for FastFloat { + #[inline(always)] + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } +} + +impl core::ops::Mul for &FastFloat { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: Self) -> Self::Output { + unsafe { FastFloat(fmul_fast(self.0, o.0)) } + } +} + +impl From for FastFloat { + #[inline(always)] + fn from(f: Float) -> Self { + Self(f) + } +} +impl From for Float { + #[inline(always)] + fn from(f: FastFloat) -> Self { + f.0 + } +} From 6f7268bf339abebb83d3bb2e1419405fefff7f10 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:08:55 +0100 Subject: [PATCH 24/35] use matrices everywhere --- sbp/src/operators.rs | 8 +- sbp/src/operators/algos.rs | 628 +++++++------------------ sbp/src/operators/algos/constmatrix.rs | 4 + sbp/src/operators/traditional4.rs | 146 ++---- sbp/src/operators/traditional8.rs | 116 +---- sbp/src/operators/upwind4.rs | 326 ++----------- sbp/src/operators/upwind4h2.rs | 149 ++---- sbp/src/operators/upwind9.rs | 154 ++---- sbp/src/operators/upwind9h2.rs | 163 ++----- 9 files changed, 405 insertions(+), 1289 deletions(-) diff --git a/sbp/src/operators.rs b/sbp/src/operators.rs index 8f8598f..63580f9 100644 --- a/sbp/src/operators.rs +++ b/sbp/src/operators.rs @@ -78,7 +78,9 @@ pub trait SbpOperator2d: Send + Sync { } fn op_xi(&self) -> &dyn SbpOperator1d; - fn op_eta(&self) -> &dyn SbpOperator1d; + fn op_eta(&self) -> &dyn SbpOperator1d { + self.op_xi() + } fn upwind(&self) -> Option> { None @@ -98,7 +100,9 @@ pub trait UpwindOperator2d: Send + Sync { } fn op_xi(&self) -> &dyn UpwindOperator1d; - fn op_eta(&self) -> &dyn UpwindOperator1d; + fn op_eta(&self) -> &dyn UpwindOperator1d { + self.op_xi() + } } pub trait InterpolationOperator: Send + Sync { diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 53d2c63..5a608a6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,4 +1,5 @@ use super::*; +use ndarray::s; pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; @@ -8,11 +9,55 @@ mod fastfloat; #[cfg(feature = "fast-float")] use fastfloat::FastFloat; +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct DiagonalMatrix { + pub start: [Float; B], + pub diag: Float, + pub end: [Float; B], +} + +impl DiagonalMatrix { + pub const fn new(block: [Float; B]) -> Self { + let start = block; + let diag = 1.0; + let mut end = block; + let mut i = 0; + while i < B { + end[i] = block[B - 1 - i]; + i += 1; + } + Self { start, diag, end } + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct BlockMatrix { + pub start: Matrix, + pub diag: RowVector, + pub end: Matrix, +} + +impl BlockMatrix { + pub const fn new( + start: Matrix, + diag: 
RowVector, + end: Matrix, + ) -> Self { + Self { start, diag, end } + } +} + +#[derive(PartialEq, Copy, Clone)] +pub(crate) enum OperatorType { + Normal, + H2, + // TODO: D2 +} + #[inline(always)] -pub(crate) fn diff_op_1d_matrix( - block: &Matrix, - blockend: &Matrix, - diag: &RowVector, +/// Works on all 1d vectors +pub(crate) fn diff_op_1d_fallback( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -29,12 +74,11 @@ pub(crate) fn diff_op_1d_matrix( }; let idx = 1.0 / dx; - for (bl, f) in block.iter_rows().zip(&mut fut) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); + let (futstart, futmid, futend) = + fut.multi_slice_mut((s![..M], s![M..nx - 2 * M], s![nx - 2 * M..])); + + for (bl, f) in matrix.start.iter_rows().zip(futstart) { + let diff = dotproduct(bl.iter(), prev.iter()); *f = diff * idx; } @@ -46,42 +90,33 @@ pub(crate) fn diff_op_1d_matrix( .windows(D) .into_iter() .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(M)) - .take(nx - 2 * M) + .zip(futmid) { - let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); + let diff = dotproduct(matrix.diag.row(0), window); *f = diff * idx; } let prev = prev.slice(ndarray::s![nx - N..]); - for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); - + for (bl, f) in matrix.end.iter_rows().zip(futend) { + let diff = dotproduct(bl, prev); *f = diff * idx; } } #[inline(always)] -pub(crate) fn diff_op_1d_slice_matrix( - block: &Matrix, - endblock: &Matrix, - diag: &RowVector, +/// diff op in 1d for slices +pub(crate) fn diff_op_1d_slice( + matrix: &BlockMatrix, optype: OperatorType, prev: &[Float], fut: &mut [Float], ) { #[cfg(feature = "fast-float")] - let (block, endblock, diag, prev, fut) = { + let (matrix, prev, fut) = { use std::mem::transmute; unsafe { ( - transmute::<_, &Matrix>(block), - transmute::<_, &Matrix>(endblock), - transmute::<_, &RowVector>(diag), + transmute::<_, &BlockMatrix>(matrix), transmute::<_, &[FastFloat]>(prev), transmute::<_, &mut [FastFloat]>(fut), ) @@ -113,7 +148,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); - fut.matmul_into(block, prev); + fut.matmul_into(&matrix.start, prev); *fut *= idx; } @@ -129,7 +164,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - fut.matmul_into(diag, prev); + fut.matmul_into(&matrix.diag, prev); *fut *= idx; } @@ -138,154 +173,35 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); - fut.matmul_into(endblock, prev); + fut.matmul_into(&matrix.end, prev); *fut *= idx; } } #[inline(always)] -pub(crate) fn diff_op_1d( - block: &[&[Float]], - diag: &[Float], - symmetry: Symmetry, +/// Will always work on 1d, delegated based on slicedness +pub(crate) fn diff_op_1d( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, ) { assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[0]; - assert!(nx >= 2 * block.len()); + assert!(nx >= 2 * M); - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float + if let Some((prev, fut)) = prev.as_slice().zip(fut.as_slice_mut()) { + diff_op_1d_slice(matrix, optype, prev, fut) } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - for (bl, f) in 
block.iter().zip(&mut fut) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); - *f = diff * idx; - } - - // The window needs to be aligned to the diagonal elements, - // based on the block size - let window_elems_to_skip = block.len() - ((diag.len() - 1) / 2); - - for (window, f) in prev - .windows(diag.len()) - .into_iter() - .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(block.len())) - .take(nx - 2 * block.len()) - { - let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); - *f = diff * idx; - } - - for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = bl - .iter() - .zip(prev.iter().rev()) - .map(|(x, y)| x * y) - .sum::(); - - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; - } -} - -#[derive(PartialEq, Copy, Clone)] -pub(crate) enum Symmetry { - Symmetric, - AntiSymmetric, -} - -#[derive(PartialEq, Copy, Clone)] -pub(crate) enum OperatorType { - Normal, - H2, -} - -#[inline(always)] -#[allow(unused)] -pub(crate) fn diff_op_col_naive( - block: &'static [&'static [Float]], - diag: &'static [Float], - symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - fut.fill(0.0); - - // First block - for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1))) { - debug_assert_eq!(fut.len(), prev.shape()[0]); - for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { - debug_assert_eq!(prev.len(), fut.len()); - fut.scaled_add(idx * bl, &prev); - } - } - - let half_diag_width = (diag.len() - 1) / 2; - assert!(half_diag_width <= block.len()); - - // Diagonal entries - for (ifut, mut fut) in fut - .axis_iter_mut(ndarray::Axis(1)) - .enumerate() - .skip(block.len()) - .take(nx - 2 * block.len()) - { - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - fut.scaled_add(idx * d, &prev.slice(ndarray::s![.., offset])) - } - } - - // End block - for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1)).rev()) { - fut.fill(0.0); - for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1)).rev()) { - if symmetry == Symmetry::Symmetric { - fut.scaled_add(idx * bl, &prev); - } else { - fut.scaled_add(-idx * bl, &prev); - } - } - } + diff_op_1d_fallback(matrix, optype, prev, fut) } } #[inline(always)] #[allow(unused)] -pub(crate) fn diff_op_col_naive_matrix( - block: &Matrix, - blockend: &Matrix, - diag: &RowVector, +/// 2D diff fallback for when matrices are not slicable +pub(crate) fn diff_op_2d_fallback( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -295,9 +211,6 @@ pub(crate) fn diff_op_col_naive_matrix= 2 * M); - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - let dx = if optype == OperatorType::H2 { 1.0 / (nx - 2) as Float } else { @@ -314,7 +227,11 @@ pub(crate) fn diff_op_col_naive_matrix( + matrix: &BlockMatrix, optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - diff_op_col_simd(block, diag, symmetry, optype) -} - -#[inline(always)] -pub(crate) fn diff_op_col_simd( - block: &'static [&'static [Float]], - diag: &'static [Float], - 
symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - #[cfg(not(feature = "f32"))] - type SimdT = packed_simd::f64x8; - #[cfg(feature = "f32")] - type SimdT = packed_simd::f32x16; - - let ny = prev.shape()[0]; - // How many elements that can be simdified - let simdified = SimdT::lanes() * (ny / SimdT::lanes()); - - let half_diag_width = (diag.len() - 1) / 2; - assert!(half_diag_width <= block.len()); - - let fut_base_ptr = fut.as_mut_ptr(); - let fut_stride = fut.strides()[1]; - let fut_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } - }; - - let prev_base_ptr = prev.as_ptr(); - let prev_stride = prev.strides()[1]; - let prev_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } - }; - - // Not algo necessary, but gives performance increase - assert_eq!(fut_stride, prev_stride); - - // First block - { - for (ifut, &bl) in block.iter().enumerate() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - let mut f = SimdT::splat(0.0); - for (iprev, &bl) in bl.iter().enumerate() { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); - } - f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; - } - } - } - } - - // Diagonal elements - { - for ifut in block.len()..nx - block.len() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - let mut f = SimdT::splat(0.0); - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); - } - f *= idx; - unsafe { - // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - for j in simdified..ny { - let mut f = 0.0; - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); - } - } - unsafe { - *fut_ptr(j, ifut) = idx * f; - } - } - } - } - - // End block - { - // Get blocks and corresponding offsets - // (rev to iterate in ifut increasing order) - for (bl, ifut) in block.iter().zip((0..nx).rev()) { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - 
let mut f = SimdT::splat(0.0); - for (&bl, iprev) in bl.iter().zip((0..nx).rev()) { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); - } - f = if symmetry == Symmetry::Symmetric { - f * idx - } else { - -f * idx - }; - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (&bl, iprev) in bl.iter().zip((0..nx).rev()).rev() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = if symmetry == Symmetry::Symmetric { - f * idx - } else { - -f * idx - }; - } - } - } - } + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + for (prev, mut fut) in prev.outer_iter().zip(fut.outer_iter_mut()) { + let prev = &prev.as_slice().unwrap()[..nx]; + let fut = &mut fut.as_slice_mut().unwrap()[..nx]; + diff_op_1d_slice(matrix, optype, prev, fut) } } #[inline(always)] -fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { - u.zip(v).fold(0.0, |acc, (&u, &v)| { - #[cfg(feature = "fast-float")] - { - // We do not care about the order of multiplication nor addition - (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() - } - #[cfg(not(feature = "fast-float"))] - { - acc + u * v - } - }) +/// Dispatch based on strides +pub(crate) fn diff_op_2d( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + match (prev.strides(), fut.strides()) { + ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), + _ => diff_op_2d_fallback(matrix, optype, prev, fut), + } } +/* #[inline(always)] +/// Way to too much overhead with SIMD: +/// output SIMD oriented: +/// |S | = |P0 P1| |P0 P1| +/// |S | = a1|P0 P1| + b1|P0 P1| +/// |S | = |P0 P1| |P0 P1| +/// +/// | S | = |P0 P1| |P0 P1| +/// | S | = a2|P0 P1| + b1|P0 P1| +/// | S | = |P0 P1| |P0 P1| pub(crate) fn diff_op_col_matrix( - block: &Matrix, - block2: &Matrix, - diag: &RowVector, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, fut: ArrayViewMut2, @@ -615,7 +367,7 @@ pub(crate) fn diff_op_col_matrix { let prev = prev.slice(ndarray::s![.., ..N]); let (prevb, prevl) = prev.split_at(Axis(0), simdified); - for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(block.iter_rows()) { + for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(matrix.start.iter_rows()) { let fut = fut.as_slice_mut().unwrap(); let fut = &mut fut[..ny]; @@ -662,7 +414,7 @@ pub(crate) fn diff_op_col_matrix for (fut, prev) in fut.by_ref().zip(prev) { let mut f = SimdT::splat(0.0); - for (&d, prev) in diag.iter().zip(prev.axis_iter(Axis(1))) { + for (&d, prev) in matrix.diag.iter().zip(prev.axis_iter(Axis(1))) { let prev = prev.to_slice().unwrap(); let prev = SimdT::from_slice_unaligned(prev); f = prev.mul_adde(SimdT::splat(d), f); @@ -677,7 +429,7 @@ pub(crate) fn diff_op_col_matrix .zip(prevl.axis_iter(Axis(0))) { let mut f = 0.0; - for (&d, prev) in diag.iter().zip(prev) { + for (&d, prev) in matrix.diag.iter().zip(prev) { f += d * prev; } *fut = idx * f; @@ -687,7 +439,7 @@ pub(crate) fn diff_op_col_matrix // End block { - for (mut fut, &bl) in fut2.axis_iter_mut(Axis(1)).zip(block2.iter_rows()) { + for (mut fut, &bl) in fut2.axis_iter_mut(Axis(1)).zip(matrix.end.iter_rows()) { let fut = fut.as_slice_mut().unwrap(); let fut = &mut fut[..ny]; let mut fut = fut.chunks_exact_mut(SimdT::lanes()); @@ -720,94 +472,51 @@ pub(crate) fn diff_op_col_matrix } } } +*/ 
#[inline(always)] -pub(crate) fn diff_op_row( - block: &'static [&'static [Float]], - diag: &'static [Float], - symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[1], 1); - assert_eq!(fut.strides()[1], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - for (prev, mut fut) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) +fn dotproduct<'a>( + u: impl IntoIterator, + v: impl IntoIterator, +) -> Float { + u.into_iter().zip(v.into_iter()).fold(0.0, |acc, (&u, &v)| { + #[cfg(feature = "fast-float")] { - let prev = prev.as_slice().unwrap(); - let fut = fut.as_slice_mut().unwrap(); - assert_eq!(prev.len(), fut.len()); - assert!(prev.len() >= 2 * block.len()); - - for (bl, f) in block.iter().zip(fut.iter_mut()) { - let diff = dotproduct(bl.iter(), prev[..bl.len()].iter()); - *f = diff * idx; - } - - // The window needs to be aligned to the diagonal elements, - // based on the block size - let window_elems_to_skip = block.len() - ((diag.len() - 1) / 2); - - for (window, f) in prev - .windows(diag.len()) - .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(block.len())) - .take(nx - 2 * block.len()) - { - let diff = dotproduct(diag.iter(), window.iter()); - *f = diff * idx; - } - - for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = dotproduct(bl.iter(), prev.iter().rev()); - - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; - } + // We do not care about the order of multiplication nor addition + (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() } - } + #[cfg(not(feature = "fast-float"))] + { + acc + u * v + } + }) } #[cfg(feature = "sparse")] -pub(crate) fn sparse_from_block( - block: &[&[Float]], - diag: &[Float], - symmetry: Symmetry, +pub(crate) fn sparse_from_block( + matrix: &BlockMatrix, optype: OperatorType, n: usize, ) -> sprs::CsMat { - assert!(n >= 2 * block.len()); + assert!(n >= 2 * M); let nnz = { - let block_elems = block.iter().fold(0, |acc, x| { - acc + x - .iter() - .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }) - }); - - let diag_elems = diag + let blockstart_elems = matrix + .start .iter() .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); - 2 * block_elems + (n - 2 * block.len()) * diag_elems + let diag_elems = matrix + .diag + .iter() + .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); + + let blockend_elems = matrix + .end + .iter() + .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); + + blockstart_elems + (n - 2 * M) * diag_elems + blockend_elems }; let mut mat = sprs::TriMat::with_capacity((n, n), nnz); @@ -819,7 +528,7 @@ pub(crate) fn sparse_from_block( }; let idx = 1.0 / dx; - for (j, bl) in block.iter().enumerate() { + for (j, bl) in matrix.start.iter_rows().enumerate() { for (i, &b) in bl.iter().enumerate() { if b == 0.0 { continue; @@ -828,9 +537,9 @@ pub(crate) fn sparse_from_block( } } - for j in block.len()..n - block.len() { - let half_diag_len = diag.len() / 2; - for (&d, i) in diag.iter().zip(j - half_diag_len..) { + for j in M..n - M { + let half_diag_len = D / 2; + for (&d, i) in matrix.diag.iter().zip(j - half_diag_len..) 
{ if d == 0.0 { continue; } @@ -838,16 +547,12 @@ pub(crate) fn sparse_from_block( } } - for (bl, j) in block.iter().zip((0..n).rev()).rev() { - for (&b, i) in bl.iter().zip((0..n).rev()).rev() { + for (bl, j) in matrix.end.iter_rows().zip(n - M..) { + for (&b, i) in bl.iter().zip(n - N..) { if b == 0.0 { continue; } - if symmetry == Symmetry::AntiSymmetric { - mat.add_triplet(j, i, -b * idx); - } else { - mat.add_triplet(j, i, b * idx); - } + mat.add_triplet(j, i, b * idx); } } @@ -855,17 +560,22 @@ pub(crate) fn sparse_from_block( } #[cfg(feature = "sparse")] -pub(crate) fn h_matrix(diag: &[Float], n: usize, is_h2: bool) -> sprs::CsMat { +pub(crate) fn h_matrix( + hmatrix: &DiagonalMatrix, + n: usize, + is_h2: bool, +) -> sprs::CsMat { let h = if is_h2 { 1.0 / (n - 2) as Float } else { 1.0 / (n - 1) as Float }; - let nmiddle = n - 2 * diag.len(); - let iter = diag + let nmiddle = n - 2 * D; + let iter = hmatrix + .start .iter() - .chain(std::iter::repeat(&1.0).take(nmiddle)) - .chain(diag.iter().rev()) + .chain(std::iter::repeat(&hmatrix.diag).take(nmiddle)) + .chain(hmatrix.end.iter()) .map(|&x| h * x); let mut mat = sprs::TriMat::with_capacity((n, n), n); diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs index 6d66fea..b409bed 100644 --- a/sbp/src/operators/algos/constmatrix.rs +++ b/sbp/src/operators/algos/constmatrix.rs @@ -72,6 +72,10 @@ impl Matrix { ) -> impl ExactSizeIterator + DoubleEndedIterator { self.data.iter() } + #[inline(always)] + pub const fn row(&self, idx: usize) -> &[T; N] { + &self.data[idx] + } } impl ColVector { diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 417b799..167d36b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -1,4 +1,6 @@ -use super::{SbpOperator1d, SbpOperator2d}; +use super::{ + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, +}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -7,76 +9,61 @@ pub struct SBP4; impl SBP4 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 17.0 / 48.0, 59.0 / 48.0, 43.0 / 48.0, 49.0 / 48.0, - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0, - ]; - - const DIAG_MATRIX: super::RowVector = - super::RowVector::new([[1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0]]); + const DIFF_DIAG: RowVector = RowVector::new([[ + 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0 + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0], - &[-1.0/2.0, 0.0, 1.0/2.0], - &[4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], - &[3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: super::Matrix = super::Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0, 0.0, 0.0], [-1.0/2.0, 0.0, 1.0/2.0, 0.0, 0.0, 0.0], [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); - const BLOCKEND_MATRIX: super::Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + const DIFF_BLOCKEND: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); + + const DIFF: BlockMatrix<4, 6, 5> = + BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, 
Self::DIFF_BLOCKEND); #[rustfmt::skip] - const D2DIAG: &'static [Float] = &[ + const D2DIAG: RowVector = RowVector::new([[ -1.0 / 12.0, 4.0 / 3.0, -5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0 - ]; + ]]); #[rustfmt::skip] - const D2BLOCK: &'static [&'static [Float]] = &[ - &[2.0, -5.0, 4.0, -1.0], - &[1.0, -2.0, 1.0], - &[-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0], - &[-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] - ]; + const D2BLOCK: Matrix = Matrix::new([ + [2.0, -5.0, 4.0, -1.0, 0.0, 0.0], + [1.0, -2.0, 1.0, 0.0, 0.0, 0.0], + [-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0, 0.0], + [-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] + ]); + const D2: BlockMatrix<4, 6, 5> = BlockMatrix::new( + Self::D2BLOCK, + Self::D2DIAG, + super::flip_ud(super::flip_lr(Self::D2BLOCK)), + ); } impl SbpOperator1d for SBP4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn d2(&self) -> Option<&dyn super::SbpOperator1d2> { @@ -84,76 +71,19 @@ impl SbpOperator1d for SBP4 { } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &SBP4::BLOCK_MATRIX, - &SBP4::BLOCKEND_MATRIX, - &SBP4::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} -fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { - let optype = super::OperatorType::Normal; - super::diff_op_col_naive_matrix( - &SBP4::BLOCK_MATRIX, - &SBP4::BLOCKEND_MATRIX, - &SBP4::DIAG_MATRIX, - optype, - prev, - fut, - ) -} - impl SbpOperator2d for SBP4 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * SBP4::BLOCK.len()); - - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); - diff_op_row_local(prev, fut) - } - ([1, _], [1, _]) => { - //diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); - diff_op_col_local(prev, fut) - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - SBP4.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } } impl super::SbpOperator1d2 for SBP4 { fn diff2(&self, prev: ArrayView1, mut fut: 
ArrayViewMut1) { - super::diff_op_1d( - Self::D2BLOCK, - Self::D2DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut.view_mut(), - ); + super::diff_op_1d(&Self::D2, OperatorType::Normal, prev, fut.view_mut()); let hi = (prev.len() - 1) as Float; fut.map_inplace(|x| *x *= hi) } @@ -162,13 +92,7 @@ impl super::SbpOperator1d2 for SBP4 { } #[cfg(feature = "sparse")] fn diff2_matrix(&self, n: usize) -> sprs::CsMat { - let mut m = super::sparse_from_block( - Self::D2BLOCK, - Self::D2DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ); + let mut m = super::sparse_from_block(&Self::D2, OperatorType::Normal, n); let hi = (n - 1) as Float; m.map_inplace(|v| v * hi); m diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 32b392c..cfd7b8b 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -1,4 +1,6 @@ -use super::{diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d}; +use super::{ + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, +}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -7,30 +9,15 @@ pub struct SBP8; impl SBP8 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 2.94890676177879e-01, 1.52572062389771e+00, 2.57452876984127e-01, 1.79811370149912e+00, 4.12708057760141e-01, 1.27848462301587e+00, 9.23295579805997e-01, 1.00933386085916e+00 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 - ]; - #[rustfmt::skip] - const DIAG_MATRIX: RowVector = Matrix::new([[ + const DIFF_DIAG: RowVector = Matrix::new([[ 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02], - &[-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03], - &[3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02], - &[1.28088632226564e-01, -4.19172130036008e-01, -5.57707021445779e-02, 0.00000000000000e+00, 1.24714160903055e-01, 2.81285212519100e-01, -3.94470423942641e-02, -1.96981310738430e-02], - &[-1.82119472519009e-02, 9.09986646154550e-02, -8.51090570277506e-02, -5.43362886365301e-01, 0.00000000000000e+00, 6.37392455438558e-01, -1.02950081118829e-01, 2.98964956216039e-02, -8.65364391190110e-03], - &[-7.93147196245203e-02, 2.22875323171502e-01, -2.07999824391436e-02, -3.95611167748401e-01, -2.05756876210586e-01, 0.00000000000000e+00, 5.45876519966127e-01, -9.42727926638298e-02, 2.97971812952850e-02, -2.79348574643297e-03], - &[2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 
8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03], - &[5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: Matrix = Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02, 0.0, 0.0, 0.0, 0.0], [-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03, 0.0, 0.0, 0.0, 0.0], [3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02, 0.0, 0.0, 0.0, 0.0], @@ -40,107 +27,40 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - const BLOCKEND_MATRIX: super::Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + const DIFF: BlockMatrix<8, 12, 9> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); } impl SbpOperator1d for SBP8 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - /* - let mut flipmatrix = SBP8::BLOCK_MATRIX; - flipmatrix *= &-1.0; - */ - - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &SBP8::BLOCK_MATRIX, - &SBP8::BLOCKEND_MATRIX, - //&flipmatrix, - &SBP8::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} - -fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { - let optype = super::OperatorType::Normal; - super::diff_op_col_matrix( - &SBP8::BLOCK_MATRIX, - &SBP8::BLOCKEND_MATRIX, - &SBP8::DIAG_MATRIX, - optype, - prev, - fut, - ) -} - impl SbpOperator2d for SBP8 { - fn diffxi(&self, 
prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * SBP8::BLOCK.len()); - - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); - diff_op_row_local(prev, fut); - } - ([1, _], [1, _]) => { - //diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); - diff_op_col_local(prev, fut) - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - SBP8.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } } #[test] diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index d2a4eb4..b9742bf 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -1,241 +1,66 @@ use super::{ - diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d, UpwindOperator1d, - UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; -use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis}; +use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; #[derive(Debug, Copy, Clone)] pub struct Upwind4; -/// Simdtype used in diff_simd_col -#[cfg(feature = "f32")] -type SimdT = packed_simd::f32x8; -#[cfg(not(feature = "f32"))] -type SimdT = packed_simd::f64x8; - -macro_rules! 
diff_simd_col_7_47 { - ($name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => { - #[inline(never)] - fn $name(prev: ArrayView2, mut fut: ArrayViewMut2) { - use std::slice; - assert_eq!(prev.shape(), fut.shape()); - assert_eq!(prev.stride_of(Axis(0)), 1); - assert_eq!(fut.stride_of(Axis(0)), 1); - let ny = prev.len_of(Axis(0)); - let nx = prev.len_of(Axis(1)); - assert!(nx >= 2 * $BLOCK.len()); - assert!(ny >= SimdT::lanes()); - assert!(ny % SimdT::lanes() == 0); - - let dx = 1.0 / (nx - 1) as Float; - let idx = 1.0 / dx; - - for j in (0..ny).step_by(SimdT::lanes()) { - let a = unsafe { - [ - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 0)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 1)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 2)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 3)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 4)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 5)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 6)) as *const Float, - SimdT::lanes(), - )), - ] - }; - - for (i, bl) in $BLOCK.iter().enumerate() { - let b = idx - * (a[0] * bl[0] - + a[1] * bl[1] - + a[2] * bl[2] - + a[3] * bl[3] - + a[4] * bl[4] - + a[5] * bl[5] - + a[6] * bl[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, i)) as *mut Float, - SimdT::lanes(), - )); - } - } - - let mut a = a; - for i in $BLOCK.len()..nx - $BLOCK.len() { - // Push a onto circular buffer - a = [a[1], a[2], a[3], a[4], a[5], a[6], unsafe { - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, i + 3)) as *const Float, - SimdT::lanes(), - )) - }]; - let b = idx - * (a[0] * $DIAG[0] - + a[1] * $DIAG[1] - + a[2] * $DIAG[2] - + a[3] * $DIAG[3] - + a[4] * $DIAG[4] - + a[5] * $DIAG[5] - + a[6] * $DIAG[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, i)) as *mut Float, - SimdT::lanes(), - )); - } - } - - let a = unsafe { - [ - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 1)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 2)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 3)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 4)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 5)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 6)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 7)) as *const Float, - SimdT::lanes(), - )), - ] - }; - - for (i, bl) in $BLOCK.iter().enumerate() { - let idx = if $symmetric { idx } else { -idx }; - let b = idx - * (a[0] * bl[0] - + a[1] * bl[1] - + a[2] * bl[2] - + a[3] * bl[3] - + a[4] * bl[4] - + a[5] * bl[5] - + a[6] * bl[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, nx - 1 - i)) as *mut Float, - SimdT::lanes(), - )); - } - } - } - } - 
}; -} - -diff_simd_col_7_47!(diff_simd_col, Upwind4::BLOCK, Upwind4::DIAG, false); -diff_simd_col_7_47!(diss_simd_col, Upwind4::DISS_BLOCK, Upwind4::DISS_DIAG, true); - impl Upwind4 { #[rustfmt::skip] - pub const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 49.0 / 144.0, 61.0 / 48.0, 41.0 / 48.0, 149.0 / 144.0 - ]; - #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 - ]; - #[rustfmt::skip] - const DIAG_MATRIX : RowVector = RowVector::new([ - [-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0] ]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], - &[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], - &[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], - &[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: Matrix = Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], [-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]); - const BLOCKEND_MATRIX: Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + #[rustfmt::skip] + const DIFF_DIAG: RowVector = RowVector::new([[ + -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 + ]]); + const DIFF_BLOCKEND: Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); + + const DIFF: BlockMatrix<4, 7, 7> = + BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0], - &[ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0], - &[-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], - &[3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], - ]; - + const DISS_BLOCK: Matrix = Matrix::new([ + [-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0], + [ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0], + [-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], + [3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 7] = &[ + const DISS_DIAG: RowVector = Matrix::new([[ 1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 - ]; + ]]); + const DISS_BLOCKEND: Matrix = super::flip_ud(super::flip_lr(Self::DISS_BLOCK)); + + const DISS: BlockMatrix<4, 7, 7> = + BlockMatrix::new(Self::DISS_BLOCK, Self::DISS_DIAG, Self::DISS_BLOCKEND); } impl SbpOperator1d for Upwind4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } 
#[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -243,53 +68,14 @@ impl SbpOperator1d for Upwind4 { } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &Upwind4::BLOCK_MATRIX, - &Upwind4::BLOCKEND_MATRIX, - &Upwind4::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} - impl SbpOperator2d for Upwind4 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row_local(prev, fut), - ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { - diff_simd_col(prev, fut) - } - ([1, _], [1, _]) => diff_op_col( - Upwind4::BLOCK, - Upwind4::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - )(prev, fut), - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut); } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -385,14 +171,7 @@ fn upwind4_test() { impl UpwindOperator1d for Upwind4 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::Normal, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -401,53 +180,20 @@ impl UpwindOperator1d for Upwind4 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::Normal, n) } } impl UpwindOperator2d for Upwind4 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row( - Upwind4::DISS_BLOCK, - Upwind4::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - )(prev, fut), - ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { - diss_simd_col(prev, fut); - } - ([1, _], [1, _]) => diff_op_col( - Upwind4::DISS_BLOCK, - Upwind4::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - )(prev, fut), - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in 
prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } #[test] diff --git a/sbp/src/operators/upwind4h2.rs b/sbp/src/operators/upwind4h2.rs index 0ef5f9b..e1cbf60 100644 --- a/sbp/src/operators/upwind4h2.rs +++ b/sbp/src/operators/upwind4h2.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,49 +10,52 @@ pub struct Upwind4h2; impl Upwind4h2 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 0.91e2/0.720e3, 0.325e3/0.384e3, 0.595e3/0.576e3, 0.1909e4/0.1920e4, - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_BLOCK: Matrix = Matrix::new([ + [-3.95604395604396e+00, 5.41758241758242e+00, -1.94505494505495e+00, 4.83516483516484e-01, 0.0, 0.0, 0.0], + [-8.09025641025641e-01, 0.00000000000000e+00, 1.03948717948718e+00, -2.47384615384615e-01, 1.69230769230769e-02, 0.0, 0.0], + [2.37983193277311e-01, -8.51680672268908e-01, 0.00000000000000e+00, 7.35966386554622e-01, -1.36134453781513e-01, 1.38655462184874e-02, 0.0], + [-6.14632442814737e-02, 2.10581456259822e-01, -7.64623712240265e-01, 0.00000000000000e+00, 7.42535358826611e-01, -1.41435306443164e-01, 1.44054478784704e-02], + ]); + #[rustfmt::skip] + const DIFF_DIAG: RowVector = RowVector::new([[ -1.43229166666667e-02, 1.40625000000000e-01, -7.38281250000000e-01, 0.00000000000000e+00, 7.38281250000000e-01, -1.40625000000000e-01, 1.43229166666667e-02 - ]; - #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-3.95604395604396e+00, 5.41758241758242e+00, -1.94505494505495e+00, 4.83516483516484e-01], - &[-8.09025641025641e-01, 0.00000000000000e+00, 1.03948717948718e+00, -2.47384615384615e-01, 1.69230769230769e-02], - &[2.37983193277311e-01, -8.51680672268908e-01, 0.00000000000000e+00, 7.35966386554622e-01, -1.36134453781513e-01, 1.38655462184874e-02], - &[-6.14632442814737e-02, 2.10581456259822e-01, -7.64623712240265e-01, 0.00000000000000e+00, 7.42535358826611e-01, -1.41435306443164e-01, 1.44054478784704e-02], - ]; + ]]); + const DIFF: BlockMatrix<4, 7, 7> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-2.76989342315976e-01, 5.19355016842454e-01, -3.46236677894969e-01, 1.03871003368491e-01], - &[7.75570158484731e-02, -1.62342481638964e-01, 1.47715500579822e-01, -7.98531117124082e-02, 1.69230769230769e-02], - &[-4.23630758836198e-02, 1.21027405937249e-01, -1.91609307039399e-01, 1.82272708078206e-01, -8.31932773109244e-02, 1.38655462184874e-02], - &[1.32037874021759e-02, -6.79734450144910e-02, 1.89370108794365e-01, -2.78654929966754e-01, 2.16081718177056e-01, -8.64326872708224e-02, 1.44054478784704e-02], - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-2.76989342315976e-01, 5.19355016842454e-01, -3.46236677894969e-01, 1.03871003368491e-01, 0.0, 0.0, 0.0], + [7.75570158484731e-02, -1.62342481638964e-01, 
1.47715500579822e-01, -7.98531117124082e-02, 1.69230769230769e-02, 0.0, 0.0], + [-4.23630758836198e-02, 1.21027405937249e-01, -1.91609307039399e-01, 1.82272708078206e-01, -8.31932773109244e-02, 1.38655462184874e-02, 0.0], + [1.32037874021759e-02, -6.79734450144910e-02, 1.89370108794365e-01, -2.78654929966754e-01, 2.16081718177056e-01, -8.64326872708224e-02, 1.44054478784704e-02], + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 7] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 1.43229166666667e-02, -8.59375000000000e-02, 2.14843750000000e-01, -2.86458333333333e-01, 2.14843750000000e-01, -8.59375000000000e-02, 1.43229166666667e-02, - ]; + ]]); + const DISS: BlockMatrix<4, 7, 7> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind4h2 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, OperatorType::H2, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } fn is_h2(&self) -> bool { true @@ -59,17 +63,11 @@ impl SbpOperator1d for Upwind4h2 { #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DIFF, OperatorType::H2, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -78,80 +76,28 @@ impl SbpOperator1d for Upwind4h2 { } impl SbpOperator2d for Upwind4h2 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4h2::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind4h2::BLOCK, Upwind4h2::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind4h2::BLOCK, Upwind4h2::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4h2.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } } impl UpwindOperator2d for Upwind4h2 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4h2::BLOCK.len()); - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row( - Upwind4h2::DISS_BLOCK, - Upwind4h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col( - Upwind4h2::DISS_BLOCK, - Upwind4h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by 
row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4h2.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } #[test] @@ -181,14 +127,7 @@ fn upwind4h2_test() { impl UpwindOperator1d for Upwind4h2 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::H2, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -197,12 +136,6 @@ impl UpwindOperator1d for Upwind4h2 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::H2, n) } } diff --git a/sbp/src/operators/upwind9.rs b/sbp/src/operators/upwind9.rs index 596a354..c6d374e 100644 --- a/sbp/src/operators/upwind9.rs +++ b/sbp/src/operators/upwind9.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,72 +10,69 @@ pub struct Upwind9; impl Upwind9 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 1070017.0/3628800.0, 5537111.0/3628800.0, 103613.0/403200.0, 261115.0/145152.0, 298951.0/725760.0, 515677.0/403200.0, 3349879.0/3628800.0, 3662753.0/3628800.0 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_DIAG: RowVector = RowVector::new([[ -1.0/1260.0, 5.0/504.0, -5.0/84.0, 5.0/21.0, -5.0/6.0, 0.0, 5.0/6.0, -5.0/21.0, 5.0/84.0, -5.0/504.0, 1.0/1260.0, - ]; + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-1.69567399396458e+00, 2.29023358159400e+00, -2.16473500425698e-01, -5.05879766354449e-01, -1.01161106778154e-01, 2.59147072064383e-01, 1.93922119400659e-02, -4.95844980755642e-02], - &[-4.42575354959737e-01, 0.00000000000000e+00, 1.91582959381899e-01, 2.82222626681305e-01, 1.12083989713257e-01, -1.51334868892111e-01, -2.23600502721044e-02, 3.03806983474913e-02], - &[2.48392603571843e-01, -1.13758367065272e+00, 0.00000000000000e+00, 1.95334726810969e+00, -1.58879011773212e+00, 3.93797129320378e-01, 2.52140821030291e-01, -1.21304033647356e-01], - &[8.29213105268236e-02, -2.39388470313226e-01, -2.79038666398460e-01, 0.00000000000000e+00, 3.43018053395471e-01, 1.10370852514749e-01, 1.72029988649808e-03, -2.00445645303789e-02, 4.41184918522490e-04], - &[7.24159504343116e-02, -4.15199475743626e-01, 9.91181694804303e-01, -1.49802407438608e+00, 0.00000000000000e+00, 1.30188867830442e+00, -6.03535071819214e-01, 1.73429775718218e-01, -2.40842144699299e-02, 1.92673715759439e-03], - &[-5.97470838462221e-02, 1.80551858630298e-01, -7.91241454636765e-02, -1.55240829877729e-01, -4.19298775383066e-01, 0.00000000000000e+00, 6.42287612546289e-01, -1.48833147569152e-01, 4.65407609802260e-02, -7.75679349670433e-03, 6.20543479736347e-04], - &[-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, 
-3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04], - &[1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], - ]; + const DIFF_BLOCK: Matrix = Matrix::new([ + [-1.69567399396458e+00, 2.29023358159400e+00, -2.16473500425698e-01, -5.05879766354449e-01, -1.01161106778154e-01, 2.59147072064383e-01, 1.93922119400659e-02, -4.95844980755642e-02, 0.0, 0.0, 0.0, 0.0, 0.0], + [-4.42575354959737e-01, 0.00000000000000e+00, 1.91582959381899e-01, 2.82222626681305e-01, 1.12083989713257e-01, -1.51334868892111e-01, -2.23600502721044e-02, 3.03806983474913e-02, 0.0, 0.0, 0.0, 0.0,0.0], + [2.48392603571843e-01, -1.13758367065272e+00, 0.00000000000000e+00, 1.95334726810969e+00, -1.58879011773212e+00, 3.93797129320378e-01, 2.52140821030291e-01, -1.21304033647356e-01, 0.0, 0.0, 0.0, 0.0, 0.0], + [8.29213105268236e-02, -2.39388470313226e-01, -2.79038666398460e-01, 0.00000000000000e+00, 3.43018053395471e-01, 1.10370852514749e-01, 1.72029988649808e-03, -2.00445645303789e-02, 4.41184918522490e-04, 0.0, 0.0, 0.0, 0.0], + [7.24159504343116e-02, -4.15199475743626e-01, 9.91181694804303e-01, -1.49802407438608e+00, 0.00000000000000e+00, 1.30188867830442e+00, -6.03535071819214e-01, 1.73429775718218e-01, -2.40842144699299e-02, 1.92673715759439e-03, 0.0, 0.0, 0.0], + [-5.97470838462221e-02, 1.80551858630298e-01, -7.91241454636765e-02, -1.55240829877729e-01, -4.19298775383066e-01, 0.00000000000000e+00, 6.42287612546289e-01, -1.48833147569152e-01, 4.65407609802260e-02, -7.75679349670433e-03, 6.20543479736347e-04, 0.0, 0.0], + [-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, -3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04, 0.0], + [1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], + ]); + const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-3.99020778658945e-04, 2.05394169917502e-03, -4.24493243399805e-03, 4.38126393542801e-03, -2.18883813216888e-03, 2.98565988131608e-04, 1.38484104084115e-04, -3.94643819928825e-05], - &[3.96913216138553e-04, -2.28230530115522e-03, 5.43069719436758e-03, -6.81086901935894e-03, 4.69064759201504e-03, -1.61429862514855e-03, 1.62083873811316e-04, 2.71310693302277e-05], - &[-4.87084939816571e-03, 3.22464611075207e-02, -9.06094757860846e-02, 1.39830191253413e-01, -1.27675500367419e-01, 6.87310321912961e-02, -2.00917702215270e-02, 2.43991122096699e-03], - &[7.18155125886276e-04, -5.77715378536685e-03, 1.99749582302141e-02, -3.87940986951101e-02, 4.62756436981388e-02, -3.46770570075288e-02, 1.59058082995305e-02, -4.06744078428648e-03, 4.41184918522490e-04], - &[-1.56687484682703e-03, 1.73758484693946e-02, 
-7.96515646886111e-02, 2.02094401829054e-01, -3.16098733124618e-01, 3.17999240131250e-01, -2.06522928911140e-01, 8.37112455598470e-02, -1.92673715759439e-02, 1.92673715759439e-03], - &[6.88352254356072e-05, -1.92595810396278e-03, 1.38098624496279e-02, -4.87746083763075e-02, 1.02417890394006e-01, -1.38292226669620e-01, 1.23829022892659e-01, -7.34723830823462e-02, 2.79244565881356e-02, -6.20543479736347e-03, 6.20543479736347e-04], - &[4.42345367100640e-05, 2.67913080025652e-04, -5.59301314813691e-03, 3.09954862110834e-02, -9.21529346596015e-02, 1.71559035817103e-01, -2.12738289547735e-01, 1.79835101537893e-01, -1.03167905467630e-01, 3.86879645503614e-02, -8.59732545563586e-03, 8.59732545563586e-04], - &[-1.15289127131636e-05, 4.10149803795578e-05, 6.21188131452618e-04, -7.24912245235322e-03, 3.41622279353287e-02, -9.30972311856124e-02, 1.64473506705108e-01, -1.98013074867399e-01, 1.65121699443015e-01, -9.43552568245798e-02, 3.53832213092174e-02, -7.86293806871498e-03, 7.86293806871498e-04] - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-3.99020778658945e-04, 2.05394169917502e-03, -4.24493243399805e-03, 4.38126393542801e-03, -2.18883813216888e-03, 2.98565988131608e-04, 1.38484104084115e-04, -3.94643819928825e-05, 0.0, 0.0, 0.0, 0.0, 0.0], + [3.96913216138553e-04, -2.28230530115522e-03, 5.43069719436758e-03, -6.81086901935894e-03, 4.69064759201504e-03, -1.61429862514855e-03, 1.62083873811316e-04, 2.71310693302277e-05, 0.0, 0.0, 0.0, 0.0, 0.0 ], + [-4.87084939816571e-03, 3.22464611075207e-02, -9.06094757860846e-02, 1.39830191253413e-01, -1.27675500367419e-01, 6.87310321912961e-02, -2.00917702215270e-02, 2.43991122096699e-03, 0.0, 0.0, 0.0, 0.0, 0.0], + [7.18155125886276e-04, -5.77715378536685e-03, 1.99749582302141e-02, -3.87940986951101e-02, 4.62756436981388e-02, -3.46770570075288e-02, 1.59058082995305e-02, -4.06744078428648e-03, 4.41184918522490e-04, 0.0, 0.0, 0.0, 0.0], + [-1.56687484682703e-03, 1.73758484693946e-02, -7.96515646886111e-02, 2.02094401829054e-01, -3.16098733124618e-01, 3.17999240131250e-01, -2.06522928911140e-01, 8.37112455598470e-02, -1.92673715759439e-02, 1.92673715759439e-03, 0.0, 0.0, 0.0], + [6.88352254356072e-05, -1.92595810396278e-03, 1.38098624496279e-02, -4.87746083763075e-02, 1.02417890394006e-01, -1.38292226669620e-01, 1.23829022892659e-01, -7.34723830823462e-02, 2.79244565881356e-02, -6.20543479736347e-03, 6.20543479736347e-04, 0.0, 0.0], + [4.42345367100640e-05, 2.67913080025652e-04, -5.59301314813691e-03, 3.09954862110834e-02, -9.21529346596015e-02, 1.71559035817103e-01, -2.12738289547735e-01, 1.79835101537893e-01, -1.03167905467630e-01, 3.86879645503614e-02, -8.59732545563586e-03, 8.59732545563586e-04, 0.0], + [-1.15289127131636e-05, 4.10149803795578e-05, 6.21188131452618e-04, -7.24912245235322e-03, 3.41622279353287e-02, -9.30972311856124e-02, 1.64473506705108e-01, -1.98013074867399e-01, 1.65121699443015e-01, -9.43552568245798e-02, 3.53832213092174e-02, -7.86293806871498e-03, 7.86293806871498e-04] + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 1.0/1260.0, -1.0/126.0, 1.0/28.0, -2.0/21.0, 1.0/6.0, -1.0/5.0, 1.0/6.0, -2.0/21.0, 1.0/28.0, -1.0/126.0, 1.0/1260.0, - ]; + ]]); + const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind9 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - 
super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -83,35 +81,14 @@ impl SbpOperator1d for Upwind9 { } impl SbpOperator2d for Upwind9 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9::BLOCK, Upwind9::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9::BLOCK, Upwind9::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -119,14 +96,7 @@ impl SbpOperator2d for Upwind9 { impl UpwindOperator1d for Upwind9 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::Normal, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -135,46 +105,18 @@ impl UpwindOperator1d for Upwind9 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::Normal, n) } } impl UpwindOperator2d for Upwind9 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9::BLOCK.len()); - - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::Normal; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9::DISS_BLOCK, Upwind9::DISS_DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9::DISS_BLOCK, Upwind9::DISS_DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - 
&Self - } } #[test] diff --git a/sbp/src/operators/upwind9h2.rs b/sbp/src/operators/upwind9h2.rs index df2e284..f099008 100644 --- a/sbp/src/operators/upwind9h2.rs +++ b/sbp/src/operators/upwind9h2.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,57 +10,60 @@ pub struct Upwind9h2; impl Upwind9h2 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 0.13528301e8/0.97297200e8, 0.189103813e9/0.232243200e9, 0.125336387e9/0.116121600e9, 0.24412231e8/0.25804800e8, 0.11976149e8/0.11612160e8, 0.27510113e8/0.27869184e8, 0.142384289e9/0.141926400e9, 0.3018054133e10/0.3019161600e10 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_DIAG: RowVector = RowVector::new([[ -7.93650793650794e-04, 9.92063492063492e-03, -5.95238095238095e-02, 2.38095238095238e-01, -8.33333333333333e-01, 0.00000000000000e+00, 8.33333333333333e-01, -2.38095238095238e-01, 5.95238095238095e-02, -9.92063492063492e-03, 7.93650793650794e-04 - ]; + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-3.59606132359119e+00, 4.89833536312447e+00, -1.88949383693789e+00, 5.47742030095865e-01, 1.98323891961440e-01, -1.81919774755160e-01, 9.75536252790282e-03, 1.33182875745630e-02], - &[-8.36438764514407e-01, 0.00000000000000e+00, 1.13444534067338e+00, -2.98589126517177e-01, -5.64158604558766e-02, 6.76718494319307e-02, -7.11434545077044e-03, -3.55909316707803e-03], - &[2.43402051702672e-01, -8.55808694890095e-01, 0.00000000000000e+00, 7.10023434316597e-01, -8.34706840757189e-02, -2.34098045062645e-02, 9.87675724649922e-03, -6.13059793689851e-04], - &[-8.05029896098980e-02, 2.56994775122436e-01, -8.10083655220697e-01, 0.00000000000000e+00, 7.86933049074895e-01, -1.76732206311904e-01, 2.52232469669152e-02, -2.67114375632805e-03, 8.38923734582063e-04], - &[-2.67370674924706e-02, 4.45404208225958e-02, 8.73562441688813e-02, -7.21839392529084e-01, 0.00000000000000e+00, 7.74603212862825e-01, -2.00161722480411e-01, 5.10878939438559e-02, -9.61911880020865e-03, 7.69529504016692e-04], - &[2.56244589039153e-02, -5.58209474181657e-02, 2.55972803937942e-02, 1.69377044771715e-01, -8.09310829929974e-01, 0.00000000000000e+00, 8.33417128898551e-01, -2.39938756878570e-01, 6.03007337701593e-02, -1.00501222950266e-02, 8.04009783602125e-04], - &[-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04], - &[-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] - ]; + const DIFF_BLOCK: Matrix = Matrix::new([ + [-3.59606132359119e+00, 4.89833536312447e+00, -1.88949383693789e+00, 5.47742030095865e-01, 1.98323891961440e-01, -1.81919774755160e-01, 9.75536252790282e-03, 1.33182875745630e-02, 0.0, 0.0, 0.0, 0.0, 0.0], + [-8.36438764514407e-01, 0.00000000000000e+00, 1.13444534067338e+00, -2.98589126517177e-01, 
-5.64158604558766e-02, 6.76718494319307e-02, -7.11434545077044e-03, -3.55909316707803e-03, 0.0, 0.0, 0.0, 0.0, 0.0], + [2.43402051702672e-01, -8.55808694890095e-01, 0.00000000000000e+00, 7.10023434316597e-01, -8.34706840757189e-02, -2.34098045062645e-02, 9.87675724649922e-03, -6.13059793689851e-04, 0.0, 0.0, 0.0, 0.0, 0.0], + [-8.05029896098980e-02, 2.56994775122436e-01, -8.10083655220697e-01, 0.00000000000000e+00, 7.86933049074895e-01, -1.76732206311904e-01, 2.52232469669152e-02, -2.67114375632805e-03, 8.38923734582063e-04, 0.0, 0.0, 0.0, 0.0], + [-2.67370674924706e-02, 4.45404208225958e-02, 8.73562441688813e-02, -7.21839392529084e-01, 0.00000000000000e+00, 7.74603212862825e-01, -2.00161722480411e-01, 5.10878939438559e-02, -9.61911880020865e-03, 7.69529504016692e-04, 0.0, 0.0, 0.0], + [2.56244589039153e-02, -5.58209474181657e-02, 2.55972803937942e-02, 1.69377044771715e-01, -8.09310829929974e-01, 0.00000000000000e+00, 8.33417128898551e-01, -2.39938756878570e-01, 6.03007337701593e-02, -1.00501222950266e-02, 8.04009783602125e-04, 0.0, 0.0], + [-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04, 0.0], + [-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] + ]); + const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-9.67684432055993e-03, 2.46709290489728e-02, -3.59750445192099e-02, 3.68391266633410e-02, -2.15642540957128e-02, 6.31810399460400e-03, -5.50815969583136e-04, -6.12008018523449e-05], - &[4.21280289799980e-03, -1.11651391003720e-02, 1.78139108670387e-02, -2.04287733194061e-02, 1.39229889120575e-02, -5.16220078628933e-03, 8.08589544070426e-04, -2.17901509897605e-06], - &[-4.63425679136442e-03, 1.34385494509234e-02, -2.60541352669482e-02, 3.74164435120410e-02, -3.36394001786921e-02, 1.82199905150995e-02, -5.42550576921788e-03, 6.78314528158575e-04], - &[5.41433680102586e-03, -1.75829845731036e-02, 4.26893646894443e-02, -7.75663958780684e-02, 9.06986463369770e-02, -6.74666360933304e-02, 3.07997352073168e-02, -7.82499022484376e-03, 8.38923734582063e-04], - &[-2.90718839510249e-03, 1.09922241766815e-02, -3.52053141560315e-02, 8.31962208882433e-02, -1.27244413706703e-01, 1.27078393791315e-01, -8.23947157593937e-02, 3.34105586971409e-02, -7.69529504016692e-03, 7.69529504016692e-04], - &[8.89941714023592e-04, -4.25818033750240e-03, 1.99225160493104e-02, -6.46588399513875e-02, 1.32772390609268e-01, -1.78508501143865e-01, 1.59999344843642e-01, -9.51030239931668e-02, 3.61804402620956e-02, -8.04009783602125e-03, 8.04009783602125e-04], - &[-7.63397185185940e-05, 6.56275990492236e-04, -5.83721252683346e-03, 2.90439093207059e-02, -8.47041434795334e-02, 1.57429980520300e-01, -1.95480070025582e-01, 1.65419875422483e-01, -9.49318221478776e-02, 3.55994333054541e-02, -7.91098517898980e-03, 7.91098517898980e-04], - &[-8.51254383837181e-06, -1.77491211003807e-06, 7.32410586432031e-04, -7.40542710012764e-03, 3.44704736857857e-02, -9.39121496781828e-02, 
1.66014456295034e-01, -1.99926171069111e-01, 1.66727824560197e-01, -9.52730426058266e-02, 3.57273909771850e-02, -7.93942021715222e-03, 7.93942021715222e-04] - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-9.67684432055993e-03, 2.46709290489728e-02, -3.59750445192099e-02, 3.68391266633410e-02, -2.15642540957128e-02, 6.31810399460400e-03, -5.50815969583136e-04, -6.12008018523449e-05, 0.0, 0.0, 0.0, 0.0, 0.0], + [4.21280289799980e-03, -1.11651391003720e-02, 1.78139108670387e-02, -2.04287733194061e-02, 1.39229889120575e-02, -5.16220078628933e-03, 8.08589544070426e-04, -2.17901509897605e-06, 0.0, 0.0, 0.0, 0.0, 0.0], + [-4.63425679136442e-03, 1.34385494509234e-02, -2.60541352669482e-02, 3.74164435120410e-02, -3.36394001786921e-02, 1.82199905150995e-02, -5.42550576921788e-03, 6.78314528158575e-04, 0.0, 0.0, 0.0, 0.0, 0.0], + [5.41433680102586e-03, -1.75829845731036e-02, 4.26893646894443e-02, -7.75663958780684e-02, 9.06986463369770e-02, -6.74666360933304e-02, 3.07997352073168e-02, -7.82499022484376e-03, 8.38923734582063e-04, 0.0, 0.0, 0.0, 0.0], + [-2.90718839510249e-03, 1.09922241766815e-02, -3.52053141560315e-02, 8.31962208882433e-02, -1.27244413706703e-01, 1.27078393791315e-01, -8.23947157593937e-02, 3.34105586971409e-02, -7.69529504016692e-03, 7.69529504016692e-04, 0.0, 0.0, 0.0], + [8.89941714023592e-04, -4.25818033750240e-03, 1.99225160493104e-02, -6.46588399513875e-02, 1.32772390609268e-01, -1.78508501143865e-01, 1.59999344843642e-01, -9.51030239931668e-02, 3.61804402620956e-02, -8.04009783602125e-03, 8.04009783602125e-04, 0.0, 0.0], + [-7.63397185185940e-05, 6.56275990492236e-04, -5.83721252683346e-03, 2.90439093207059e-02, -8.47041434795334e-02, 1.57429980520300e-01, -1.95480070025582e-01, 1.65419875422483e-01, -9.49318221478776e-02, 3.55994333054541e-02, -7.91098517898980e-03, 7.91098517898980e-04, 0.0], + [-8.51254383837181e-06, -1.77491211003807e-06, 7.32410586432031e-04, -7.40542710012764e-03, 3.44704736857857e-02, -9.39121496781828e-02, 1.66014456295034e-01, -1.99926171069111e-01, 1.66727824560197e-01, -9.52730426058266e-02, 3.57273909771850e-02, -7.93942021715222e-03, 7.93942021715222e-04] + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 11] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 7.93650793650794e-04, -7.93650793650794e-03, 3.57142857142857e-02, -9.52380952380952e-02, 1.66666666666667e-01, -2.00000000000000e-01, 1.66666666666667e-01, -9.52380952380952e-02, 3.57142857142857e-02, -7.93650793650794e-03, 7.93650793650794e-04 - ]; + ]]); + const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind9h2 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::H2, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } fn is_h2(&self) -> bool { true @@ -67,17 +71,11 @@ impl SbpOperator1d for Upwind9h2 { #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::H2, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) 
} fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -86,35 +84,14 @@ impl SbpOperator1d for Upwind9h2 { } impl SbpOperator2d for Upwind9h2 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9h2::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9h2::BLOCK, Upwind9h2::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9h2::BLOCK, Upwind9h2::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9h2.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -147,14 +124,7 @@ fn upwind9h2_test() { impl UpwindOperator1d for Upwind9h2 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::H2, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { self @@ -162,54 +132,17 @@ impl UpwindOperator1d for Upwind9h2 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DISS, OperatorType::H2, n) } } impl UpwindOperator2d for Upwind9h2 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9h2::BLOCK.len()); - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row( - Upwind9h2::DISS_BLOCK, - Upwind9h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col( - Upwind9h2::DISS_BLOCK, - Upwind9h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9h2.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } From 299b4f8083a76d98655dfc7d03d9b6ca50e18ee4 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:13:10 +0100 Subject: [PATCH 25/35] ensure FastFloat flag works --- sbp/src/operators/algos.rs | 30 +++++++++++++----------------- sbp/src/operators/traditional4.rs | 4 ++-- sbp/src/operators/traditional8.rs | 2 +- sbp/src/operators/upwind4.rs | 4 ++-- sbp/src/operators/upwind4h2.rs | 4 ++-- sbp/src/operators/upwind9.rs | 4 ++-- sbp/src/operators/upwind9h2.rs | 4 ++-- 7 files changed, 24 insertions(+), 28 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 
5a608a6..d66e74c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -31,18 +31,14 @@ impl DiagonalMatrix { } #[derive(Clone, Debug, PartialEq)] -pub(crate) struct BlockMatrix { - pub start: Matrix, - pub diag: RowVector, - pub end: Matrix, +pub(crate) struct BlockMatrix { + pub start: Matrix, + pub diag: RowVector, + pub end: Matrix, } -impl BlockMatrix { - pub const fn new( - start: Matrix, - diag: RowVector, - end: Matrix, - ) -> Self { +impl BlockMatrix { + pub const fn new(start: Matrix, diag: RowVector, end: Matrix) -> Self { Self { start, diag, end } } } @@ -57,7 +53,7 @@ pub(crate) enum OperatorType { #[inline(always)] /// Works on all 1d vectors pub(crate) fn diff_op_1d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -106,7 +102,7 @@ pub(crate) fn diff_op_1d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: &[Float], fut: &mut [Float], @@ -181,7 +177,7 @@ pub(crate) fn diff_op_1d_slice( #[inline(always)] /// Will always work on 1d, delegated based on slicedness pub(crate) fn diff_op_1d( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -201,7 +197,7 @@ pub(crate) fn diff_op_1d( #[allow(unused)] /// 2D diff fallback for when matrices are not slicable pub(crate) fn diff_op_2d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -275,7 +271,7 @@ pub(crate) fn diff_op_2d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -292,7 +288,7 @@ pub(crate) fn diff_op_2d_sliceable( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, fut: ArrayViewMut2, @@ -494,7 +490,7 @@ fn dotproduct<'a>( #[cfg(feature = "sparse")] pub(crate) fn sparse_from_block( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, n: usize, ) -> sprs::CsMat { diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 167d36b..e2b59df 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -27,7 +27,7 @@ impl SBP4 { const DIFF_BLOCKEND: super::Matrix = super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); - const DIFF: BlockMatrix<4, 6, 5> = + const DIFF: BlockMatrix = BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] @@ -41,7 +41,7 @@ impl SBP4 { [-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0, 0.0], [-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] ]); - const D2: BlockMatrix<4, 6, 5> = BlockMatrix::new( + const D2: BlockMatrix = BlockMatrix::new( Self::D2BLOCK, Self::D2DIAG, super::flip_ud(super::flip_lr(Self::D2BLOCK)), diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index cfd7b8b..d1dcf4c 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -27,7 +27,7 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, 
-1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - const DIFF: BlockMatrix<8, 12, 9> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index b9742bf..0126a9d 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -27,7 +27,7 @@ impl Upwind4 { const DIFF_BLOCKEND: Matrix = super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); - const DIFF: BlockMatrix<4, 7, 7> = + const DIFF: BlockMatrix = BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] @@ -43,7 +43,7 @@ impl Upwind4 { ]]); const DISS_BLOCKEND: Matrix = super::flip_ud(super::flip_lr(Self::DISS_BLOCK)); - const DISS: BlockMatrix<4, 7, 7> = + const DISS: BlockMatrix = BlockMatrix::new(Self::DISS_BLOCK, Self::DISS_DIAG, Self::DISS_BLOCKEND); } diff --git a/sbp/src/operators/upwind4h2.rs b/sbp/src/operators/upwind4h2.rs index e1cbf60..2a270b3 100644 --- a/sbp/src/operators/upwind4h2.rs +++ b/sbp/src/operators/upwind4h2.rs @@ -24,7 +24,7 @@ impl Upwind4h2 { const DIFF_DIAG: RowVector = RowVector::new([[ -1.43229166666667e-02, 1.40625000000000e-01, -7.38281250000000e-01, 0.00000000000000e+00, 7.38281250000000e-01, -1.40625000000000e-01, 1.43229166666667e-02 ]]); - const DIFF: BlockMatrix<4, 7, 7> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -42,7 +42,7 @@ impl Upwind4h2 { const DISS_DIAG: RowVector = RowVector::new([[ 1.43229166666667e-02, -8.59375000000000e-02, 2.14843750000000e-01, -2.86458333333333e-01, 2.14843750000000e-01, -8.59375000000000e-02, 1.43229166666667e-02, ]]); - const DISS: BlockMatrix<4, 7, 7> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), diff --git a/sbp/src/operators/upwind9.rs b/sbp/src/operators/upwind9.rs index c6d374e..192b577 100644 --- a/sbp/src/operators/upwind9.rs +++ b/sbp/src/operators/upwind9.rs @@ -28,7 +28,7 @@ impl Upwind9 { [-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, -3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04, 0.0], [1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], ]); - const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -50,7 +50,7 @@ impl Upwind9 { const DISS_DIAG: RowVector = RowVector::new([[ 1.0/1260.0, -1.0/126.0, 1.0/28.0, -2.0/21.0, 1.0/6.0, -1.0/5.0, 1.0/6.0, -2.0/21.0, 1.0/28.0, -1.0/126.0, 1.0/1260.0, ]]); - const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), diff --git a/sbp/src/operators/upwind9h2.rs b/sbp/src/operators/upwind9h2.rs index f099008..9eb6830 100644 --- 
a/sbp/src/operators/upwind9h2.rs +++ b/sbp/src/operators/upwind9h2.rs @@ -28,7 +28,7 @@ impl Upwind9h2 { [-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04, 0.0], [-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] ]); - const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -50,7 +50,7 @@ impl Upwind9h2 { const DISS_DIAG: RowVector = RowVector::new([[ 7.93650793650794e-04, -7.93650793650794e-03, 3.57142857142857e-02, -9.52380952380952e-02, 1.66666666666667e-01, -2.00000000000000e-01, 1.66666666666667e-01, -9.52380952380952e-02, 3.57142857142857e-02, -7.93650793650794e-03, 7.93650793650794e-04 ]]); - const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), From b15ea57e6d68497f45f4c9b06ac7eb59976bb598 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:21:06 +0100 Subject: [PATCH 26/35] inline for perf --- sbp/src/operators/algos.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index d66e74c..9d24e9d 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -270,6 +270,7 @@ pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, From 74d99a4a181dd1d7f87882167f9c1c79f7074122 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:58:13 +0100 Subject: [PATCH 27/35] try ndarray transmute --- sbp/Cargo.toml | 1 + sbp/src/operators/algos.rs | 25 ++++++---- sbp/src/operators/algos/fastfloat.rs | 71 +++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index 9b7834f..9cb7a50 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -11,6 +11,7 @@ packed_simd = { version = "0.3.3", package = "packed_simd_2" } rayon = { version = "1.3.0", optional = true } sprs = { version = "0.9.0", optional = true, default-features = false } serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } +num-traits = "0.2.14" [features] # Use f32 as precision, default is f64 diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 9d24e9d..a6485a8 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,5 +1,6 @@ use super::*; use ndarray::s; +use num_traits::Zero; pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; @@ -194,14 +195,24 @@ pub(crate) fn diff_op_1d( } #[inline(always)] -#[allow(unused)] /// 2D diff fallback for when matrices are not slicable pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, - mut fut: ArrayViewMut2, + fut: ArrayViewMut2, ) { + #[cfg(feature = "fast-float")] + let (matrix, prev, mut fut) = unsafe { + ( + std::mem::transmute::<_, &BlockMatrix>(matrix), + std::mem::transmute::<_, 
ArrayView2>(prev), + std::mem::transmute::<_, ArrayViewMut2>(fut), + ) + }; + #[cfg(not(feature = "fast-float"))] + let mut fut = fut; + assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[1]; let ny = prev.shape()[0]; @@ -214,8 +225,7 @@ pub(crate) fn diff_op_2d_fallback for FastFloat { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: Float) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o)) } + } +} + +impl core::ops::Mul for Float { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: FastFloat) -> Self::Output { + unsafe { FastFloat(fmul_fast(self, o.0)) } + } +} + +impl core::ops::Add for FastFloat { + type Output = FastFloat; #[inline(always)] fn add(self, o: FastFloat) -> Self::Output { unsafe { Self(fadd_fast(self.0, o.0)) } } } +impl core::ops::Add for FastFloat { + type Output = FastFloat; + #[inline(always)] + fn add(self, o: Float) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o)) } + } +} + +impl core::ops::Add for Float { + type Output = FastFloat; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { FastFloat(fadd_fast(self, o.0)) } + } +} + +impl core::ops::Sub for FastFloat { + type Output = Self; + #[inline(always)] + fn sub(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, -o.0)) } + } +} + +impl core::ops::Div for FastFloat { + type Output = Self; + #[inline(always)] + fn div(self, o: FastFloat) -> Self::Output { + Self(self.0 / o.0) + } +} + impl core::ops::MulAssign for FastFloat { #[inline(always)] fn mul_assign(&mut self, o: FastFloat) { @@ -51,3 +99,22 @@ impl From for Float { f.0 } } + +mod numt { + use super::{FastFloat, Float}; + use num_traits::identities::{One, Zero}; + + impl One for FastFloat { + fn one() -> Self { + Self(Float::one()) + } + } + impl Zero for FastFloat { + fn zero() -> Self { + Self(Float::zero()) + } + fn is_zero(&self) -> bool { + self.0.is_zero() + } + } +} From c709cf465e8c85947a6ee9022050771cb59db8f3 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Tue, 2 Feb 2021 00:11:41 +0100 Subject: [PATCH 28/35] remove ndarray transmute --- sbp/src/operators/algos.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a6485a8..8340632 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -200,8 +200,9 @@ pub(crate) fn diff_op_2d_fallback, optype: OperatorType, prev: ArrayView2, - fut: ArrayViewMut2, + mut fut: ArrayViewMut2, ) { + /* Does not increase the perf... #[cfg(feature = "fast-float")] let (matrix, prev, mut fut) = unsafe { ( @@ -212,6 +213,7 @@ pub(crate) fn diff_op_2d_fallback Date: Tue, 2 Feb 2021 00:33:37 +0100 Subject: [PATCH 29/35] specialise on contigous ny --- sbp/src/operators/algos.rs | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 8340632..2a12fc3 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -281,6 +281,105 @@ pub(crate) fn diff_op_2d_fallback( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + /* Does not increase the perf... 
+ #[cfg(feature = "fast-float")] + let (matrix, prev, mut fut) = unsafe { + ( + std::mem::transmute::<_, &BlockMatrix>(matrix), + std::mem::transmute::<_, ArrayView2>(prev), + std::mem::transmute::<_, ArrayViewMut2>(fut), + ) + }; + #[cfg(not(feature = "fast-float"))] + let mut fut = fut; + */ + + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + let ny = prev.shape()[0]; + assert!(nx >= 2 * M); + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + fut.fill(0.0.into()); + let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut(( + ndarray::s![.., ..M], + ndarray::s![.., M..nx - M], + ndarray::s![.., nx - M..], + )); + + // First block + for (bl, mut fut) in matrix + .start + .iter_rows() + .zip(fut0.axis_iter_mut(ndarray::Axis(1))) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl.is_zero() { + continue; + } + let prev = &prev.as_slice().unwrap()[..ny]; + for (fut, prev) in fut.iter_mut().zip(prev) { + *fut = *fut + idx * prev * bl; + } + } + } + + let window_elems_to_skip = M - ((D - 1) / 2); + + // Diagonal entries + for (mut fut, id) in futmid + .axis_iter_mut(ndarray::Axis(1)) + .zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip)) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&d, id) in matrix.diag.iter().zip(id.axis_iter(ndarray::Axis(1))) { + if d.is_zero() { + continue; + } + let id = id.as_slice().unwrap(); + for (fut, id) in fut.iter_mut().zip(id) { + *fut = *fut + idx * id * d; + } + } + } + + // End block + let prev = prev.slice(ndarray::s!(.., nx - N..)); + for (bl, mut fut) in matrix + .end + .iter_rows() + .zip(futn.axis_iter_mut(ndarray::Axis(1))) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl.is_zero() { + continue; + } + let prev = &prev.as_slice().unwrap()[..ny]; + for (fut, prev) in fut.iter_mut().zip(prev) { + *fut = *fut + idx * prev * bl; + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_2d_sliceable( matrix: &BlockMatrix, @@ -308,6 +407,7 @@ pub(crate) fn diff_op_2d( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), + ([1, _], [1, _]) => diff_op_2d_sliceable_y(matrix, optype, prev, fut), _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 7ec426b5a8e8c79149a99c88e96a3fb7ce0374d0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 3 Feb 2021 08:31:33 +0100 Subject: [PATCH 30/35] add more fast intrinsics --- sbp/src/operators/algos/fastfloat.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos/fastfloat.rs b/sbp/src/operators/algos/fastfloat.rs index efad00b..f4e4e8b 100644 --- a/sbp/src/operators/algos/fastfloat.rs +++ b/sbp/src/operators/algos/fastfloat.rs @@ -4,7 +4,7 @@ use super::*; #[derive(Copy, Clone, Debug, PartialEq, Default)] pub(crate) struct FastFloat(Float); -use core::intrinsics::{fadd_fast, fmul_fast}; +use core::intrinsics::{fadd_fast, fdiv_fast, fmul_fast, fsub_fast}; impl core::ops::Mul for FastFloat { type Output = Self; @@ -58,7 +58,7 @@ impl core::ops::Sub for FastFloat { type Output = Self; #[inline(always)] fn sub(self, o: FastFloat) -> Self::Output { - unsafe { Self(fadd_fast(self.0, -o.0)) } + unsafe { 
Self(fsub_fast(self.0, o.0)) } } } @@ -66,7 +66,7 @@ impl core::ops::Div for FastFloat { type Output = Self; #[inline(always)] fn div(self, o: FastFloat) -> Self::Output { - Self(self.0 / o.0) + unsafe { Self(fdiv_fast(self.0, o.0)) } } } From cf4d8f1e9ba7fefb1992f9363217d1eb47df9e1b Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 3 Feb 2021 08:41:11 +0100 Subject: [PATCH 31/35] add Zero for constmatrix --- sbp/src/operators/algos/constmatrix.rs | 34 ++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs index b409bed..7203d64 100644 --- a/sbp/src/operators/algos/constmatrix.rs +++ b/sbp/src/operators/algos/constmatrix.rs @@ -1,4 +1,7 @@ #![allow(unused)] + +use num_traits::identities::Zero; + /// A row-major matrix #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(C)] @@ -16,6 +19,17 @@ impl Default for Matrix Zero for Matrix { + fn zero() -> Self { + Self { + data: [[T::zero(); N]; M], + } + } + fn is_zero(&self) -> bool { + self == &Self::zero() + } +} + impl core::ops::Index<(usize, usize)> for Matrix { type Output = T; #[inline(always)] @@ -123,9 +137,9 @@ macro_rules! impl_op_mul_mul { T: Copy + Default + core::ops::Add + core::ops::Mul, { type Output = Matrix; - fn mul(self, rhs: $rhs) -> Self::Output { + fn mul(self, lhs: $rhs) -> Self::Output { let mut out = Matrix::default(); - out.matmul_into(&self, &rhs); + out.matmul_into(&self, &lhs); out } } @@ -147,6 +161,22 @@ where } } +impl core::ops::Add> for Matrix +where + T: Copy + Zero + core::ops::Add + PartialEq, +{ + type Output = Self; + fn add(self, lhs: Self) -> Self::Output { + let mut out = Matrix::zero(); + for i in 0..M { + for j in 0..N { + out[(i, j)] = self[(i, j)] + lhs[(i, j)]; + } + } + out + } +} + #[cfg(test)] mod tests { use super::{super::*, *}; From 87c055f81e31ed40189a0a1066d0ce3599d3c89e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Tue, 9 Feb 2021 21:44:35 +0100 Subject: [PATCH 32/35] Add back simd column algo --- sbp/src/operators/algos.rs | 171 ++++++++++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 2a12fc3..3ec4aff 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -283,6 +283,7 @@ pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, @@ -380,6 +381,174 @@ pub(crate) fn diff_op_2d_sliceable_y( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + #[cfg(not(feature = "f32"))] + type SimdT = packed_simd::f64x8; + #[cfg(feature = "f32")] + type SimdT = packed_simd::f32x16; + + let ny = prev.shape()[0]; + // How many elements that can be simdified + let simdified = SimdT::lanes() * (ny / SimdT::lanes()); + + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let fut_base_ptr = fut.as_mut_ptr(); + let fut_stride = fut.strides()[1]; + let fut_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } + }; + + let prev_base_ptr = prev.as_ptr(); + let prev_stride = prev.strides()[1]; + let prev_ptr = |j, i| 
{ + debug_assert!(j < ny && i < nx); + unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } + }; + + // Not algo necessary, but gives performance increase + assert_eq!(fut_stride, prev_stride); + + // First block + { + for (ifut, &bl) in matrix.start.iter_rows().enumerate() { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in bl.iter().enumerate() { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } + + // Diagonal elements + { + for ifut in M..nx - M { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (id, &d) in matrix.diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + } + f *= idx; + unsafe { + // puts simd along stride 1, j never goes past end of slice + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + let mut f = 0.0; + for (id, &d) in matrix.diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + unsafe { + f += d * *prev_ptr(j, offset); + } + } + unsafe { + *fut_ptr(j, ifut) = idx * f; + } + } + } + } + + // End block + { + // Get blocks and corresponding offsets + // (rev to iterate in ifut increasing order) + for (bl, ifut) in matrix.end.iter_rows().zip(nx - M..) { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (&bl, iprev) in bl.iter().zip(nx - N..) { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f = f * idx; + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (&bl, iprev) in bl.iter().zip(nx - N..) 
{ + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_2d_sliceable( matrix: &BlockMatrix, @@ -407,7 +576,7 @@ pub(crate) fn diff_op_2d( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), - ([1, _], [1, _]) => diff_op_2d_sliceable_y(matrix, optype, prev, fut), + ([1, _], [1, _]) => diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut), _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 8a6dc60edf9834f3f6cc3f65ecd617bb58cb5ecd Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 10 Feb 2021 19:02:48 +0100 Subject: [PATCH 33/35] remove some unsafe from simd --- sbp/src/operators/algos.rs | 166 ++++++++++++++----------------------- 1 file changed, 64 insertions(+), 102 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 3ec4aff..72750e9 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -390,10 +390,16 @@ pub(crate) fn diff_op_2d_sliceable_y_simd= 2 * M); - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); + assert_eq!(prev.strides(), fut.strides()); + assert_eq!(prev.strides(), &[1, ny as isize]); + + let prev = prev.as_slice_memory_order().unwrap(); + let fut = fut.as_slice_memory_order_mut().unwrap(); + let prev = &prev[..nx * ny]; + let fut = &mut fut[..nx * ny]; let dx = if optype == OperatorType::H2 { 1.0 / (nx - 2) as Float @@ -407,103 +413,88 @@ pub(crate) fn diff_op_2d_sliceable_y_simd( + matrix: &Matrix, + idx: Float, + ny: usize, + simdified: usize, + prev: &[Float], + fut: &mut [Float], + ) { + assert_eq!(prev.len(), N * ny); + assert_eq!(fut.len(), M * ny); + let prev = &prev[..N * ny]; + let fut = &mut fut[..M * ny]; - let prev_base_ptr = prev.as_ptr(); - let prev_stride = prev.strides()[1]; - let prev_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } - }; + let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; - // Not algo necessary, but gives performance increase - assert_eq!(fut_stride, prev_stride); - - // First block - { - for (ifut, &bl) in matrix.start.iter_rows().enumerate() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + for (j, fut) in fut.by_ref().enumerate() { + let index_to_simd = + |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); let mut f = SimdT::splat(0.0); for (iprev, &bl) in bl.iter().enumerate() { f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; + for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * prevcol(iprev)[j]; } + *fut = f * idx; } } } + // First block + { + let prev = &prev[..N * ny]; + block_multiply(&matrix.start, 
idx, ny, simdified, prev, fut0); + } + // Diagonal elements { - for ifut in M..nx - M { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; + for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + for (j, fut) in fut.by_ref().enumerate() { + let index_to_simd = + |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); let mut f = SimdT::splat(0.0); for (id, &d) in matrix.diag.iter().enumerate() { let offset = ifut - half_diag_width + id; f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); } f *= idx; - unsafe { + { // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); + f.write_to_slice_unaligned(fut); } } - for j in simdified..ny { + for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0; for (id, &d) in matrix.diag.iter().enumerate() { let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); + { + f += d * prevcol(offset)[j]; } } - unsafe { - *fut_ptr(j, ifut) = idx * f; + { + *fut = idx * f; } } } @@ -511,41 +502,8 @@ pub(crate) fn diff_op_2d_sliceable_y_simd( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), - ([1, _], [1, _]) => diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut), + ([1, _], [1, _]) + if prev.as_slice_memory_order().is_some() && fut.as_slice_memory_order().is_some() => + { + diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut) + } _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 02175d1734c96a577f37eed797acf78dcf0265d6 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 10 Feb 2021 19:29:26 +0100 Subject: [PATCH 34/35] use some unsafe... 
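
The diff below replaces the bounds-checked `prevcol` slicing closure in the
diagonal loop of `diff_op_2d_sliceable_y_simd` with one built on a raw pointer
(and switches the per-column chunking to `array_chunks_mut`), so each column
lookup becomes a plain offset load instead of a range-checked slice. A minimal,
standalone sketch of the two access patterns follows; the names here are
illustrative only and are not part of the crate:

    // Sketch: column i of a buffer holding nx columns of length ny,
    // stored column-major (stride 1 within a column).
    fn column_checked(buf: &[f64], ny: usize, i: usize) -> &[f64] {
        // Bounds-checked slicing; the range check is not always hoisted
        // out of the hot loop by the optimiser.
        &buf[i * ny..(i + 1) * ny]
    }

    /// Safety: caller must guarantee `(i + 1) * ny <= buf.len()`,
    /// which the asserts at the top of the real function establish.
    unsafe fn column_unchecked(buf: &[f64], ny: usize, i: usize) -> &[f64] {
        std::slice::from_raw_parts(buf.as_ptr().add(i * ny), ny)
    }

    fn main() {
        let ny = 3;
        let buf: Vec<f64> = (0..12).map(f64::from).collect(); // 4 columns of 3
        assert_eq!(column_checked(&buf, ny, 2), unsafe {
            column_unchecked(&buf, ny, 2)
        });
    }

Both closures return the same column; the unchecked variant merely moves the
bounds proof to the caller, which the surrounding asserts already provide.
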
--- sbp/src/operators/algos.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 72750e9..78844dd 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -433,6 +433,14 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { + unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) } + } + }; + */ let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { @@ -468,9 +476,16 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; + let prevcol = { + let prev_ptr = prev.as_ptr(); + move |i: usize| -> &[Float] { + unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) } + } + }; + //let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; + for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { - let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); for (j, fut) in fut.by_ref().enumerate() { let index_to_simd = |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); @@ -481,7 +496,6 @@ pub(crate) fn diff_op_2d_sliceable_y_simd Date: Wed, 10 Feb 2021 21:15:21 +0100 Subject: [PATCH 35/35] remove iterator inhibiting optimisation --- sbp/src/operators/algos.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 78844dd..d14997a 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -444,7 +444,7 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { - let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); for (j, fut) in fut.by_ref().enumerate() { let index_to_simd = |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); @@ -487,17 +487,23 @@ pub(crate) fn diff_op_2d_sliceable_y_simd(); for (j, fut) in fut.by_ref().enumerate() { - let index_to_simd = - |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); + //let index_to_simd = + // |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); + let index_to_simd = |i: usize| unsafe { + let prev = std::slice::from_raw_parts( + prev.as_ptr().add(i * ny + SimdT::lanes() * j), + SimdT::lanes(), + ); + SimdT::from_slice_unaligned_unchecked(prev) + }; let mut f = SimdT::splat(0.0); - for (id, &d) in matrix.diag.iter().enumerate() { + // direct iter does not optimize well here + for (id, &d) in matrix.diag.row(0).iter().enumerate() { let offset = ifut - half_diag_width + id; f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); } f *= idx; - { - f.write_to_slice_unaligned(fut); - } + f.write_to_slice_unaligned(fut); } for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0; @@ -507,9 +513,7 @@ pub(crate) fn diff_op_2d_sliceable_y_simd