From 4ae5c02bb1a7246fe814b9e8f3436e0d4a3402d3 Mon Sep 17 00:00:00 2001
From: Magnus Ulimoen <magnus@ulimoen.dev>
Date: Tue, 23 Mar 2021 19:21:38 +0100
Subject: [PATCH] Replace FastFloat with mul_add

---
 multigrid/Cargo.toml       |  2 +-
 sbp/Cargo.toml             |  1 -
 sbp/src/operators/algos.rs | 70 ++++++++------------------------------
 webfront/Cargo.toml        |  2 +-
 4 files changed, 16 insertions(+), 59 deletions(-)
diff --git a/multigrid/Cargo.toml b/multigrid/Cargo.toml
index 238ad7c..0370fd0 100644
--- a/multigrid/Cargo.toml
+++ b/multigrid/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2018"
 
 
 [dependencies]
-sbp = { path = "../sbp", features = ["serde1", "fast-float"] }
+sbp = { path = "../sbp", features = ["serde1"] }
 euler = { path = "../euler", features = ["serde1"] }
 hdf5 = "0.7.0"
 integrate = { path = "../utils/integrate" }
diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml
index 4e85fe5..5fd3a30 100644
--- a/sbp/Cargo.toml
+++ b/sbp/Cargo.toml
@@ -17,7 +17,6 @@ constmatrix = { path = "../utils/constmatrix" }
 [features]
 # Use f32 as precision, default is f64
 f32 = ["float/f32"]
-fast-float = ["float/fast-float"]
 sparse = ["sprs"]
 serde1 = ["serde", "ndarray/serde"]
 
diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs
index b075447..5d0896e 100644
--- a/sbp/src/operators/algos.rs
+++ b/sbp/src/operators/algos.rs
@@ -4,9 +4,6 @@ use num_traits::Zero;
 
 pub(crate) use constmatrix::{ColVector, Matrix, RowVector};
 
-#[cfg(feature = "fast-float")]
-use float::FastFloat;
-
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct DiagonalMatrix<const B: usize> {
     pub start: [Float; B],
@@ -105,17 +102,14 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
     prev: &[Float],
     fut: &mut [Float],
 ) {
-    #[cfg(feature = "fast-float")]
-    let (matrix, prev, fut) = {
-        use std::mem::transmute;
-        unsafe {
-            (
-                transmute::<_, &BlockMatrix<FastFloat, M, N, D>>(matrix),
-                transmute::<_, &[FastFloat]>(prev),
-                transmute::<_, &mut [FastFloat]>(fut),
-            )
-        }
-    };
+    #[inline(never)] // NOTE(review): presumably so both boundary blocks share one copy
+    fn dedup_matmul<const M: usize, const N: usize>(
+        c: &mut ColVector<Float, M>,
+        a: &Matrix<Float, M, N>,
+        b: &ColVector<Float, N>,
+    ) {
+        c.matmul_float_into(a, b) // thin forwarder, used for matrix.start and matrix.end
+    }
 
     assert_eq!(prev.len(), fut.len());
     let nx = prev.len();
@@ -130,8 +124,6 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
         1.0 / (nx - 1) as Float
     };
     let idx = 1.0 / dx;
-    #[cfg(feature = "fast-float")]
-    let idx = FastFloat::from(idx);
 
     // Help aliasing analysis
     let (futb1, fut) = fut.split_at_mut(M);
@@ -142,7 +134,7 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
         let prev = ColVector::<_, N>::map_to_col(prev.array_windows::<N>().next().unwrap());
         let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap());
 
-        fut.matmul_into(&matrix.start, prev);
+        dedup_matmul(fut, &matrix.start, prev);
         *fut *= idx;
     }
 
@@ -158,7 +150,7 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
         let fut = ColVector::<_, 1>::map_to_col_mut(f);
         let prev = ColVector::<_, D>::map_to_col(window);
 
-        fut.matmul_into(&matrix.diag, prev);
+        fut.matmul_float_into(&matrix.diag, prev);
         *fut *= idx;
     }
 
@@ -167,7 +159,7 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
         let prev = ColVector::<_, N>::map_to_col(prev);
         let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap());
 
-        fut.matmul_into(&matrix.end, prev);
+        dedup_matmul(fut, &matrix.end, prev);
         *fut *= idx;
     }
 }
@@ -199,19 +191,6 @@ pub(crate) fn diff_op_2d_fallback<const M: usize, const N: usize, const D: usize
     prev: ArrayView2<Float>,
     mut fut: ArrayViewMut2<Float>,
 ) {
-    /* Does not increase the perf...
-    #[cfg(feature = "fast-float")]
-    let (matrix, prev, mut fut) = unsafe {
-        (
-            std::mem::transmute::<_, &BlockMatrix<FastFloat, M, N, D>>(matrix),
-            std::mem::transmute::<_, ArrayView2<FastFloat>>(prev),
-            std::mem::transmute::<_, ArrayViewMut2<FastFloat>>(fut),
-        )
-    };
-    #[cfg(not(feature = "fast-float"))]
-    let mut fut = fut;
-    */
-
     assert_eq!(prev.shape(), fut.shape());
     let nx = prev.shape()[1];
     let ny = prev.shape()[0];
@@ -287,19 +266,6 @@ pub(crate) fn diff_op_2d_sliceable_y<const M: usize, const N: usize, const D: us
     prev: ArrayView2<Float>,
     mut fut: ArrayViewMut2<Float>,
 ) {
-    /* Does not increase the perf...
-    #[cfg(feature = "fast-float")]
-    let (matrix, prev, mut fut) = unsafe {
-        (
-            std::mem::transmute::<_, &BlockMatrix<FastFloat, M, N, D>>(matrix),
-            std::mem::transmute::<_, ArrayView2<FastFloat>>(prev),
-            std::mem::transmute::<_, ArrayViewMut2<FastFloat>>(fut),
-        )
-    };
-    #[cfg(not(feature = "fast-float"))]
-    let mut fut = fut;
-    */
-
     assert_eq!(prev.shape(), fut.shape());
     let nx = prev.shape()[1];
     let ny = prev.shape()[0];
@@ -733,17 +699,9 @@ fn dotproduct<'a>(
     u: impl IntoIterator<Item = &'a Float>,
     v: impl IntoIterator<Item = &'a Float>,
 ) -> Float {
-    u.into_iter().zip(v.into_iter()).fold(0.0, |acc, (&u, &v)| {
-        #[cfg(feature = "fast-float")]
-        {
-            // We do not care about the order of multiplication nor addition
-            (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into()
-        }
-        #[cfg(not(feature = "fast-float"))]
-        {
-            acc + u * v
-        }
-    })
+    u.into_iter()
+        .zip(v.into_iter())
+        .fold(0.0, |acc, (&u, &v)| Float::mul_add(u, v, acc))
 }
 
 #[cfg(feature = "sparse")]
diff --git a/webfront/Cargo.toml b/webfront/Cargo.toml
index 43baf83..7d80dc0 100644
--- a/webfront/Cargo.toml
+++ b/webfront/Cargo.toml
@@ -11,7 +11,7 @@ crate-type = ["cdylib"]
 wasm-bindgen = "0.2.63"
 console_error_panic_hook = "0.1.6"
 wee_alloc = "0.4.5"
-sbp = { path = "../sbp", features = ["f32", "fast-float"] }
+sbp = { path = "../sbp", features = ["f32"] }
 ndarray = "0.14.0"
 euler = { path = "../euler" }
 maxwell = { path = "../maxwell" }