From 3edd18c4fdff5b9bcb4cb060273facc0906d016b Mon Sep 17 00:00:00 2001
From: Magnus Ulimoen <magnus@ulimoen.dev>
Date: Tue, 27 Jul 2021 18:32:22 +0000
Subject: [PATCH] Use core_simd over packed_simd

---
 sbp/Cargo.toml             |  2 +-
 sbp/src/lib.rs             |  1 +
 sbp/src/operators/algos.rs | 40 +++++++++++++++++++++++---------------
 3 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml
index 5fd3a30..e642625 100644
--- a/sbp/Cargo.toml
+++ b/sbp/Cargo.toml
@@ -7,12 +7,12 @@ edition = "2018"
 [dependencies]
 ndarray = { version = "0.14.0", features = ["approx"] }
 approx = "0.4.0"
-packed_simd = { version = "0.3.3", package = "packed_simd_2" }
 sprs = { version = "0.10.0", optional = true, default-features = false }
 serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] }
 num-traits = "0.2.14"
 float = { path = "../utils/float" }
 constmatrix = { path = "../utils/constmatrix" }
+core_simd = { git = "https://github.com/rust-lang/stdsimd" }
 
 [features]
 # Use f32 as precision, default is f64
diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs
index 3360a7f..ffe6ce3 100644
--- a/sbp/src/lib.rs
+++ b/sbp/src/lib.rs
@@ -2,6 +2,7 @@
 #![feature(array_windows)]
 #![feature(array_chunks)]
 #![feature(const_fn_floating_point_arithmetic)]
+#![feature(portable_simd)]
 
 pub use float::{consts, Float};
 
diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs
index 70aa3f7..20ae1d5 100644
--- a/sbp/src/operators/algos.rs
+++ b/sbp/src/operators/algos.rs
@@ -1,6 +1,7 @@
 use super::*;
 use ndarray::s;
 use num_traits::Zero;
+use std::convert::TryInto;
 
 pub(crate) use constmatrix::{ColVector, Matrix, RowVector};
 
@@ -102,7 +103,6 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
     prev: &[Float],
     fut: &mut [Float],
 ) {
-    use std::convert::TryInto;
     #[inline(never)]
     /// This prevents code bloat, both start and end block gives
     /// a matrix multiplication with the same matrix sizes
@@ -392,13 +392,14 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
     };
     let idx = 1.0 / dx;
 
+    use core_simd::Vector;
     #[cfg(not(feature = "f32"))]
-    type SimdT = packed_simd::f64x8;
+    type SimdT = core_simd::f64x8;
     #[cfg(feature = "f32")]
-    type SimdT = packed_simd::f32x16;
+    type SimdT = core_simd::f32x16;
 
     // How many elements that can be simdified
-    let simdified = SimdT::lanes() * (ny / SimdT::lanes());
+    let simdified = SimdT::LANES * (ny / SimdT::LANES);
 
     let (fut0, futmid) = fut.split_at_mut(M * ny);
     let (futmid, futn) = futmid.split_at_mut((nx - 2 * M) * ny);
@@ -428,16 +429,21 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
         let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
 
         for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) {
-            let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>();
+            let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>();
             for (j, fut) in fut.by_ref().enumerate() {
-                let index_to_simd =
-                    |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]);
+                let index_to_simd = |i| {
+                    SimdT::from_array(
+                        (&prevcol(i)[SimdT::LANES * j..SimdT::LANES * (j + 1)])
+                            .try_into()
+                            .unwrap(),
+                    )
+                };
                 let mut f = SimdT::splat(0.0);
                 for (iprev, &bl) in bl.iter().enumerate() {
-                    f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f);
+                    f = index_to_simd(iprev).mul_add(SimdT::splat(bl), f);
                 }
                 f *= idx;
-                f.write_to_slice_unaligned(fut);
+                fut.clone_from_slice(f.as_array());
             }
             for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
                 let mut f = 0.0;
@@ -469,25 +475,27 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
         //let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
 
         for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) {
-            let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>();
+            let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>();
             for (j, fut) in fut.by_ref().enumerate() {
                 //let index_to_simd =
                 //    |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]);
                 let index_to_simd = |i: usize| unsafe {
                     let prev = std::slice::from_raw_parts(
-                        prev.as_ptr().add(i * ny + SimdT::lanes() * j),
-                        SimdT::lanes(),
-                    );
-                    SimdT::from_slice_unaligned_unchecked(prev)
+                        prev.as_ptr().add(i * ny + SimdT::LANES * j),
+                        SimdT::LANES,
+                    )
+                    .try_into()
+                    .unwrap();
+                    SimdT::from_array(prev)
                 };
                 let mut f = SimdT::splat(0.0);
                 // direct iter does not optimize well here
                 for (id, &d) in matrix.diag.row(0).iter().enumerate() {
                     let offset = ifut - half_diag_width + id;
-                    f = index_to_simd(offset).mul_adde(SimdT::splat(d), f);
+                    f = index_to_simd(offset).mul_add(SimdT::splat(d), f);
                 }
                 f *= idx;
-                f.write_to_slice_unaligned(fut);
+                fut.clone_from_slice(f.as_array());
             }
             for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
                 let mut f = 0.0;