diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index 5fd3a30..e642625 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -7,12 +7,12 @@ edition = "2018" [dependencies] ndarray = { version = "0.14.0", features = ["approx"] } approx = "0.4.0" -packed_simd = { version = "0.3.3", package = "packed_simd_2" } sprs = { version = "0.10.0", optional = true, default-features = false } serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } num-traits = "0.2.14" float = { path = "../utils/float" } constmatrix = { path = "../utils/constmatrix" } +core_simd = { git = "https://github.com/rust-lang/stdsimd" } [features] # Use f32 as precision, default is f64 diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 3360a7f..ffe6ce3 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -2,6 +2,7 @@ #![feature(array_windows)] #![feature(array_chunks)] #![feature(const_fn_floating_point_arithmetic)] +#![feature(portable_simd)] pub use float::{consts, Float}; diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 70aa3f7..20ae1d5 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,6 +1,7 @@ use super::*; use ndarray::s; use num_traits::Zero; +use std::convert::TryInto; pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; @@ -102,7 +103,6 @@ pub(crate) fn diff_op_1d_slice( prev: &[Float], fut: &mut [Float], ) { - use std::convert::TryInto; #[inline(never)] /// This prevents code bloat, both start and end block gives /// a matrix multiplication with the same matrix sizes @@ -392,13 +392,14 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { - let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); + let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>(); for (j, fut) in fut.by_ref().enumerate() { - let index_to_simd = - |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); + let index_to_simd = |i| { + SimdT::from_array( + (&prevcol(i)[SimdT::LANES * j..SimdT::LANES * (j + 1)]) + .try_into() + .unwrap(), + ) + }; let mut f = SimdT::splat(0.0); for (iprev, &bl) in bl.iter().enumerate() { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + f = index_to_simd(iprev).mul_add(SimdT::splat(bl), f); } f *= idx; - f.write_to_slice_unaligned(fut); + fut.clone_from_slice(f.as_array()); } for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0; @@ -469,25 +475,27 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { - let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); + let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>(); for (j, fut) in fut.by_ref().enumerate() { //let index_to_simd = // |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); let index_to_simd = |i: usize| unsafe { let prev = std::slice::from_raw_parts( - prev.as_ptr().add(i * ny + SimdT::lanes() * j), - SimdT::lanes(), - ); - SimdT::from_slice_unaligned_unchecked(prev) + prev.as_ptr().add(i * ny + SimdT::LANES * j), + SimdT::LANES, + ) + .try_into() + .unwrap(); + SimdT::from_array(prev) }; let mut f = SimdT::splat(0.0); // direct iter does not optimize well here for (id, &d) in matrix.diag.row(0).iter().enumerate() { let offset = ifut - half_diag_width + id; - f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + f = index_to_simd(offset).mul_add(SimdT::splat(d), f); } f *= idx; - f.write_to_slice_unaligned(fut); + fut.clone_from_slice(f.as_array()); } for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0;