Use core_simd over packed_simd

This commit is contained in:
Magnus Ulimoen 2021-07-27 18:32:22 +00:00
parent e25ee9c74a
commit 3edd18c4fd
3 changed files with 26 additions and 17 deletions

View File

@ -7,12 +7,12 @@ edition = "2018"
[dependencies] [dependencies]
ndarray = { version = "0.14.0", features = ["approx"] } ndarray = { version = "0.14.0", features = ["approx"] }
approx = "0.4.0" approx = "0.4.0"
packed_simd = { version = "0.3.3", package = "packed_simd_2" }
sprs = { version = "0.10.0", optional = true, default-features = false } sprs = { version = "0.10.0", optional = true, default-features = false }
serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] }
num-traits = "0.2.14" num-traits = "0.2.14"
float = { path = "../utils/float" } float = { path = "../utils/float" }
constmatrix = { path = "../utils/constmatrix" } constmatrix = { path = "../utils/constmatrix" }
core_simd = { git = "https://github.com/rust-lang/stdsimd" }
[features] [features]
# Use f32 as precision, default is f64 # Use f32 as precision, default is f64

View File

@ -2,6 +2,7 @@
#![feature(array_windows)] #![feature(array_windows)]
#![feature(array_chunks)] #![feature(array_chunks)]
#![feature(const_fn_floating_point_arithmetic)] #![feature(const_fn_floating_point_arithmetic)]
#![feature(portable_simd)]
pub use float::{consts, Float}; pub use float::{consts, Float};

View File

@ -1,6 +1,7 @@
use super::*; use super::*;
use ndarray::s; use ndarray::s;
use num_traits::Zero; use num_traits::Zero;
use std::convert::TryInto;
pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; pub(crate) use constmatrix::{ColVector, Matrix, RowVector};
@ -102,7 +103,6 @@ pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
prev: &[Float], prev: &[Float],
fut: &mut [Float], fut: &mut [Float],
) { ) {
use std::convert::TryInto;
#[inline(never)] #[inline(never)]
/// This prevents code bloat, both start and end blocks give /// This prevents code bloat, both start and end blocks give
/// a matrix multiplication with the same matrix sizes /// a matrix multiplication with the same matrix sizes
@ -392,13 +392,14 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
}; };
let idx = 1.0 / dx; let idx = 1.0 / dx;
use core_simd::Vector;
#[cfg(not(feature = "f32"))] #[cfg(not(feature = "f32"))]
type SimdT = packed_simd::f64x8; type SimdT = core_simd::f64x8;
#[cfg(feature = "f32")] #[cfg(feature = "f32")]
type SimdT = packed_simd::f32x16; type SimdT = core_simd::f32x16;
// How many elements can be simdified // How many elements can be simdified
let simdified = SimdT::lanes() * (ny / SimdT::lanes()); let simdified = SimdT::LANES * (ny / SimdT::LANES);
let (fut0, futmid) = fut.split_at_mut(M * ny); let (fut0, futmid) = fut.split_at_mut(M * ny);
let (futmid, futn) = futmid.split_at_mut((nx - 2 * M) * ny); let (futmid, futn) = futmid.split_at_mut((nx - 2 * M) * ny);
@ -428,16 +429,21 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) {
let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>();
for (j, fut) in fut.by_ref().enumerate() { for (j, fut) in fut.by_ref().enumerate() {
let index_to_simd = let index_to_simd = |i| {
|i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); SimdT::from_array(
(&prevcol(i)[SimdT::LANES * j..SimdT::LANES * (j + 1)])
.try_into()
.unwrap(),
)
};
let mut f = SimdT::splat(0.0); let mut f = SimdT::splat(0.0);
for (iprev, &bl) in bl.iter().enumerate() { for (iprev, &bl) in bl.iter().enumerate() {
f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); f = index_to_simd(iprev).mul_add(SimdT::splat(bl), f);
} }
f *= idx; f *= idx;
f.write_to_slice_unaligned(fut); fut.clone_from_slice(f.as_array());
} }
for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
let mut f = 0.0; let mut f = 0.0;
@ -469,25 +475,27 @@ pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const
//let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; //let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) {
let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); let mut fut = fut.array_chunks_mut::<{ SimdT::LANES }>();
for (j, fut) in fut.by_ref().enumerate() { for (j, fut) in fut.by_ref().enumerate() {
//let index_to_simd = //let index_to_simd =
// |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); // |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]);
let index_to_simd = |i: usize| unsafe { let index_to_simd = |i: usize| unsafe {
let prev = std::slice::from_raw_parts( let prev = std::slice::from_raw_parts(
prev.as_ptr().add(i * ny + SimdT::lanes() * j), prev.as_ptr().add(i * ny + SimdT::LANES * j),
SimdT::lanes(), SimdT::LANES,
); )
SimdT::from_slice_unaligned_unchecked(prev) .try_into()
.unwrap();
SimdT::from_array(prev)
}; };
let mut f = SimdT::splat(0.0); let mut f = SimdT::splat(0.0);
// direct iter does not optimize well here // direct iter does not optimize well here
for (id, &d) in matrix.diag.row(0).iter().enumerate() { for (id, &d) in matrix.diag.row(0).iter().enumerate() {
let offset = ifut - half_diag_width + id; let offset = ifut - half_diag_width + id;
f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); f = index_to_simd(offset).mul_add(SimdT::splat(d), f);
} }
f *= idx; f *= idx;
f.write_to_slice_unaligned(fut); fut.clone_from_slice(f.as_array());
} }
for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
let mut f = 0.0; let mut f = 0.0;