From c104082ac0430c7cced18306647ee55c1701a06f Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 19:18:34 +0100 Subject: [PATCH 01/35] add matrix type --- sbp/src/operators/algos.rs | 122 +++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a8dc30b..75fcae6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,5 +1,127 @@ use super::*; +pub(crate) mod constmatrix { + /// A row-major matrix + #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + pub struct Matrix { + data: [[T; N]; M], + } + + impl Default for Matrix { + fn default() -> Self { + use std::mem::MaybeUninit; + let mut d: [[MaybeUninit; N]; M] = unsafe { MaybeUninit::uninit().assume_init() }; + + for row in d.iter_mut() { + for item in row.iter_mut() { + *item = MaybeUninit::new(T::default()); + } + } + + let data = unsafe { std::mem::transmute_copy::<_, [[T; N]; M]>(&d) }; + Self { data } + } + } + + impl core::ops::Index<(usize, usize)> for Matrix { + type Output = T; + #[inline(always)] + fn index(&self, (i, j): (usize, usize)) -> &Self::Output { + &self.data[i][j] + } + } + + impl core::ops::IndexMut<(usize, usize)> for Matrix { + #[inline(always)] + fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { + &mut self.data[i][j] + } + } + + impl core::ops::Index for Matrix { + type Output = [T; N]; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.data[i] + } + } + + impl core::ops::IndexMut for Matrix { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.data[i] + } + } + + impl Matrix { + pub const fn new(data: [[T; N]; M]) -> Self { + Self { data } + } + pub const fn nrows(&self) -> usize { + M + } + pub const fn ncols(&self) -> usize { + N + } + pub fn matmul(&self, other: &Matrix) -> Matrix + where + T: Default + core::ops::AddAssign, + for<'f> &'f T: std::ops::Mul, + { + let mut out = Matrix::default(); + self.matmul_into(other, &mut out); + out + } + pub fn matmul_into( + &self, + other: &Matrix, + out: &mut Matrix, + ) where + T: Default + core::ops::AddAssign, + for<'f> &'f T: std::ops::Mul, + { + *out = Default::default(); + for i in 0..M { + for j in 0..P { + for k in 0..N { + out[(i, j)] += &self[(i, k)] * &other[(k, j)]; + } + } + } + } + pub fn iter_rows( + &self, + ) -> impl ExactSizeIterator + DoubleEndedIterator { + (0..M).map(move |i| &self[i]) + } + } + + #[cfg(test)] + mod tests { + use super::{super::*, *}; + #[test] + fn construct_copy_type() { + let _m0 = Matrix::::default(); + let _m1: Matrix = Matrix::default(); + + let _m2 = Matrix::new([[1, 2], [3, 4]]); + } + #[test] + fn construct_non_copy() { + let _m = Matrix::::default(); + } + + #[test] + fn matmul() { + let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); + + let m3 = m1.matmul(&m2); + assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); + } + } +} + #[inline(always)] pub(crate) fn diff_op_1d( block: &[&[Float]], From 94e8fb5b7cccff90aeff6916d9fc9df6dc8a540b Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 20:59:11 +0100 Subject: [PATCH 02/35] checkpoint --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 183 +++++++++++++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index e8badea..5b3eaa2 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,4 +1,5 @@ 
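// Illustrative sketch (not part of the patches): intended use of the const-generic
// `Matrix<T, M, N>` from PATCH 01 above. The dimensions are type parameters, so a
// shape mismatch in `matmul` is a compile-time error rather than a runtime panic.
// Relies only on the bounds PATCH 01 states (`T: Default + AddAssign` and
// `&T: Mul<&T, Output = T>`), which `f64` satisfies; the function name is illustrative.
fn matmul_shape_check() {
    let a = Matrix::new([[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]]); // Matrix<f64, 2, 3>
    let b = Matrix::new([[1.0_f64, 0.0], [0.0, 1.0], [1.0, 1.0]]); // Matrix<f64, 3, 2>
    let c = a.matmul(&b); // (2x3)*(3x2): inner dimensions must agree, result is 2x2
    assert_eq!(c, Matrix::new([[4.0, 5.0], [10.0, 11.0]]));
    // `a.matmul(&a)` would not type-check: a 2x3 matrix cannot be multiplied by a 2x3 matrix.
}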
#![feature(core_intrinsics)] +#![feature(array_windows)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 75fcae6..93657e3 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -6,6 +6,8 @@ pub(crate) mod constmatrix { pub struct Matrix { data: [[T; N]; M], } + pub type RowVector = Matrix; + pub type ColVector = Matrix; impl Default for Matrix { fn default() -> Self { @@ -89,11 +91,61 @@ pub(crate) mod constmatrix { } } } + pub fn iter(&self) -> impl ExactSizeIterator + DoubleEndedIterator { + (0..N * M).map(move |x| { + let i = x / N; + let j = x % N; + &self[(i, j)] + }) + } + pub fn iter_mut(&mut self) -> impl Iterator { + self.data.iter_mut().flatten() + } pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { (0..M).map(move |i| &self[i]) } + + pub fn flip(&self) -> Self + where + T: Default + Clone, + { + let mut v = Self::default(); + for i in 0..M { + for j in 0..N { + v[(i, j)] = self[(N - 1 - i, M - 1 - j)].clone() + } + } + v + } + } + + impl ColVector { + pub fn map_to_col(slice: &[T; N]) -> &ColVector { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } + } + + impl RowVector { + pub fn map_to_row(slice: &[T; N]) -> &Self { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } + } + + impl core::ops::MulAssign<&T> for Matrix + where + for<'f> T: core::ops::MulAssign<&'f T>, + { + fn mul_assign(&mut self, other: &T) { + self.iter_mut().for_each(|x| *x *= other) + } } #[cfg(test)] @@ -122,6 +174,137 @@ pub(crate) mod constmatrix { } } +pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; + +#[inline(always)] +pub(crate) fn diff_op_1d_matrix( + block: &Matrix, + diag: &RowVector, + symmetry: Symmetry, + optype: OperatorType, + prev: ArrayView1, + mut fut: ArrayViewMut1, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[0]; + assert!(nx >= 2 * M); + assert!(nx >= N); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + for (bl, f) in block.iter_rows().zip(&mut fut) { + let diff = bl + .iter() + .zip(prev.iter()) + .map(|(x, y)| x * y) + .sum::(); + *f = diff * idx; + } + + // The window needs to be aligned to the diagonal elements, + // based on the block size + let window_elems_to_skip = M - ((D - 1) / 2); + + for (window, f) in prev + .windows(D) + .into_iter() + .skip(window_elems_to_skip) + .zip(fut.iter_mut().skip(M)) + .take(nx - 2 * M) + { + let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); + *f = diff * idx; + } + + for (bl, f) in block.iter_rows().zip(fut.iter_mut().rev()) { + let diff = bl + .iter() + .zip(prev.iter().rev()) + .map(|(x, y)| x * y) + .sum::(); + + *f = idx + * if symmetry == Symmetry::Symmetric { + diff + } else { + -diff + }; + } +} + +#[inline(always)] +pub(crate) fn diff_op_1d_slice_matrix( + block: &Matrix, + diag: &RowVector, + symmetry: Symmetry, + optype: OperatorType, + prev: &[Float], + fut: &mut [Float], +) { + assert_eq!(prev.len(), fut.len()); + let nx = prev.len(); + assert!(nx >= 2 * M); + assert!(nx >= N); + let prev = &prev[..nx]; + let fut = &mut fut[..nx]; + + let dx = if optype == 
OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + use std::convert::TryInto; + { + let prev = ColVector::<_, N>::map_to_col((&prev[0..N]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); + + block.matmul_into(prev, fut); + *fut *= &idx; + } + + // The window needs to be aligned to the diagonal elements, + // based on the block size + let window_elems_to_skip = M - ((D - 1) / 2); + + for (window, f) in prev + .array_windows::() + .skip(window_elems_to_skip) + .zip(fut.iter_mut().skip(M)) + .take(nx - 2 * M) + { + let fut = ColVector::::map_to_col_mut(unsafe { + std::mem::transmute::<&mut Float, &mut [Float; 1]>(f) + }); + let prev = ColVector::<_, D>::map_to_col(window); + + diag.matmul_into(prev, fut); + *fut *= &idx; + } + + let flipped = { + let mut flipped = block.flip(); + if symmetry != Symmetry::Symmetric { + flipped *= &-1.0; + } + flipped + }; + + { + let prev = ColVector::<_, N>::map_to_col((&prev[nx - N..]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); + + flipped.matmul_into(prev, fut); + *fut *= &idx; + } +} + #[inline(always)] pub(crate) fn diff_op_1d( block: &[&[Float]], From 30c563c19d95cc56b096f7deb2804fb15d70ea3f Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 21:27:25 +0100 Subject: [PATCH 03/35] working d1 SBP4 --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 12 ++++++------ sbp/src/operators/traditional4.rs | 30 +++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 5b3eaa2..1d09854 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,5 +1,6 @@ #![feature(core_intrinsics)] #![feature(array_windows)] +#![feature(array_chunks)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 93657e3..3fd83bf 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -114,7 +114,7 @@ pub(crate) mod constmatrix { let mut v = Self::default(); for i in 0..M { for j in 0..N { - v[(i, j)] = self[(N - 1 - i, M - 1 - j)].clone() + v[(i, j)] = self[(M - 1 - i, N - 1 - j)].clone() } } v @@ -276,12 +276,11 @@ pub(crate) fn diff_op_1d_slice_matrix() .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(M)) + .zip(fut.array_chunks_mut::<1>().skip(M)) .take(nx - 2 * M) { - let fut = ColVector::::map_to_col_mut(unsafe { - std::mem::transmute::<&mut Float, &mut [Float; 1]>(f) - }); + // impl From here? 
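// `f` is now the `&mut [Float; 1]` yielded by `array_chunks_mut::<1>()`, so the
// `&mut Float -> &mut [Float; 1]` transmute removed above is no longer needed:
// `map_to_col_mut` reinterprets the one-element chunk as a 1x1 column vector, and the
// diagonal stencil goes through the same `matmul_into` path as the block rows.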
+ let fut = ColVector::::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); diag.matmul_into(prev, fut); @@ -297,7 +296,8 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col((&prev[nx - N..]).try_into().unwrap()); + let prev = prev.array_windows::().last().unwrap(); + let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); flipped.matmul_into(prev, fut); diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 649c43c..c29944d 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -10,10 +10,14 @@ impl SBP4 { const HBLOCK: &'static [Float] = &[ 17.0 / 48.0, 59.0 / 48.0, 43.0 / 48.0, 49.0 / 48.0, ]; + #[rustfmt::skip] const DIAG: &'static [Float] = &[ 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0, ]; + + const DIAG_MATRIX: super::RowVector = + super::RowVector::new([[1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0]]); #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0], @@ -21,6 +25,13 @@ impl SBP4 { &[4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], &[3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]; + #[rustfmt::skip] + const BLOCK_MATRIX: super::Matrix = super::Matrix::new([ + [-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0, 0.0, 0.0], + [-1.0/2.0, 0.0, 1.0/2.0, 0.0, 0.0, 0.0], + [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], + [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] + ]); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -71,6 +82,22 @@ impl SbpOperator1d for SBP4 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &SBP4::BLOCK_MATRIX, + &SBP4::DIAG_MATRIX, + super::Symmetry::AntiSymmetric, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for SBP4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -81,7 +108,8 @@ impl SbpOperator2d for SBP4 { match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { - diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + diff_op_row_local(prev, fut) } ([1, _], [1, _]) => { diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); From 36293e75e61ee2834fd009555974afc103844f82 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 21:59:13 +0100 Subject: [PATCH 04/35] 10% instr reduction with fast_ intr --- sbp/src/operators/algos.rs | 53 ++++++++++++++++++++++--------- sbp/src/operators/traditional4.rs | 4 ++- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 3fd83bf..26686e0 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -59,9 +59,11 @@ pub(crate) mod constmatrix { pub const fn new(data: [[T; N]; M]) -> Self { Self { data } } + #[inline] pub const fn nrows(&self) -> usize { M } + #[inline] pub const fn ncols(&self) -> usize { N } @@ -82,12 +84,13 @@ pub(crate) mod constmatrix { T: Default + core::ops::AddAssign, for<'f> &'f T: std::ops::Mul, { - *out = Default::default(); for i in 0..M { for j in 0..P { + let mut t = T::default(); for k in 0..N { - out[(i, j)] += &self[(i, k)] * 
&other[(k, j)]; + t += &self[(i, k)] * &other[(k, j)]; } + out[(i, j)] = t; } } } @@ -148,6 +151,34 @@ pub(crate) mod constmatrix { } } + use super::Float; + impl Matrix { + pub fn matmul_fast( + &self, + other: &Matrix, + ) -> Matrix { + let mut out = Matrix::default(); + self.matmul_into_fast(other, &mut out); + out + } + pub fn matmul_into_fast( + &self, + other: &Matrix, + out: &mut Matrix, + ) { + use core::intrinsics::{fadd_fast, fmul_fast}; + for i in 0..M { + for j in 0..P { + let mut t = 0.0; + for k in 0..N { + t = unsafe { fadd_fast(t, fmul_fast(self[(i, k)], other[(k, j)])) } + } + out[(i, j)] = t; + } + } + } + } + #[cfg(test)] mod tests { use super::{super::*, *}; @@ -240,8 +271,8 @@ pub(crate) fn diff_op_1d_matrix( #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, + endblock: &Matrix, diag: &RowVector, - symmetry: Symmetry, optype: OperatorType, prev: &[Float], fut: &mut [Float], @@ -262,10 +293,10 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col((&prev[0..N]).try_into().unwrap()); + let prev = ColVector::<_, N>::map_to_col(prev.array_windows::().nth(0).unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); - block.matmul_into(prev, fut); + block.matmul_into_fast(prev, fut); *fut *= &idx; } @@ -283,24 +314,16 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into(prev, fut); + diag.matmul_into_fast(prev, fut); *fut *= &idx; } - let flipped = { - let mut flipped = block.flip(); - if symmetry != Symmetry::Symmetric { - flipped *= &-1.0; - } - flipped - }; - { let prev = prev.array_windows::().last().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); - flipped.matmul_into(prev, fut); + endblock.matmul_into_fast(prev, fut); *fut *= &idx; } } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index c29944d..41f3c03 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -83,14 +83,16 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); + flipmatrix *= &-1.0; for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) { super::diff_op_1d_slice_matrix( &SBP4::BLOCK_MATRIX, + &flipmatrix, &SBP4::DIAG_MATRIX, - super::Symmetry::AntiSymmetric, super::OperatorType::Normal, p.as_slice().unwrap(), f.as_slice_mut().unwrap(), From bcda26a51297e437eab0cf0fa998407fe6583384 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 22:31:54 +0100 Subject: [PATCH 05/35] 15% reductions in instr count --- sbp/src/operators/algos.rs | 129 +++++++++++++++++++++++++++---------- 1 file changed, 96 insertions(+), 33 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 26686e0..9717a98 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -151,34 +151,6 @@ pub(crate) mod constmatrix { } } - use super::Float; - impl Matrix { - pub fn matmul_fast( - &self, - other: &Matrix, - ) -> Matrix { - let mut out = Matrix::default(); - self.matmul_into_fast(other, &mut out); - out - } - pub fn matmul_into_fast( - &self, - other: &Matrix, - out: &mut Matrix, - ) { - use core::intrinsics::{fadd_fast, fmul_fast}; - for i in 0..M { - for j in 0..P { - let mut t = 0.0; - for k in 0..N { - t = unsafe { 
fadd_fast(t, fmul_fast(self[(i, k)], other[(k, j)])) } - } - out[(i, j)] = t; - } - } - } - } - #[cfg(test)] mod tests { use super::{super::*, *}; @@ -268,6 +240,90 @@ pub(crate) fn diff_op_1d_matrix( } } +mod fastfloat { + use super::*; + #[repr(transparent)] + #[derive(Debug, PartialEq, Default)] + pub(crate) struct FastFloat(Float); + + use core::intrinsics::{fadd_fast, fmul_fast}; + + impl core::ops::Mul for FastFloat { + type Output = Self; + fn mul(self, o: Self) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o.0)) } + } + } + + impl core::ops::MulAssign for FastFloat { + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::MulAssign<&FastFloat> for FastFloat { + fn mul_assign(&mut self, o: &FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::MulAssign for &mut FastFloat { + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + impl core::ops::MulAssign<&FastFloat> for &mut FastFloat { + fn mul_assign(&mut self, o: &FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } + } + + impl core::ops::AddAssign for FastFloat { + fn add_assign(&mut self, o: Self) { + unsafe { + self.0 = fadd_fast(self.0, o.0); + } + } + } + + impl core::ops::Mul for &FastFloat { + type Output = FastFloat; + fn mul(self, o: Self) -> Self::Output { + unsafe { FastFloat(fmul_fast(self.0, o.0)) } + } + } + + impl core::ops::MulAssign<&Float> for FastFloat { + fn mul_assign(&mut self, o: &Float) { + unsafe { + self.0 = fmul_fast(self.0, *o); + } + } + } + impl core::ops::MulAssign<&Float> for &mut FastFloat { + fn mul_assign(&mut self, o: &Float) { + unsafe { + self.0 = fmul_fast(self.0, *o); + } + } + } + + impl From for FastFloat { + fn from(f: Float) -> Self { + Self(f) + } + } +} +use fastfloat::FastFloat; + #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, @@ -277,6 +333,13 @@ pub(crate) fn diff_op_1d_slice_matrix>(block) }; + let endblock = unsafe { transmute::<_, &Matrix>(endblock) }; + let diag = unsafe { transmute::<_, &RowVector>(diag) }; + let prev = unsafe { transmute::<_, &[FastFloat]>(prev) }; + let fut = unsafe { transmute::<_, &mut [FastFloat]>(fut) }; + assert_eq!(prev.len(), fut.len()); let nx = prev.len(); assert!(nx >= 2 * M); @@ -289,14 +352,14 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().nth(0).unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); - block.matmul_into_fast(prev, fut); + block.matmul_into(prev, fut); *fut *= &idx; } @@ -311,10 +374,10 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); + let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into_fast(prev, fut); + diag.matmul_into(prev, fut); *fut *= &idx; } @@ -323,7 +386,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); - endblock.matmul_into_fast(prev, fut); + endblock.matmul_into(prev, fut); *fut *= &idx; } } From db552af4ff85a43bc938713fb68c98342bd71f5e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Thu, 28 Jan 2021 23:20:52 +0100 Subject: [PATCH 06/35] add blockend with weird caveat --- sbp/src/operators/traditional4.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 41f3c03..4e5abad 100644 
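// Illustrative sketch (not part of the patches): the `&[Float] -> &[FastFloat]` and
// matrix transmutes introduced in PATCH 05 above are layout-sound only because
// `FastFloat` is `#[repr(transparent)]` over `Float`. A hypothetical helper such as
// the one below (name and placement are illustrative) could localise that argument
// instead of repeating raw `transmute`s at each call site:
impl FastFloat {
    /// Reinterpret a `Float` slice; size and alignment match by `repr(transparent)`.
    pub(crate) fn slice_from_floats(s: &[Float]) -> &[FastFloat] {
        // SAFETY: FastFloat is a transparent newtype over Float.
        unsafe { core::slice::from_raw_parts(s.as_ptr().cast(), s.len()) }
    }
}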
--- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -32,6 +32,13 @@ impl SBP4 { [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: super::Matrix = super::Matrix::new([ + [4.0/49.0, -32.0/49.0, 0.0, 59.0/98.0, 0.0, -3.0/98.0 ], + [0.0, 4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], + [0.0, 0.0, 0.0, -1.0/2.0, 0.0, 1.0/2.0], + [0.0, 0.0, 3.0/34.0, 4.0/17.0, -59.0/34.0, 24.0/17.0], + ]); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -83,15 +90,18 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + // Magic two lines that prevents or enables optimisation + // (doubles instructions when not included) let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); flipmatrix *= &-1.0; + for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) { super::diff_op_1d_slice_matrix( &SBP4::BLOCK_MATRIX, - &flipmatrix, + &SBP4::BLOCKEND_MATRIX, &SBP4::DIAG_MATRIX, super::OperatorType::Normal, p.as_slice().unwrap(), From 4c2daf59334f3a44311f84e26a8e48a5c9302a07 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 00:08:31 +0100 Subject: [PATCH 07/35] minor thingys --- sbp/src/operators/algos.rs | 14 ++++++-------- sbp/src/operators/traditional4.rs | 13 ++++++++++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 9717a98..917d9e0 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -3,6 +3,7 @@ use super::*; pub(crate) mod constmatrix { /// A row-major matrix #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + #[repr(transparent)] pub struct Matrix { data: [[T; N]; M], } @@ -94,12 +95,8 @@ pub(crate) mod constmatrix { } } } - pub fn iter(&self) -> impl ExactSizeIterator + DoubleEndedIterator { - (0..N * M).map(move |x| { - let i = x / N; - let j = x % N; - &self[(i, j)] - }) + pub fn iter(&self) -> impl Iterator { + self.data.iter().flatten() } pub fn iter_mut(&mut self) -> impl Iterator { self.data.iter_mut().flatten() @@ -352,7 +349,8 @@ pub(crate) fn diff_op_1d_slice_matrix().last().unwrap(); + let prev = prev.array_windows::().next_back().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 4e5abad..352a92b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -92,7 +92,7 @@ impl SbpOperator1d for SBP4 { fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { // Magic two lines that prevents or enables optimisation // (doubles instructions when not included) - let mut flipmatrix = SBP4::BLOCK_MATRIX.flip(); + let mut flipmatrix = SBP4::BLOCK_MATRIX; flipmatrix *= &-1.0; for (p, mut f) in prev @@ -228,3 +228,14 @@ fn test_trad4() { 1e-1, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); + flipped_inverted *= &-1.0; + + assert!(flipped_inverted + .iter() + .zip(SBP4::BLOCKEND_MATRIX.iter()) + .all(|(x, y)| (x - y).abs() < 1e-3)) +} From 3d34f2e7a0feec6d985ef3a7ceeaa9735380cf77 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 00:14:56 +0100 Subject: [PATCH 08/35] add fast-float feature --- sbp/Cargo.toml | 1 + sbp/src/operators/algos.rs | 21 
+++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index a1ba176..2dd2b22 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -16,6 +16,7 @@ serde = { version = "1.0.115", optional = true, default-features = false, featur [features] # Use f32 as precision, default is f64 f32 = [] +fast-float = [] sparse = ["sprs"] serde1 = ["serde", "ndarray/serde"] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 917d9e0..88c92e7 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -319,6 +319,7 @@ mod fastfloat { } } } +#[cfg(feature = "fast-float")] use fastfloat::FastFloat; #[inline(always)] @@ -330,12 +331,19 @@ pub(crate) fn diff_op_1d_slice_matrix>(block) }; - let endblock = unsafe { transmute::<_, &Matrix>(endblock) }; - let diag = unsafe { transmute::<_, &RowVector>(diag) }; - let prev = unsafe { transmute::<_, &[FastFloat]>(prev) }; - let fut = unsafe { transmute::<_, &mut [FastFloat]>(fut) }; + #[cfg(feature = "fast-float")] + let (block, endblock, diag, prev, fut) = { + use std::mem::transmute; + unsafe { + ( + transmute::<_, &Matrix>(block), + transmute::<_, &Matrix>(endblock), + transmute::<_, &RowVector>(diag), + transmute::<_, &[FastFloat]>(prev), + transmute::<_, &mut [FastFloat]>(fut), + ) + } + }; assert_eq!(prev.len(), fut.len()); let nx = prev.len(); @@ -350,6 +358,7 @@ pub(crate) fn diff_op_1d_slice_matrix Date: Fri, 29 Jan 2021 08:28:55 +0100 Subject: [PATCH 09/35] add Matrix approach for SBP8 --- sbp/src/operators/traditional8.rs | 39 +++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 6947022..14d0d18 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -1,4 +1,4 @@ -use super::{diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d}; +use super::{diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -15,6 +15,10 @@ impl SBP8 { 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 ]; #[rustfmt::skip] + const DIAG_MATRIX: RowVector = Matrix::new([[ + 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 + ]]); + #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02], &[-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03], @@ -25,6 +29,17 @@ impl SBP8 { &[2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03], &[5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, 
-1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]; + #[rustfmt::skip] + const BLOCK_MATRIX: Matrix = Matrix::new([ + [-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02, 0.0, 0.0, 0.0, 0.0], + [-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03, 0.0, 0.0, 0.0, 0.0], + [3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02, 0.0, 0.0, 0.0, 0.0], + [1.28088632226564e-01, -4.19172130036008e-01, -5.57707021445779e-02, 0.00000000000000e+00, 1.24714160903055e-01, 2.81285212519100e-01, -3.94470423942641e-02, -1.96981310738430e-02, 0.0, 0.0, 0.0, 0.0], + [-1.82119472519009e-02, 9.09986646154550e-02, -8.51090570277506e-02, -5.43362886365301e-01, 0.00000000000000e+00, 6.37392455438558e-01, -1.02950081118829e-01, 2.98964956216039e-02, -8.65364391190110e-03, 0.0, 0.0, 0.0], + [-7.93147196245203e-02, 2.22875323171502e-01, -2.07999824391436e-02, -3.95611167748401e-01, -2.05756876210586e-01, 0.00000000000000e+00, 5.45876519966127e-01, -9.42727926638298e-02, 2.97971812952850e-02, -2.79348574643297e-03, 0.0, 0.0], + [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], + [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], + ]); } impl SbpOperator1d for SBP8 { @@ -59,6 +74,25 @@ impl SbpOperator1d for SBP8 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + let mut flipmatrix = SBP8::BLOCK_MATRIX; + flipmatrix *= &-1.0; + + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &SBP8::BLOCK_MATRIX, + &flipmatrix, + &SBP8::DIAG_MATRIX, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for SBP8 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -69,7 +103,8 @@ impl SbpOperator2d for SBP8 { match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { - diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + diff_op_row_local(prev, fut); } ([1, _], [1, _]) => { diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); From a7660281c83af9b4d1ad6db0df097369f238a95a Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 08:32:26 +0100 Subject: [PATCH 10/35] add inline to remove magic fix --- sbp/src/operators/algos.rs | 10 ++++++++-- sbp/src/operators/traditional4.rs | 5 ----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 88c92e7..115b322 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -60,11 +60,11 @@ pub(crate) mod constmatrix { pub const fn new(data: [[T; N]; 
M]) -> Self { Self { data } } - #[inline] + #[inline(always)] pub const fn nrows(&self) -> usize { M } - #[inline] + #[inline(always)] pub const fn ncols(&self) -> usize { N } @@ -95,12 +95,15 @@ pub(crate) mod constmatrix { } } } + #[inline(always)] pub fn iter(&self) -> impl Iterator { self.data.iter().flatten() } + #[inline(always)] pub fn iter_mut(&mut self) -> impl Iterator { self.data.iter_mut().flatten() } + #[inline(always)] pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { @@ -122,9 +125,11 @@ pub(crate) mod constmatrix { } impl ColVector { + #[inline(always)] pub fn map_to_col(slice: &[T; N]) -> &ColVector { unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } } + #[inline(always)] pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } } @@ -143,6 +148,7 @@ pub(crate) mod constmatrix { where for<'f> T: core::ops::MulAssign<&'f T>, { + #[inline(always)] fn mul_assign(&mut self, other: &T) { self.iter_mut().for_each(|x| *x *= other) } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 352a92b..37519e2 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -90,11 +90,6 @@ impl SbpOperator1d for SBP4 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - // Magic two lines that prevents or enables optimisation - // (doubles instructions when not included) - let mut flipmatrix = SBP4::BLOCK_MATRIX; - flipmatrix *= &-1.0; - for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) .zip(fut.axis_iter_mut(ndarray::Axis(0))) From f7c238f6a7d0bb5489687b2e08cd9f81a7dcf1ae Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 08:51:26 +0100 Subject: [PATCH 11/35] add blockend SBP8 --- sbp/src/operators/traditional8.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 14d0d18..4b431cd 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -40,6 +40,17 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: Matrix = Matrix::new([ + [0.0035384016230155199, -0.037742950645498902, 0.19815049088886899, -0.79260196355547696, -0.0, 0.75166824372712304, -0.11941174319355199, 0.012224423573111201, -0.035091936128702303, 0.0142810403117803, 0.010833666120933699, -0.0058476727216045096], + [-0.0, 0.0038681313433548601, -0.041260067662451799, 0.21661535522787201, -0.821713248844682, -0.0, 0.75587358166357999, -0.046018121340651898, -0.076822925360096897, -0.050113507756358401, 0.087129564256063705, -0.027558761526617698], + [-0.0, -0.0, 0.0027934857464329702, -0.029797181295285, 0.094272792663829805, -0.54587651996612696, -0.0, 0.205756876210586, 0.39561116774840099, 0.020799982439143602, -0.22287532317150199, 0.079314719624520302], + [-0.0, -0.0, -0.0, 0.0086536439119011006, -0.029896495621603899, 0.102950081118829, 
-0.63739245543855805, -0.0, 0.54336288636530095, 0.085109057027750595, -0.090998664615454999, 0.01821194725190089], + [-0.0, -0.0, -0.0, -0.0, 0.019698131073842998, 0.039447042394264102, -0.28128521251909999, -0.124714160903055, -0.0, 0.055770702144577897, 0.41917213003600801, -0.128088632226564], + [-0.0, -0.0, -0.0, -0.0, -0.055988255885229599, 0.179720579323281, -0.10329058280084499, -0.13643348652854601, -0.389516189693211, -0.0, 0.54432974445498405, -0.038821808870425301], + [-0.0, -0.0, -0.0, -0.0, -0.0071669648308011497, -0.052726783847581303, 0.18675940343293501, 0.024615176293723499, -0.49400862680798402, -0.091851192507295606, -0.0, 0.43437898826698501], + [-0.0, -0.0, -0.0, -0.0, 0.020015058331576099, 0.086285816263333495, -0.34386522738887298, -0.0254881486107905, 0.78102816812674902, 0.033893192260150001, -2.2474124634140402, 1.6955436044319001], + ]); } impl SbpOperator1d for SBP8 { @@ -75,8 +86,10 @@ impl SbpOperator1d for SBP8 { } fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + /* let mut flipmatrix = SBP8::BLOCK_MATRIX; flipmatrix *= &-1.0; + */ for (p, mut f) in prev .axis_iter(ndarray::Axis(0)) @@ -84,7 +97,8 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi { super::diff_op_1d_slice_matrix( &SBP8::BLOCK_MATRIX, - &flipmatrix, + &SBP8::BLOCKEND_MATRIX, + //&flipmatrix, &SBP8::DIAG_MATRIX, super::OperatorType::Normal, p.as_slice().unwrap(), @@ -189,3 +203,14 @@ fn test_trad8() { 1e-4, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); + flipped_inverted *= &-1.0; + + assert!(flipped_inverted + .iter() + .zip(SBP8::BLOCKEND_MATRIX.iter()) + .all(|(x, y)| (x - y).abs() < 1e-12)) +} From 3c7cc4605a13652ed3330c9608e9d374aa1bd7d0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 16:42:49 +0100 Subject: [PATCH 12/35] simplify traits --- sbp/Cargo.toml | 1 - sbp/src/operators/algos.rs | 82 +++++++++---------------------- sbp/src/operators/traditional4.rs | 2 +- sbp/src/operators/traditional8.rs | 2 +- 4 files changed, 26 insertions(+), 61 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index 2dd2b22..9b7834f 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -10,7 +10,6 @@ approx = "0.3.2" packed_simd = { version = "0.3.3", package = "packed_simd_2" } rayon = { version = "1.3.0", optional = true } sprs = { version = "0.9.0", optional = true, default-features = false } -num-traits = "0.2.11" serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } [features] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 115b322..bd8c4cb 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,6 +1,7 @@ use super::*; pub(crate) mod constmatrix { + #![allow(unused)] /// A row-major matrix #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] #[repr(transparent)] @@ -70,8 +71,7 @@ pub(crate) mod constmatrix { } pub fn matmul(&self, other: &Matrix) -> Matrix where - T: Default + core::ops::AddAssign, - for<'f> &'f T: std::ops::Mul, + T: Copy + Default + core::ops::Add + core::ops::Mul, { let mut out = Matrix::default(); self.matmul_into(other, &mut out); @@ -82,14 +82,13 @@ pub(crate) mod constmatrix { other: &Matrix, out: &mut Matrix, ) where - T: Default + core::ops::AddAssign, - for<'f> &'f T: std::ops::Mul, + T: Copy + Default + core::ops::Add + core::ops::Mul, { for i in 0..M { for j in 0..P { let mut t = T::default(); for k in 0..N { - t += &self[(i, k)] * &other[(k, j)]; 
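// The accumulator switches from `AddAssign` over references to by-value
// `Copy + Add + Mul`; `FastFloat` only needs the by-value `Add`/`Mul` impls for this,
// so the reference-based and `AddAssign` impls are dropped from the fastfloat module
// in this same patch.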
+ t = t + self[(i, k)] * other[(k, j)]; } out[(i, j)] = t; } @@ -144,12 +143,12 @@ pub(crate) mod constmatrix { } } - impl core::ops::MulAssign<&T> for Matrix + impl core::ops::MulAssign for Matrix where - for<'f> T: core::ops::MulAssign<&'f T>, + T: Copy + core::ops::MulAssign, { #[inline(always)] - fn mul_assign(&mut self, other: &T) { + fn mul_assign(&mut self, other: T) { self.iter_mut().for_each(|x| *x *= other) } } @@ -243,22 +242,33 @@ pub(crate) fn diff_op_1d_matrix( } } +#[cfg(feature = "fast-float")] mod fastfloat { use super::*; #[repr(transparent)] - #[derive(Debug, PartialEq, Default)] + #[derive(Copy, Clone, Debug, PartialEq, Default)] pub(crate) struct FastFloat(Float); use core::intrinsics::{fadd_fast, fmul_fast}; impl core::ops::Mul for FastFloat { type Output = Self; + #[inline(always)] fn mul(self, o: Self) -> Self::Output { unsafe { Self(fmul_fast(self.0, o.0)) } } } + impl core::ops::Add for FastFloat { + type Output = Self; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o.0)) } + } + } + impl core::ops::MulAssign for FastFloat { + #[inline(always)] fn mul_assign(&mut self, o: FastFloat) { unsafe { self.0 = fmul_fast(self.0, o.0); @@ -266,60 +276,16 @@ mod fastfloat { } } - impl core::ops::MulAssign<&FastFloat> for FastFloat { - fn mul_assign(&mut self, o: &FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::MulAssign for &mut FastFloat { - fn mul_assign(&mut self, o: FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - impl core::ops::MulAssign<&FastFloat> for &mut FastFloat { - fn mul_assign(&mut self, o: &FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::AddAssign for FastFloat { - fn add_assign(&mut self, o: Self) { - unsafe { - self.0 = fadd_fast(self.0, o.0); - } - } - } - impl core::ops::Mul for &FastFloat { type Output = FastFloat; + #[inline(always)] fn mul(self, o: Self) -> Self::Output { unsafe { FastFloat(fmul_fast(self.0, o.0)) } } } - impl core::ops::MulAssign<&Float> for FastFloat { - fn mul_assign(&mut self, o: &Float) { - unsafe { - self.0 = fmul_fast(self.0, *o); - } - } - } - impl core::ops::MulAssign<&Float> for &mut FastFloat { - fn mul_assign(&mut self, o: &Float) { - unsafe { - self.0 = fmul_fast(self.0, *o); - } - } - } - impl From for FastFloat { + #[inline(always)] fn from(f: Float) -> Self { Self(f) } @@ -373,7 +339,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); block.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } // The window needs to be aligned to the diagonal elements, @@ -391,7 +357,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(window); diag.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } { @@ -400,7 +366,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); endblock.matmul_into(prev, fut); - *fut *= &idx; + *fut *= idx; } } diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 37519e2..af84f0b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -227,7 +227,7 @@ fn test_trad4() { #[test] fn block_equality() { let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); - flipped_inverted *= &-1.0; + flipped_inverted *= -1.0; assert!(flipped_inverted .iter() diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 4b431cd..76cd2e6 100644 --- 
a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -207,7 +207,7 @@ fn test_trad8() { #[test] fn block_equality() { let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); - flipped_inverted *= &-1.0; + flipped_inverted *= -1.0; assert!(flipped_inverted .iter() From c1335574593bf2d1a19bbed4a5c68f46486f417c Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:36:05 +0100 Subject: [PATCH 13/35] use Matrix in SBP diff --- sbp/src/operators/algos.rs | 207 ++++++++++++++++++++++++++---- sbp/src/operators/traditional4.rs | 25 +++- sbp/src/operators/traditional8.rs | 24 +++- 3 files changed, 215 insertions(+), 41 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index bd8c4cb..2b60090 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -3,7 +3,7 @@ use super::*; pub(crate) mod constmatrix { #![allow(unused)] /// A row-major matrix - #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] + #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(transparent)] pub struct Matrix { data: [[T; N]; M], @@ -11,19 +11,11 @@ pub(crate) mod constmatrix { pub type RowVector = Matrix; pub type ColVector = Matrix; - impl Default for Matrix { + impl Default for Matrix { fn default() -> Self { - use std::mem::MaybeUninit; - let mut d: [[MaybeUninit; N]; M] = unsafe { MaybeUninit::uninit().assume_init() }; - - for row in d.iter_mut() { - for item in row.iter_mut() { - *item = MaybeUninit::new(T::default()); - } + Self { + data: [[T::default(); N]; M], } - - let data = unsafe { std::mem::transmute_copy::<_, [[T; N]; M]>(&d) }; - Self { data } } } @@ -111,12 +103,12 @@ pub(crate) mod constmatrix { pub fn flip(&self) -> Self where - T: Default + Clone, + T: Default + Copy, { let mut v = Self::default(); for i in 0..M { for j in 0..N { - v[(i, j)] = self[(M - 1 - i, N - 1 - j)].clone() + v[(i, j)] = self[(M - 1 - i, N - 1 - j)] } } v @@ -163,10 +155,6 @@ pub(crate) mod constmatrix { let _m2 = Matrix::new([[1, 2], [3, 4]]); } - #[test] - fn construct_non_copy() { - let _m = Matrix::::default(); - } #[test] fn matmul() { @@ -184,8 +172,8 @@ pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; #[inline(always)] pub(crate) fn diff_op_1d_matrix( block: &Matrix, - diag: &RowVector, - symmetry: Symmetry, + blockend: &Matrix, + diag: &RowVector, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -226,19 +214,14 @@ pub(crate) fn diff_op_1d_matrix( *f = diff * idx; } - for (bl, f) in block.iter_rows().zip(fut.iter_mut().rev()) { + for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { let diff = bl .iter() - .zip(prev.iter().rev()) + .zip(prev.iter()) .map(|(x, y)| x * y) .sum::(); - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; + *f = diff * idx; } } @@ -708,6 +691,174 @@ fn product_fast<'a>( }) } +#[inline(always)] +pub(crate) fn diff_op_col_matrix( + block: &Matrix, + block2: &Matrix, + diag: &RowVector, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + #[cfg(not(feature = "f32"))] + type SimdT = packed_simd::f64x8; + #[cfg(feature = "f32")] + type SimdT = packed_simd::f32x16; + + let ny = prev.shape()[0]; + // How many elements 
that can be simdified + let simdified = SimdT::lanes() * (ny / SimdT::lanes()); + + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let fut_base_ptr = fut.as_mut_ptr(); + let fut_stride = fut.strides()[1]; + let fut_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } + }; + + let prev_base_ptr = prev.as_ptr(); + let prev_stride = prev.strides()[1]; + let prev_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } + }; + + // Not algo necessary, but gives performance increase + assert_eq!(fut_stride, prev_stride); + + // First block + { + for (ifut, &bl) in block.iter_rows().enumerate() { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in bl.iter().enumerate() { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } + + // Diagonal elements + { + for ifut in M..nx - M { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (id, &d) in diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + } + f *= idx; + unsafe { + // puts simd along stride 1, j never goes past end of slice + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + let mut f = 0.0; + for (id, &d) in diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + unsafe { + f += d * *prev_ptr(j, offset); + } + } + unsafe { + *fut_ptr(j, ifut) = idx * f; + } + } + } + } + + // End block + { + for (ifut, &bl) in (nx - M..nx).zip(block2.iter_rows()) { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in (nx - N..nx).zip(bl.iter()) { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in (nx - N..nx).zip(bl.iter()) { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_row( block: &'static [&'static [Float]], diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index af84f0b..57afd35 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -1,4 +1,4 @@ 
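// Illustrative sketch (not part of the patches): the core pattern inside
// `diff_op_col_matrix` above, in isolation. One stencil coefficient row is applied
// across eight adjacent columns per iteration using packed_simd (the crate is
// `packed_simd_2`, renamed to `packed_simd` in sbp/Cargo.toml). The function and
// parameter names here are illustrative only, and `f64x8` assumes the default
// `Float = f64` configuration.
fn stencil_row_simd(coeffs: &[f64], rows: &[&[f64]], j: usize, out: &mut [f64]) {
    use packed_simd::f64x8;
    let mut acc = f64x8::splat(0.0);
    for (&c, row) in coeffs.iter().zip(rows) {
        // unaligned load of 8 consecutive column values from this input row
        let v = f64x8::from_slice_unaligned(&row[j..j + 8]);
        // fused multiply-add per lane: acc += c * v
        acc = v.mul_adde(f64x8::splat(c), acc);
    }
    acc.write_to_slice_unaligned(&mut out[j..j + 8]);
}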
-use super::{diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d}; +use super::{SbpOperator1d, SbpOperator2d}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -55,10 +55,10 @@ impl SBP4 { impl SbpOperator1d for SBP4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -104,6 +104,17 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi ) } } +fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { + let optype = super::OperatorType::Normal; + super::diff_op_col_matrix( + &SBP4::BLOCK_MATRIX, + &SBP4::BLOCKEND_MATRIX, + &SBP4::DIAG_MATRIX, + optype, + prev, + fut, + ) +} impl SbpOperator2d for SBP4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { @@ -112,14 +123,14 @@ impl SbpOperator2d for SBP4 { let symmetry = super::Symmetry::AntiSymmetric; let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); diff_op_row_local(prev, fut) } ([1, _], [1, _]) => { - diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + //diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); + diff_op_col_local(prev, fut) } ([_, _], [_, _]) => { // Fallback, work row by row diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 76cd2e6..88631b3 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -55,10 +55,10 @@ impl SBP8 { impl SbpOperator1d for SBP8 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -107,6 +107,18 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi } } +fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { + let optype = super::OperatorType::Normal; + super::diff_op_col_matrix( + &SBP8::BLOCK_MATRIX, + &SBP8::BLOCKEND_MATRIX, + &SBP8::DIAG_MATRIX, + optype, + prev, + fut, + ) +} + impl SbpOperator2d for SBP8 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); @@ -114,14 +126,14 @@ impl SbpOperator2d for SBP8 { let symmetry = super::Symmetry::AntiSymmetric; let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => { //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); diff_op_row_local(prev, fut); } ([1, _], [1, _]) => { - diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + //diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); + diff_op_col_local(prev, fut) } ([_, _], [_, _]) => { // Fallback, work row by row From db94caf2b2495df8855d2a3333b3a30943253cd0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:47:13 +0100 Subject: [PATCH 14/35] change repr of Matrix --- sbp/src/operators/algos.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 2b60090..659ce42 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -4,7 +4,7 
@@ pub(crate) mod constmatrix { #![allow(unused)] /// A row-major matrix #[derive(Debug, Clone, PartialEq, Eq, Hash)] - #[repr(transparent)] + #[repr(C)] pub struct Matrix { data: [[T; N]; M], } @@ -318,7 +318,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().nth(0).unwrap()); + let prev = ColVector::<_, N>::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); block.matmul_into(prev, fut); @@ -335,7 +335,6 @@ pub(crate) fn diff_op_1d_slice_matrix().skip(M)) .take(nx - 2 * M) { - // impl From here? let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); From 00f3ba6a01eeffd76dd815cbb69e70b7d1df91c8 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Fri, 29 Jan 2021 17:59:35 +0100 Subject: [PATCH 15/35] use split_at_mut --- sbp/src/operators/algos.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 659ce42..ad13240 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -316,10 +316,14 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); - let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[0..M]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); block.matmul_into(prev, fut); *fut *= idx; @@ -332,8 +336,7 @@ pub(crate) fn diff_op_1d_slice_matrix() .skip(window_elems_to_skip) - .zip(fut.array_chunks_mut::<1>().skip(M)) - .take(nx - 2 * M) + .zip(fut.array_chunks_mut::<1>()) { let fut = ColVector::<_, 1>::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); @@ -345,7 +348,7 @@ pub(crate) fn diff_op_1d_slice_matrix().next_back().unwrap(); let prev = ColVector::<_, N>::map_to_col(prev); - let fut = ColVector::<_, M>::map_to_col_mut((&mut fut[nx - M..]).try_into().unwrap()); + let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); endblock.matmul_into(prev, fut); *fut *= idx; From 481f2d607ed230a04fd78e97bc9255b9f9413d74 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sat, 30 Jan 2021 15:58:03 +0100 Subject: [PATCH 16/35] remove a lot of unsafe, lost perf --- sbp/src/operators/algos.rs | 137 ++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index ad13240..1928f2d 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -700,7 +700,7 @@ pub(crate) fn diff_op_col_matrix diag: &RowVector, optype: OperatorType, prev: ArrayView2, - mut fut: ArrayViewMut2, + fut: ArrayViewMut2, ) { assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[1]; @@ -728,12 +728,7 @@ pub(crate) fn diff_op_col_matrix let half_diag_width = (D - 1) / 2; assert!(half_diag_width <= M); - let fut_base_ptr = fut.as_mut_ptr(); let fut_stride = fut.strides()[1]; - let fut_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } - }; let prev_base_ptr = prev.as_ptr(); let prev_stride = prev.strides()[1]; @@ -745,88 +740,92 @@ pub(crate) fn diff_op_col_matrix // Not algo necessary, but gives performance increase assert_eq!(fut_stride, prev_stride); + use ndarray::Axis; + let (mut fut1, fut) = fut.split_at(Axis(1), M); + let (mut fut, mut fut2) = fut.split_at(Axis(1), nx - 2 * M); + // First block { - for (ifut, &bl) in block.iter_rows().enumerate() { - for j in 
(0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let prev = prev.slice(ndarray::s![.., ..N]); + let (prevb, prevl) = prev.split_at(Axis(0), simdified); + for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(block.iter_rows()) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes()); + + for (fut, prev) in fut.by_ref().zip(prev.by_ref()) { let mut f = SimdT::splat(0.0); - for (iprev, &bl) in bl.iter().enumerate() { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(Axis(1))) { + let prev = prev.to_slice().unwrap(); + let prev = SimdT::from_slice_unaligned(prev); + f = prev.mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; + for (fut, prev) in fut + .into_remainder() + .iter_mut() + .zip(prevl.axis_iter(Axis(0))) + { + let mut f = 0.0; + for (bl, prev) in bl.iter().zip(prev.iter()) { + f += bl * prev; } + *fut = f * idx; } } } // Diagonal elements { - for ifut in M..nx - M { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let window_elems_to_skip = M - ((D - 1) / 2); + let prev = prev.slice(ndarray::s![.., window_elems_to_skip..]); + let prev = prev.windows((ny, D)); + for (mut fut, prev) in fut.axis_iter_mut(Axis(1)).zip(prev) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + + let (prevb, prevl) = prev.split_at(Axis(0), simdified); + let prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes()); + + for (fut, prev) in fut.by_ref().zip(prev) { let mut f = SimdT::splat(0.0); - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + for (&d, prev) in diag.iter().zip(prev.axis_iter(Axis(1))) { + let prev = prev.to_slice().unwrap(); + let prev = SimdT::from_slice_unaligned(prev); + f = prev.mul_adde(SimdT::splat(d), f); } f *= idx; - unsafe { - // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { + + for (fut, prev) in fut + .into_remainder() + .into_iter() + .zip(prevl.axis_iter(Axis(0))) + { let mut f = 0.0; - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); - } - } - unsafe { - *fut_ptr(j, ifut) = idx * f; + for (&d, prev) in diag.iter().zip(prev) { + f += d * prev; } + *fut = idx * f; } } } // End block { - for (ifut, &bl) in (nx - M..nx).zip(block2.iter_rows()) { - for j in (0..simdified).step_by(SimdT::lanes()) { + for (mut fut, &bl) in 
fut2.axis_iter_mut(Axis(1)).zip(block2.iter_rows()) { + let fut = fut.as_slice_mut().unwrap(); + let fut = &mut fut[..ny]; + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + + for (fut, j) in fut.by_ref().zip((0..simdified).step_by(SimdT::lanes())) { let index_to_simd = |i| unsafe { // j never moves past end of slice due to step_by and // rounding down @@ -840,21 +839,15 @@ pub(crate) fn diff_op_col_matrix f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { + for (fut, j) in fut.into_remainder().into_iter().zip(simdified..ny) { unsafe { let mut f = 0.0; for (iprev, bl) in (nx - N..nx).zip(bl.iter()) { f += bl * *prev_ptr(j, iprev); } - *fut_ptr(j, ifut) = f * idx; + *fut = f * idx; } } } From b0e1ec62f86b3e19a98cdc066b3288fcd74be8bb Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 13:23:15 +0100 Subject: [PATCH 17/35] change order in matmul_into --- sbp/src/operators/algos.rs | 72 +++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 1928f2d..1681501 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -61,31 +61,6 @@ pub(crate) mod constmatrix { pub const fn ncols(&self) -> usize { N } - pub fn matmul(&self, other: &Matrix) -> Matrix - where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - let mut out = Matrix::default(); - self.matmul_into(other, &mut out); - out - } - pub fn matmul_into( - &self, - other: &Matrix, - out: &mut Matrix, - ) where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - for i in 0..M { - for j in 0..P { - let mut t = T::default(); - for k in 0..N { - t = t + self[(i, k)] * other[(k, j)]; - } - out[(i, j)] = t; - } - } - } #[inline(always)] pub fn iter(&self) -> impl Iterator { self.data.iter().flatten() @@ -135,6 +110,45 @@ pub(crate) mod constmatrix { } } + impl Matrix { + #[inline(always)] + pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) + where + T: Default + Copy + core::ops::Mul + core::ops::Add, + { + for i in 0..M { + for j in 0..P { + let mut t = T::default(); + for k in 0..N { + t = t + lhs[(i, k)] * rhs[(k, j)]; + } + self[(i, j)] = t; + } + } + } + } + + macro_rules! impl_op_mul_mul { + ($lhs:ty, $rhs:ty) => { + impl core::ops::Mul<$rhs> for $lhs + where + T: Copy + Default + core::ops::Add + core::ops::Mul, + { + type Output = Matrix; + fn mul(self, rhs: $rhs) -> Self::Output { + let mut out = Matrix::default(); + out.matmul_into(&self, &rhs); + out + } + } + }; + } + + impl_op_mul_mul! { Matrix, Matrix } + impl_op_mul_mul! { &Matrix, Matrix } + impl_op_mul_mul! { Matrix, &Matrix } + impl_op_mul_mul! 
{ &Matrix, &Matrix } + impl core::ops::MulAssign for Matrix where T: Copy + core::ops::MulAssign, @@ -161,7 +175,7 @@ pub(crate) mod constmatrix { let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); - let m3 = m1.matmul(&m2); + let m3 = m1 * m2; assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); } } @@ -325,7 +339,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); - block.matmul_into(prev, fut); + fut.matmul_into(block, prev); *fut *= idx; } @@ -341,7 +355,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - diag.matmul_into(prev, fut); + fut.matmul_into(diag, prev); *fut *= idx; } @@ -350,7 +364,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); - endblock.matmul_into(prev, fut); + fut.matmul_into(endblock, prev); *fut *= idx; } } From 45e4d515136f09cc0942205644f85924bf547850 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 13:32:53 +0100 Subject: [PATCH 18/35] remove a closure --- sbp/src/operators/algos.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 1681501..6c579e5 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -73,7 +73,7 @@ pub(crate) mod constmatrix { pub fn iter_rows( &self, ) -> impl ExactSizeIterator + DoubleEndedIterator { - (0..M).map(move |i| &self[i]) + self.data.iter() } pub fn flip(&self) -> Self From c660354c3fcd738ec06f575f24116d235f3599fe Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 14:54:15 +0100 Subject: [PATCH 19/35] Matrix for Upwind4 --- sbp/src/operators/algos.rs | 66 +++++++++++++++++++++++++++++++ sbp/src/operators/traditional4.rs | 5 +-- sbp/src/operators/traditional8.rs | 5 +-- sbp/src/operators/upwind4.rs | 60 ++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 19 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 6c579e5..a504b4c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -178,6 +178,71 @@ pub(crate) mod constmatrix { let m3 = m1 * m2; assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); } + #[test] + fn iter() { + let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let mut iter = m.iter(); + assert_eq!(iter.next(), Some(&1)); + assert_eq!(iter.next(), Some(&2)); + assert_eq!(iter.next(), Some(&3)); + assert_eq!(iter.next(), Some(&4)); + assert_eq!(iter.next(), Some(&5)); + assert_eq!(iter.next(), Some(&6)); + assert_eq!(iter.next(), None); + } + } + + mod approx { + use super::Matrix; + use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; + + impl AbsDiffEq for Matrix + where + T: AbsDiffEq, + { + type Epsilon = T::Epsilon; + fn default_epsilon() -> Self::Epsilon { + T::default_epsilon() + } + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) + } + } + impl RelativeEq for Matrix + where + T: RelativeEq, + Self::Epsilon: Copy, + { + fn default_max_relative() -> Self::Epsilon { + T::default_max_relative() + } + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.iter() + .zip(other.iter()) 
+ .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) + } + } + impl UlpsEq for Matrix + where + T: UlpsEq, + Self::Epsilon: Copy, + { + fn default_max_ulps() -> u32 { + T::default_max_ulps() + } + fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) + } + } } } @@ -228,6 +293,7 @@ pub(crate) fn diff_op_1d_matrix( *f = diff * idx; } + let prev = prev.slice(ndarray::s![nx - N..]); for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { let diff = bl .iter() diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 57afd35..db5802f 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -240,8 +240,5 @@ fn block_equality() { let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); flipped_inverted *= -1.0; - assert!(flipped_inverted - .iter() - .zip(SBP4::BLOCKEND_MATRIX.iter()) - .all(|(x, y)| (x - y).abs() < 1e-3)) + approx::assert_ulps_eq!(SBP4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); } diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 88631b3..48891f8 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -221,8 +221,5 @@ fn block_equality() { let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); flipped_inverted *= -1.0; - assert!(flipped_inverted - .iter() - .zip(SBP8::BLOCKEND_MATRIX.iter()) - .all(|(x, y)| (x - y).abs() < 1e-12)) + approx::assert_ulps_eq!(SBP8::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); } diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index 8e577b6..66b9852 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d, UpwindOperator1d, + UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis}; @@ -174,12 +175,30 @@ impl Upwind4 { -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 ]; #[rustfmt::skip] + const DIAG_MATRIX : RowVector = RowVector::new([ + [-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0] + ]); + #[rustfmt::skip] const BLOCK: &'static [&'static [Float]] = &[ &[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], &[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], &[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], &[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]; + #[rustfmt::skip] + const BLOCK_MATRIX: Matrix = Matrix::new([ + [ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], + [-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], + [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], + [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], + ]); + #[rustfmt::skip] + const BLOCKEND_MATRIX: Matrix = Matrix::new([ + [ -6.0 / 149.0, 36.0 / 149.0, -126.0 / 149.0, 0.0, 227.0 / 298.0, -16.0 / 149.0, -3.0 / 298.0], + [ 0.0, -2.0/41.0, 12.0/41.0, -227.0/246.0, 0.0, 69.0/82.0, -20.0/123.0], + [ 0.0, 0.0, -2.0/61.0, 16.0/183.0, -69.0/122.0, 0.0, 187.0/366.0], + [ 0.0, 0.0, 0.0, 3.0 / 98.0, 20.0 / 49.0, -187.0 
/ 98.0, 72.0/49.0], + ]); #[rustfmt::skip] const DISS_BLOCK: &'static [&'static [Float]] = &[ @@ -197,10 +216,10 @@ impl Upwind4 { impl SbpOperator1d for Upwind4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, + super::diff_op_1d_matrix( + &Self::BLOCK_MATRIX, + &Self::BLOCKEND_MATRIX, + &Self::DIAG_MATRIX, super::OperatorType::Normal, prev, fut, @@ -229,18 +248,29 @@ impl SbpOperator1d for Upwind4 { } } +fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { + for (p, mut f) in prev + .axis_iter(ndarray::Axis(0)) + .zip(fut.axis_iter_mut(ndarray::Axis(0))) + { + super::diff_op_1d_slice_matrix( + &Upwind4::BLOCK_MATRIX, + &Upwind4::BLOCKEND_MATRIX, + &Upwind4::DIAG_MATRIX, + super::OperatorType::Normal, + p.as_slice().unwrap(), + f.as_slice_mut().unwrap(), + ) + } +} + impl SbpOperator2d for Upwind4 { fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row( - Upwind4::BLOCK, - Upwind4::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - )(prev, fut), + ([_, 1], [_, 1]) => diff_op_row_local(prev, fut), ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { diff_simd_col(prev, fut) } @@ -465,3 +495,11 @@ fn upwind4_test2() { 1e-1, ); } + +#[test] +fn block_equality() { + let mut flipped_inverted = Upwind4::BLOCK_MATRIX.flip(); + flipped_inverted *= -1.0; + + approx::assert_ulps_eq!(Upwind4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); +} From c73c6e7407ea045bb19d246e42b61085aa83088e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Sun, 31 Jan 2021 15:40:28 +0100 Subject: [PATCH 20/35] diff_op_col_naive_matrix --- sbp/src/operators/algos.rs | 76 +++++++++++++++++++++++++++++++ sbp/src/operators/traditional4.rs | 2 +- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a504b4c..b32425c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -572,6 +572,82 @@ pub(crate) fn diff_op_col_naive( } } +#[inline(always)] +#[allow(unused)] +pub(crate) fn diff_op_col_naive_matrix( + block: &Matrix, + blockend: &Matrix, + diag: &RowVector, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + let ny = prev.shape()[0]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + fut.fill(0.0); + + let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut(( + ndarray::s![.., ..M], + ndarray::s![.., M..nx - M], + ndarray::s![.., nx - M..], + )); + + // First block + for (bl, mut fut) in block.iter_rows().zip(fut0.axis_iter_mut(ndarray::Axis(1))) { + debug_assert_eq!(fut.len(), prev.shape()[0]); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl == 0.0 { + continue; + } + debug_assert_eq!(prev.len(), fut.len()); + fut.scaled_add(idx * bl, &prev); + } + } + + let window_elems_to_skip = M - ((D - 1) / 2); + + // Diagonal entries + for (mut fut, id) in futmid + .axis_iter_mut(ndarray::Axis(1)) + .zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip)) + { + for (&d, id) in 
diag.iter().zip(id.axis_iter(ndarray::Axis(1))) { + if d == 0.0 { + continue; + } + fut.scaled_add(idx * d, &id) + } + } + + // End block + let prev = prev.slice(ndarray::s!(.., nx - N..)); + for (bl, mut fut) in blockend + .iter_rows() + .zip(futn.axis_iter_mut(ndarray::Axis(1))) + { + fut.fill(0.0); + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl == 0.0 { + continue; + } + fut.scaled_add(idx * bl, &prev); + } + } +} + #[inline(always)] pub(crate) fn diff_op_col( block: &'static [&'static [Float]], diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index db5802f..f480cda 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -106,7 +106,7 @@ fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayVi } fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { let optype = super::OperatorType::Normal; - super::diff_op_col_matrix( + super::diff_op_col_naive_matrix( &SBP4::BLOCK_MATRIX, &SBP4::BLOCKEND_MATRIX, &SBP4::DIAG_MATRIX, From 1f3aa2c11660698bcd251c5c299fa70d23332fec Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:01:20 +0100 Subject: [PATCH 21/35] make core-intrinsics cfg'ed --- sbp/src/lib.rs | 2 +- sbp/src/operators/algos.rs | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 1d09854..62cb872 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(core_intrinsics)] +#![cfg_attr(feature = "fast-float", feature(core_intrinsics))] #![feature(array_windows)] #![feature(array_chunks)] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index b32425c..11b9923 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -353,6 +353,12 @@ mod fastfloat { Self(f) } } + impl From for Float { + #[inline(always)] + fn from(f: FastFloat) -> Self { + f.0 + } + } } #[cfg(feature = "fast-float")] use fastfloat::FastFloat; @@ -838,14 +844,17 @@ pub(crate) fn diff_op_col_simd( } #[inline(always)] -fn product_fast<'a>( - u: impl Iterator, - v: impl Iterator, -) -> Float { - use std::intrinsics::{fadd_fast, fmul_fast}; - u.zip(v).fold(0.0, |acc, (&u, &v)| unsafe { - // We do not care about the order of multiplication nor addition - fadd_fast(acc, fmul_fast(u, v)) +fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { + u.zip(v).fold(0.0, |acc, (&u, &v)| { + #[cfg(feature = "fast-float")] + unsafe { + // We do not care about the order of multiplication nor addition + (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() + } + #[cfg(not(feature = "fast-float"))] + { + acc + u * v + } }) } @@ -1043,7 +1052,7 @@ pub(crate) fn diff_op_row( assert!(prev.len() >= 2 * block.len()); for (bl, f) in block.iter().zip(fut.iter_mut()) { - let diff = product_fast(bl.iter(), prev[..bl.len()].iter()); + let diff = dotproduct(bl.iter(), prev[..bl.len()].iter()); *f = diff * idx; } @@ -1057,12 +1066,12 @@ pub(crate) fn diff_op_row( .zip(fut.iter_mut().skip(block.len())) .take(nx - 2 * block.len()) { - let diff = product_fast(diag.iter(), window.iter()); + let diff = dotproduct(diag.iter(), window.iter()); *f = diff * idx; } for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = product_fast(bl.iter(), prev.iter().rev()); + let diff = dotproduct(bl.iter(), prev.iter().rev()); *f = idx * if symmetry == Symmetry::Symmetric { From f7f8a7ffff09250142c554104f0d788d78026496 Mon Sep 17 00:00:00 
2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:37:59 +0100 Subject: [PATCH 22/35] make flip const functions --- sbp/src/lib.rs | 1 + sbp/src/operators/algos.rs | 85 +++++++++++++++++++++++++------ sbp/src/operators/traditional4.rs | 17 +------ sbp/src/operators/traditional8.rs | 21 +------- sbp/src/operators/upwind4.rs | 17 +------ 5 files changed, 77 insertions(+), 64 deletions(-) diff --git a/sbp/src/lib.rs b/sbp/src/lib.rs index 62cb872..52a1afa 100644 --- a/sbp/src/lib.rs +++ b/sbp/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(feature = "fast-float", feature(core_intrinsics))] #![feature(array_windows)] #![feature(array_chunks)] +#![feature(const_fn_floating_point_arithmetic)] /// Type used for floats, configure with the `f32` feature #[cfg(feature = "f32")] diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 11b9923..7fe8dd6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -6,7 +6,7 @@ pub(crate) mod constmatrix { #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(C)] pub struct Matrix { - data: [[T; N]; M], + pub data: [[T; N]; M], } pub type RowVector = Matrix; pub type ColVector = Matrix; @@ -75,19 +75,6 @@ pub(crate) mod constmatrix { ) -> impl ExactSizeIterator + DoubleEndedIterator { self.data.iter() } - - pub fn flip(&self) -> Self - where - T: Default + Copy, - { - let mut v = Self::default(); - for i in 0..M { - for j in 0..N { - v[(i, j)] = self[(M - 1 - i, N - 1 - j)] - } - } - v - } } impl ColVector { @@ -244,9 +231,77 @@ pub(crate) mod constmatrix { } } } + + pub(crate) const fn flip_ud( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M / 2 { + let tmp = m.data[i]; + m.data[i] = m.data[M - 1 - i]; + m.data[M - 1 - i] = tmp; + i += 1; + } + m + } + + pub(crate) const fn flip_lr( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N / 2 { + let tmp = m.data[i][j]; + m.data[i][j] = m.data[i][N - 1 - j]; + m.data[i][N - 1 - j] = tmp; + j += 1; + } + i += 1; + } + m + } + + /// Flip all sign bits + pub(crate) const fn flip_sign( + mut m: Matrix, + ) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N { + m.data[i][j] = -m.data[i][j]; + j += 1; + } + i += 1; + } + m + } + mod flipping { + use super::*; + + #[test] + fn flip_lr_test() { + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); + } + #[test] + fn flip_ud_test() { + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); + } + } } -pub(crate) use constmatrix::{ColVector, Matrix, RowVector}; +pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; #[inline(always)] pub(crate) fn diff_op_1d_matrix( diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index f480cda..417b799 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -32,13 +32,8 @@ impl SBP4 { [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: super::Matrix = 
super::Matrix::new([ - [4.0/49.0, -32.0/49.0, 0.0, 59.0/98.0, 0.0, -3.0/98.0 ], - [0.0, 4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], - [0.0, 0.0, 0.0, -1.0/2.0, 0.0, 1.0/2.0], - [0.0, 0.0, 3.0/34.0, 4.0/17.0, -59.0/34.0, 24.0/17.0], - ]); + const BLOCKEND_MATRIX: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); #[rustfmt::skip] const D2DIAG: &'static [Float] = &[ @@ -234,11 +229,3 @@ fn test_trad4() { 1e-1, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = SBP4::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(SBP4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 48891f8..32b392c 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -40,17 +40,8 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: Matrix = Matrix::new([ - [0.0035384016230155199, -0.037742950645498902, 0.19815049088886899, -0.79260196355547696, -0.0, 0.75166824372712304, -0.11941174319355199, 0.012224423573111201, -0.035091936128702303, 0.0142810403117803, 0.010833666120933699, -0.0058476727216045096], - [-0.0, 0.0038681313433548601, -0.041260067662451799, 0.21661535522787201, -0.821713248844682, -0.0, 0.75587358166357999, -0.046018121340651898, -0.076822925360096897, -0.050113507756358401, 0.087129564256063705, -0.027558761526617698], - [-0.0, -0.0, 0.0027934857464329702, -0.029797181295285, 0.094272792663829805, -0.54587651996612696, -0.0, 0.205756876210586, 0.39561116774840099, 0.020799982439143602, -0.22287532317150199, 0.079314719624520302], - [-0.0, -0.0, -0.0, 0.0086536439119011006, -0.029896495621603899, 0.102950081118829, -0.63739245543855805, -0.0, 0.54336288636530095, 0.085109057027750595, -0.090998664615454999, 0.01821194725190089], - [-0.0, -0.0, -0.0, -0.0, 0.019698131073842998, 0.039447042394264102, -0.28128521251909999, -0.124714160903055, -0.0, 0.055770702144577897, 0.41917213003600801, -0.128088632226564], - [-0.0, -0.0, -0.0, -0.0, -0.055988255885229599, 0.179720579323281, -0.10329058280084499, -0.13643348652854601, -0.389516189693211, -0.0, 0.54432974445498405, -0.038821808870425301], - [-0.0, -0.0, -0.0, -0.0, -0.0071669648308011497, -0.052726783847581303, 0.18675940343293501, 0.024615176293723499, -0.49400862680798402, -0.091851192507295606, -0.0, 0.43437898826698501], - [-0.0, -0.0, -0.0, -0.0, 0.020015058331576099, 0.086285816263333495, -0.34386522738887298, -0.0254881486107905, 0.78102816812674902, 0.033893192260150001, -2.2474124634140402, 1.6955436044319001], - ]); + const BLOCKEND_MATRIX: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); } impl SbpOperator1d for SBP8 { @@ -215,11 +206,3 @@ fn test_trad8() { 1e-4, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = SBP8::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(SBP8::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} 
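The `block_equality` tests removed above checked at runtime that the end block equals the start block flipped left-right, flipped up-down, and negated; with the const `flip_lr`/`flip_ud`/`flip_sign` functions that relation is now encoded directly in the `BLOCKEND_MATRIX` definitions. Below is a minimal standalone sketch of the same relation, not part of the patch: it uses plain nested `f64` arrays instead of the crate's `Matrix` type, and the function names merely mirror the ones introduced here.

// Standalone illustration (hypothetical helpers, not the crate's API):
// reversing columns, then rows, then signs turns the start block of an
// SBP operator into its end block, which is what
// `flip_sign(flip_ud(flip_lr(BLOCK_MATRIX)))` computes at compile time.
fn flip_lr<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    for row in m.iter_mut() {
        row.reverse(); // mirror each row left-right
    }
    m
}

fn flip_ud<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    m.reverse(); // mirror the rows top-bottom
    m
}

fn flip_sign<const M: usize, const N: usize>(mut m: [[f64; N]; M]) -> [[f64; N]; M] {
    for row in m.iter_mut() {
        for x in row.iter_mut() {
            *x = -*x; // negate every entry
        }
    }
    m
}

fn main() {
    let start = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
    let end = flip_sign(flip_ud(flip_lr(start)));
    assert_eq!(end, [[-6.0, -5.0, -4.0], [-3.0, -2.0, -1.0]]);
}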
diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index 66b9852..d2a4eb4 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -192,13 +192,8 @@ impl Upwind4 { [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]); - #[rustfmt::skip] - const BLOCKEND_MATRIX: Matrix = Matrix::new([ - [ -6.0 / 149.0, 36.0 / 149.0, -126.0 / 149.0, 0.0, 227.0 / 298.0, -16.0 / 149.0, -3.0 / 298.0], - [ 0.0, -2.0/41.0, 12.0/41.0, -227.0/246.0, 0.0, 69.0/82.0, -20.0/123.0], - [ 0.0, 0.0, -2.0/61.0, 16.0/183.0, -69.0/122.0, 0.0, 187.0/366.0], - [ 0.0, 0.0, 0.0, 3.0 / 98.0, 20.0 / 49.0, -187.0 / 98.0, 72.0/49.0], - ]); + const BLOCKEND_MATRIX: Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); #[rustfmt::skip] const DISS_BLOCK: &'static [&'static [Float]] = &[ @@ -495,11 +490,3 @@ fn upwind4_test2() { 1e-1, ); } - -#[test] -fn block_equality() { - let mut flipped_inverted = Upwind4::BLOCK_MATRIX.flip(); - flipped_inverted *= -1.0; - - approx::assert_ulps_eq!(Upwind4::BLOCKEND_MATRIX, flipped_inverted, max_ulps = 1); -} From 31ac46e3868f35138afb0d4caeff5b9b843c063a Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 18:59:59 +0100 Subject: [PATCH 23/35] move data structs into separate files --- sbp/src/operators/algos.rs | 367 +------------------------ sbp/src/operators/algos/constmatrix.rs | 298 ++++++++++++++++++++ sbp/src/operators/algos/fastfloat.rs | 53 ++++ 3 files changed, 358 insertions(+), 360 deletions(-) create mode 100644 sbp/src/operators/algos/constmatrix.rs create mode 100644 sbp/src/operators/algos/fastfloat.rs diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 7fe8dd6..53d2c63 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,308 +1,13 @@ use super::*; -pub(crate) mod constmatrix { - #![allow(unused)] - /// A row-major matrix - #[derive(Debug, Clone, PartialEq, Eq, Hash)] - #[repr(C)] - pub struct Matrix { - pub data: [[T; N]; M], - } - pub type RowVector = Matrix; - pub type ColVector = Matrix; - - impl Default for Matrix { - fn default() -> Self { - Self { - data: [[T::default(); N]; M], - } - } - } - - impl core::ops::Index<(usize, usize)> for Matrix { - type Output = T; - #[inline(always)] - fn index(&self, (i, j): (usize, usize)) -> &Self::Output { - &self.data[i][j] - } - } - - impl core::ops::IndexMut<(usize, usize)> for Matrix { - #[inline(always)] - fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { - &mut self.data[i][j] - } - } - - impl core::ops::Index for Matrix { - type Output = [T; N]; - #[inline(always)] - fn index(&self, i: usize) -> &Self::Output { - &self.data[i] - } - } - - impl core::ops::IndexMut for Matrix { - #[inline(always)] - fn index_mut(&mut self, i: usize) -> &mut Self::Output { - &mut self.data[i] - } - } - - impl Matrix { - pub const fn new(data: [[T; N]; M]) -> Self { - Self { data } - } - #[inline(always)] - pub const fn nrows(&self) -> usize { - M - } - #[inline(always)] - pub const fn ncols(&self) -> usize { - N - } - #[inline(always)] - pub fn iter(&self) -> impl Iterator { - self.data.iter().flatten() - } - #[inline(always)] - pub fn iter_mut(&mut self) -> impl Iterator { - self.data.iter_mut().flatten() - } - #[inline(always)] - pub fn iter_rows( - &self, - ) -> impl ExactSizeIterator + DoubleEndedIterator { - self.data.iter() - } - } - - impl ColVector { - 
#[inline(always)] - pub fn map_to_col(slice: &[T; N]) -> &ColVector { - unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } - } - #[inline(always)] - pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { - unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } - } - } - - impl RowVector { - pub fn map_to_row(slice: &[T; N]) -> &Self { - unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } - } - pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { - unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } - } - } - - impl Matrix { - #[inline(always)] - pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) - where - T: Default + Copy + core::ops::Mul + core::ops::Add, - { - for i in 0..M { - for j in 0..P { - let mut t = T::default(); - for k in 0..N { - t = t + lhs[(i, k)] * rhs[(k, j)]; - } - self[(i, j)] = t; - } - } - } - } - - macro_rules! impl_op_mul_mul { - ($lhs:ty, $rhs:ty) => { - impl core::ops::Mul<$rhs> for $lhs - where - T: Copy + Default + core::ops::Add + core::ops::Mul, - { - type Output = Matrix; - fn mul(self, rhs: $rhs) -> Self::Output { - let mut out = Matrix::default(); - out.matmul_into(&self, &rhs); - out - } - } - }; - } - - impl_op_mul_mul! { Matrix, Matrix } - impl_op_mul_mul! { &Matrix, Matrix } - impl_op_mul_mul! { Matrix, &Matrix } - impl_op_mul_mul! { &Matrix, &Matrix } - - impl core::ops::MulAssign for Matrix - where - T: Copy + core::ops::MulAssign, - { - #[inline(always)] - fn mul_assign(&mut self, other: T) { - self.iter_mut().for_each(|x| *x *= other) - } - } - - #[cfg(test)] - mod tests { - use super::{super::*, *}; - #[test] - fn construct_copy_type() { - let _m0 = Matrix::::default(); - let _m1: Matrix = Matrix::default(); - - let _m2 = Matrix::new([[1, 2], [3, 4]]); - } - - #[test] - fn matmul() { - let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); - let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); - - let m3 = m1 * m2; - assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); - } - #[test] - fn iter() { - let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); - let mut iter = m.iter(); - assert_eq!(iter.next(), Some(&1)); - assert_eq!(iter.next(), Some(&2)); - assert_eq!(iter.next(), Some(&3)); - assert_eq!(iter.next(), Some(&4)); - assert_eq!(iter.next(), Some(&5)); - assert_eq!(iter.next(), Some(&6)); - assert_eq!(iter.next(), None); - } - } - - mod approx { - use super::Matrix; - use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; - - impl AbsDiffEq for Matrix - where - T: AbsDiffEq, - { - type Epsilon = T::Epsilon; - fn default_epsilon() -> Self::Epsilon { - T::default_epsilon() - } - fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) - } - } - impl RelativeEq for Matrix - where - T: RelativeEq, - Self::Epsilon: Copy, - { - fn default_max_relative() -> Self::Epsilon { - T::default_max_relative() - } - fn relative_eq( - &self, - other: &Self, - epsilon: Self::Epsilon, - max_relative: Self::Epsilon, - ) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) - } - } - impl UlpsEq for Matrix - where - T: UlpsEq, - Self::Epsilon: Copy, - { - fn default_max_ulps() -> u32 { - T::default_max_ulps() - } - fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { - self.iter() - .zip(other.iter()) - .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) - } - } - } - - pub(crate) const fn flip_ud( - mut m: Matrix, - ) -> 
Matrix { - let mut i = 0; - while i < M / 2 { - let tmp = m.data[i]; - m.data[i] = m.data[M - 1 - i]; - m.data[M - 1 - i] = tmp; - i += 1; - } - m - } - - pub(crate) const fn flip_lr( - mut m: Matrix, - ) -> Matrix { - let mut i = 0; - while i < M { - let mut j = 0; - while j < N / 2 { - let tmp = m.data[i][j]; - m.data[i][j] = m.data[i][N - 1 - j]; - m.data[i][N - 1 - j] = tmp; - j += 1; - } - i += 1; - } - m - } - - /// Flip all sign bits - pub(crate) const fn flip_sign( - mut m: Matrix, - ) -> Matrix { - let mut i = 0; - while i < M { - let mut j = 0; - while j < N { - m.data[i][j] = -m.data[i][j]; - j += 1; - } - i += 1; - } - m - } - mod flipping { - use super::*; - - #[test] - fn flip_lr_test() { - let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); - let flipped = flip_lr(m); - assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); - let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); - let flipped = flip_lr(m); - assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); - } - #[test] - fn flip_ud_test() { - let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); - let flipped = flip_ud(m); - assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); - let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); - let flipped = flip_ud(m); - assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); - } - } -} - +pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; +#[cfg(feature = "fast-float")] +mod fastfloat; +#[cfg(feature = "fast-float")] +use fastfloat::FastFloat; + #[inline(always)] pub(crate) fn diff_op_1d_matrix( block: &Matrix, @@ -360,64 +65,6 @@ pub(crate) fn diff_op_1d_matrix( } } -#[cfg(feature = "fast-float")] -mod fastfloat { - use super::*; - #[repr(transparent)] - #[derive(Copy, Clone, Debug, PartialEq, Default)] - pub(crate) struct FastFloat(Float); - - use core::intrinsics::{fadd_fast, fmul_fast}; - - impl core::ops::Mul for FastFloat { - type Output = Self; - #[inline(always)] - fn mul(self, o: Self) -> Self::Output { - unsafe { Self(fmul_fast(self.0, o.0)) } - } - } - - impl core::ops::Add for FastFloat { - type Output = Self; - #[inline(always)] - fn add(self, o: FastFloat) -> Self::Output { - unsafe { Self(fadd_fast(self.0, o.0)) } - } - } - - impl core::ops::MulAssign for FastFloat { - #[inline(always)] - fn mul_assign(&mut self, o: FastFloat) { - unsafe { - self.0 = fmul_fast(self.0, o.0); - } - } - } - - impl core::ops::Mul for &FastFloat { - type Output = FastFloat; - #[inline(always)] - fn mul(self, o: Self) -> Self::Output { - unsafe { FastFloat(fmul_fast(self.0, o.0)) } - } - } - - impl From for FastFloat { - #[inline(always)] - fn from(f: Float) -> Self { - Self(f) - } - } - impl From for Float { - #[inline(always)] - fn from(f: FastFloat) -> Self { - f.0 - } - } -} -#[cfg(feature = "fast-float")] -use fastfloat::FastFloat; - #[inline(always)] pub(crate) fn diff_op_1d_slice_matrix( block: &Matrix, @@ -902,7 +549,7 @@ pub(crate) fn diff_op_col_simd( fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { u.zip(v).fold(0.0, |acc, (&u, &v)| { #[cfg(feature = "fast-float")] - unsafe { + { // We do not care about the order of multiplication nor addition (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() } diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs new file mode 100644 index 0000000..6d66fea --- /dev/null +++ b/sbp/src/operators/algos/constmatrix.rs @@ -0,0 +1,298 @@ +#![allow(unused)] +/// A row-major 
matrix +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[repr(C)] +pub struct Matrix { + pub data: [[T; N]; M], +} +pub type RowVector = Matrix; +pub type ColVector = Matrix; + +impl Default for Matrix { + fn default() -> Self { + Self { + data: [[T::default(); N]; M], + } + } +} + +impl core::ops::Index<(usize, usize)> for Matrix { + type Output = T; + #[inline(always)] + fn index(&self, (i, j): (usize, usize)) -> &Self::Output { + &self.data[i][j] + } +} + +impl core::ops::IndexMut<(usize, usize)> for Matrix { + #[inline(always)] + fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output { + &mut self.data[i][j] + } +} + +impl core::ops::Index for Matrix { + type Output = [T; N]; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.data[i] + } +} + +impl core::ops::IndexMut for Matrix { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.data[i] + } +} + +impl Matrix { + pub const fn new(data: [[T; N]; M]) -> Self { + Self { data } + } + #[inline(always)] + pub const fn nrows(&self) -> usize { + M + } + #[inline(always)] + pub const fn ncols(&self) -> usize { + N + } + #[inline(always)] + pub fn iter(&self) -> impl Iterator { + self.data.iter().flatten() + } + #[inline(always)] + pub fn iter_mut(&mut self) -> impl Iterator { + self.data.iter_mut().flatten() + } + #[inline(always)] + pub fn iter_rows( + &self, + ) -> impl ExactSizeIterator + DoubleEndedIterator { + self.data.iter() + } +} + +impl ColVector { + #[inline(always)] + pub fn map_to_col(slice: &[T; N]) -> &ColVector { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + #[inline(always)] + pub fn map_to_col_mut(slice: &mut [T; N]) -> &mut ColVector { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } +} + +impl RowVector { + pub fn map_to_row(slice: &[T; N]) -> &Self { + unsafe { std::mem::transmute::<&[T; N], &Self>(slice) } + } + pub fn map_to_row_mut(slice: &mut [T; N]) -> &mut Self { + unsafe { std::mem::transmute::<&mut [T; N], &mut Self>(slice) } + } +} + +impl Matrix { + #[inline(always)] + pub fn matmul_into(&mut self, lhs: &Matrix, rhs: &Matrix) + where + T: Default + Copy + core::ops::Mul + core::ops::Add, + { + for i in 0..M { + for j in 0..P { + let mut t = T::default(); + for k in 0..N { + t = t + lhs[(i, k)] * rhs[(k, j)]; + } + self[(i, j)] = t; + } + } + } +} + +macro_rules! impl_op_mul_mul { + ($lhs:ty, $rhs:ty) => { + impl core::ops::Mul<$rhs> for $lhs + where + T: Copy + Default + core::ops::Add + core::ops::Mul, + { + type Output = Matrix; + fn mul(self, rhs: $rhs) -> Self::Output { + let mut out = Matrix::default(); + out.matmul_into(&self, &rhs); + out + } + } + }; +} + +impl_op_mul_mul! { Matrix, Matrix } +impl_op_mul_mul! { &Matrix, Matrix } +impl_op_mul_mul! { Matrix, &Matrix } +impl_op_mul_mul! 
{ &Matrix, &Matrix } + +impl core::ops::MulAssign for Matrix +where + T: Copy + core::ops::MulAssign, +{ + #[inline(always)] + fn mul_assign(&mut self, other: T) { + self.iter_mut().for_each(|x| *x *= other) + } +} + +#[cfg(test)] +mod tests { + use super::{super::*, *}; + #[test] + fn construct_copy_type() { + let _m0 = Matrix::::default(); + let _m1: Matrix = Matrix::default(); + + let _m2 = Matrix::new([[1, 2], [3, 4]]); + } + + #[test] + fn matmul() { + let m1 = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let m2 = Matrix::new([[7_u8, 8, 9, 10], [11, 12, 13, 14], [15, 16, 17, 18]]); + + let m3 = m1 * m2; + assert_eq!(m3, Matrix::new([[74, 80, 86, 92], [173, 188, 203, 218]])); + } + #[test] + fn iter() { + let m = Matrix::new([[1_u8, 2, 3], [4, 5, 6]]); + let mut iter = m.iter(); + assert_eq!(iter.next(), Some(&1)); + assert_eq!(iter.next(), Some(&2)); + assert_eq!(iter.next(), Some(&3)); + assert_eq!(iter.next(), Some(&4)); + assert_eq!(iter.next(), Some(&5)); + assert_eq!(iter.next(), Some(&6)); + assert_eq!(iter.next(), None); + } +} + +mod approx { + use super::Matrix; + use ::approx::{AbsDiffEq, RelativeEq, UlpsEq}; + + impl AbsDiffEq for Matrix + where + T: AbsDiffEq, + { + type Epsilon = T::Epsilon; + fn default_epsilon() -> Self::Epsilon { + T::default_epsilon() + } + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.abs_diff_eq(l, T::default_epsilon())) + } + } + impl RelativeEq for Matrix + where + T: RelativeEq, + Self::Epsilon: Copy, + { + fn default_max_relative() -> Self::Epsilon { + T::default_max_relative() + } + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.relative_eq(l, epsilon, max_relative)) + } + } + impl UlpsEq for Matrix + where + T: UlpsEq, + Self::Epsilon: Copy, + { + fn default_max_ulps() -> u32 { + T::default_max_ulps() + } + fn ulps_eq(&self, other: &Self, epsilon: Self::Epsilon, max_ulps: u32) -> bool { + self.iter() + .zip(other.iter()) + .all(|(r, l)| r.ulps_eq(l, epsilon, max_ulps)) + } + } +} + +pub(crate) const fn flip_ud( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M / 2 { + let tmp = m.data[i]; + m.data[i] = m.data[M - 1 - i]; + m.data[M - 1 - i] = tmp; + i += 1; + } + m +} + +pub(crate) const fn flip_lr( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N / 2 { + let tmp = m.data[i][j]; + m.data[i][j] = m.data[i][N - 1 - j]; + m.data[i][N - 1 - j] = tmp; + j += 1; + } + i += 1; + } + m +} + +/// Flip all sign bits +pub(crate) const fn flip_sign( + mut m: Matrix, +) -> Matrix { + let mut i = 0; + while i < M { + let mut j = 0; + while j < N { + m.data[i][j] = -m.data[i][j]; + j += 1; + } + i += 1; + } + m +} +mod flipping { + use super::*; + + #[test] + fn flip_lr_test() { + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[4.0, 3.0, 2.0, 1.0]])); + let m = Matrix::new([[1.0, 2.0, 3.0, 4.0, 5.0]]); + let flipped = flip_lr(m); + assert_eq!(flipped, Matrix::new([[5.0, 4.0, 3.0, 2.0, 1.0]])); + } + #[test] + fn flip_ud_test() { + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[4.0], [3.0], [2.0], [1.0]])); + let m = Matrix::new([[1.0], [2.0], [3.0], [4.0], [5.0]]); + let flipped = flip_ud(m); + assert_eq!(flipped, Matrix::new([[5.0], [4.0], [3.0], [2.0], [1.0]])); + } +} diff --git 
a/sbp/src/operators/algos/fastfloat.rs b/sbp/src/operators/algos/fastfloat.rs new file mode 100644 index 0000000..40d6633 --- /dev/null +++ b/sbp/src/operators/algos/fastfloat.rs @@ -0,0 +1,53 @@ +use super::*; + +#[repr(transparent)] +#[derive(Copy, Clone, Debug, PartialEq, Default)] +pub(crate) struct FastFloat(Float); + +use core::intrinsics::{fadd_fast, fmul_fast}; + +impl core::ops::Mul for FastFloat { + type Output = Self; + #[inline(always)] + fn mul(self, o: Self) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o.0)) } + } +} + +impl core::ops::Add for FastFloat { + type Output = Self; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o.0)) } + } +} + +impl core::ops::MulAssign for FastFloat { + #[inline(always)] + fn mul_assign(&mut self, o: FastFloat) { + unsafe { + self.0 = fmul_fast(self.0, o.0); + } + } +} + +impl core::ops::Mul for &FastFloat { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: Self) -> Self::Output { + unsafe { FastFloat(fmul_fast(self.0, o.0)) } + } +} + +impl From for FastFloat { + #[inline(always)] + fn from(f: Float) -> Self { + Self(f) + } +} +impl From for Float { + #[inline(always)] + fn from(f: FastFloat) -> Self { + f.0 + } +} From 6f7268bf339abebb83d3bb2e1419405fefff7f10 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:08:55 +0100 Subject: [PATCH 24/35] use matrices everywhere --- sbp/src/operators.rs | 8 +- sbp/src/operators/algos.rs | 628 +++++++------------------ sbp/src/operators/algos/constmatrix.rs | 4 + sbp/src/operators/traditional4.rs | 146 ++---- sbp/src/operators/traditional8.rs | 116 +---- sbp/src/operators/upwind4.rs | 326 ++----------- sbp/src/operators/upwind4h2.rs | 149 ++---- sbp/src/operators/upwind9.rs | 154 ++---- sbp/src/operators/upwind9h2.rs | 163 ++----- 9 files changed, 405 insertions(+), 1289 deletions(-) diff --git a/sbp/src/operators.rs b/sbp/src/operators.rs index 8f8598f..63580f9 100644 --- a/sbp/src/operators.rs +++ b/sbp/src/operators.rs @@ -78,7 +78,9 @@ pub trait SbpOperator2d: Send + Sync { } fn op_xi(&self) -> &dyn SbpOperator1d; - fn op_eta(&self) -> &dyn SbpOperator1d; + fn op_eta(&self) -> &dyn SbpOperator1d { + self.op_xi() + } fn upwind(&self) -> Option> { None @@ -98,7 +100,9 @@ pub trait UpwindOperator2d: Send + Sync { } fn op_xi(&self) -> &dyn UpwindOperator1d; - fn op_eta(&self) -> &dyn UpwindOperator1d; + fn op_eta(&self) -> &dyn UpwindOperator1d { + self.op_xi() + } } pub trait InterpolationOperator: Send + Sync { diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 53d2c63..5a608a6 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,4 +1,5 @@ use super::*; +use ndarray::s; pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; @@ -8,11 +9,55 @@ mod fastfloat; #[cfg(feature = "fast-float")] use fastfloat::FastFloat; +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct DiagonalMatrix { + pub start: [Float; B], + pub diag: Float, + pub end: [Float; B], +} + +impl DiagonalMatrix { + pub const fn new(block: [Float; B]) -> Self { + let start = block; + let diag = 1.0; + let mut end = block; + let mut i = 0; + while i < B { + end[i] = block[B - 1 - i]; + i += 1; + } + Self { start, diag, end } + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct BlockMatrix { + pub start: Matrix, + pub diag: RowVector, + pub end: Matrix, +} + +impl BlockMatrix { + pub const fn new( + start: Matrix, + diag: 
RowVector, + end: Matrix, + ) -> Self { + Self { start, diag, end } + } +} + +#[derive(PartialEq, Copy, Clone)] +pub(crate) enum OperatorType { + Normal, + H2, + // TODO: D2 +} + #[inline(always)] -pub(crate) fn diff_op_1d_matrix( - block: &Matrix, - blockend: &Matrix, - diag: &RowVector, +/// Works on all 1d vectors +pub(crate) fn diff_op_1d_fallback( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -29,12 +74,11 @@ pub(crate) fn diff_op_1d_matrix( }; let idx = 1.0 / dx; - for (bl, f) in block.iter_rows().zip(&mut fut) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); + let (futstart, futmid, futend) = + fut.multi_slice_mut((s![..M], s![M..nx - 2 * M], s![nx - 2 * M..])); + + for (bl, f) in matrix.start.iter_rows().zip(futstart) { + let diff = dotproduct(bl.iter(), prev.iter()); *f = diff * idx; } @@ -46,42 +90,33 @@ pub(crate) fn diff_op_1d_matrix( .windows(D) .into_iter() .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(M)) - .take(nx - 2 * M) + .zip(futmid) { - let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); + let diff = dotproduct(matrix.diag.row(0), window); *f = diff * idx; } let prev = prev.slice(ndarray::s![nx - N..]); - for (bl, f) in blockend.iter_rows().zip(fut.iter_mut().rev().take(M).rev()) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); - + for (bl, f) in matrix.end.iter_rows().zip(futend) { + let diff = dotproduct(bl, prev); *f = diff * idx; } } #[inline(always)] -pub(crate) fn diff_op_1d_slice_matrix( - block: &Matrix, - endblock: &Matrix, - diag: &RowVector, +/// diff op in 1d for slices +pub(crate) fn diff_op_1d_slice( + matrix: &BlockMatrix, optype: OperatorType, prev: &[Float], fut: &mut [Float], ) { #[cfg(feature = "fast-float")] - let (block, endblock, diag, prev, fut) = { + let (matrix, prev, fut) = { use std::mem::transmute; unsafe { ( - transmute::<_, &Matrix>(block), - transmute::<_, &Matrix>(endblock), - transmute::<_, &RowVector>(diag), + transmute::<_, &BlockMatrix>(matrix), transmute::<_, &[FastFloat]>(prev), transmute::<_, &mut [FastFloat]>(fut), ) @@ -113,7 +148,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev.array_windows::().next().unwrap()); let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap()); - fut.matmul_into(block, prev); + fut.matmul_into(&matrix.start, prev); *fut *= idx; } @@ -129,7 +164,7 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col_mut(f); let prev = ColVector::<_, D>::map_to_col(window); - fut.matmul_into(diag, prev); + fut.matmul_into(&matrix.diag, prev); *fut *= idx; } @@ -138,154 +173,35 @@ pub(crate) fn diff_op_1d_slice_matrix::map_to_col(prev); let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap()); - fut.matmul_into(endblock, prev); + fut.matmul_into(&matrix.end, prev); *fut *= idx; } } #[inline(always)] -pub(crate) fn diff_op_1d( - block: &[&[Float]], - diag: &[Float], - symmetry: Symmetry, +/// Will always work on 1d, delegated based on slicedness +pub(crate) fn diff_op_1d( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, ) { assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[0]; - assert!(nx >= 2 * block.len()); + assert!(nx >= 2 * M); - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float + if let Some((prev, fut)) = prev.as_slice().zip(fut.as_slice_mut()) { + diff_op_1d_slice(matrix, optype, prev, fut) } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - for (bl, f) in 
block.iter().zip(&mut fut) { - let diff = bl - .iter() - .zip(prev.iter()) - .map(|(x, y)| x * y) - .sum::(); - *f = diff * idx; - } - - // The window needs to be aligned to the diagonal elements, - // based on the block size - let window_elems_to_skip = block.len() - ((diag.len() - 1) / 2); - - for (window, f) in prev - .windows(diag.len()) - .into_iter() - .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(block.len())) - .take(nx - 2 * block.len()) - { - let diff = diag.iter().zip(&window).map(|(x, y)| x * y).sum::(); - *f = diff * idx; - } - - for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = bl - .iter() - .zip(prev.iter().rev()) - .map(|(x, y)| x * y) - .sum::(); - - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; - } -} - -#[derive(PartialEq, Copy, Clone)] -pub(crate) enum Symmetry { - Symmetric, - AntiSymmetric, -} - -#[derive(PartialEq, Copy, Clone)] -pub(crate) enum OperatorType { - Normal, - H2, -} - -#[inline(always)] -#[allow(unused)] -pub(crate) fn diff_op_col_naive( - block: &'static [&'static [Float]], - diag: &'static [Float], - symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - fut.fill(0.0); - - // First block - for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1))) { - debug_assert_eq!(fut.len(), prev.shape()[0]); - for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { - debug_assert_eq!(prev.len(), fut.len()); - fut.scaled_add(idx * bl, &prev); - } - } - - let half_diag_width = (diag.len() - 1) / 2; - assert!(half_diag_width <= block.len()); - - // Diagonal entries - for (ifut, mut fut) in fut - .axis_iter_mut(ndarray::Axis(1)) - .enumerate() - .skip(block.len()) - .take(nx - 2 * block.len()) - { - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - fut.scaled_add(idx * d, &prev.slice(ndarray::s![.., offset])) - } - } - - // End block - for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1)).rev()) { - fut.fill(0.0); - for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1)).rev()) { - if symmetry == Symmetry::Symmetric { - fut.scaled_add(idx * bl, &prev); - } else { - fut.scaled_add(-idx * bl, &prev); - } - } - } + diff_op_1d_fallback(matrix, optype, prev, fut) } } #[inline(always)] #[allow(unused)] -pub(crate) fn diff_op_col_naive_matrix( - block: &Matrix, - blockend: &Matrix, - diag: &RowVector, +/// 2D diff fallback for when matrices are not slicable +pub(crate) fn diff_op_2d_fallback( + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -295,9 +211,6 @@ pub(crate) fn diff_op_col_naive_matrix= 2 * M); - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - let dx = if optype == OperatorType::H2 { 1.0 / (nx - 2) as Float } else { @@ -314,7 +227,11 @@ pub(crate) fn diff_op_col_naive_matrix( + matrix: &BlockMatrix, optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - diff_op_col_simd(block, diag, symmetry, optype) -} - -#[inline(always)] -pub(crate) fn diff_op_col_simd( - block: &'static [&'static [Float]], - diag: &'static [Float], - 
symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - #[cfg(not(feature = "f32"))] - type SimdT = packed_simd::f64x8; - #[cfg(feature = "f32")] - type SimdT = packed_simd::f32x16; - - let ny = prev.shape()[0]; - // How many elements that can be simdified - let simdified = SimdT::lanes() * (ny / SimdT::lanes()); - - let half_diag_width = (diag.len() - 1) / 2; - assert!(half_diag_width <= block.len()); - - let fut_base_ptr = fut.as_mut_ptr(); - let fut_stride = fut.strides()[1]; - let fut_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } - }; - - let prev_base_ptr = prev.as_ptr(); - let prev_stride = prev.strides()[1]; - let prev_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } - }; - - // Not algo necessary, but gives performance increase - assert_eq!(fut_stride, prev_stride); - - // First block - { - for (ifut, &bl) in block.iter().enumerate() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - let mut f = SimdT::splat(0.0); - for (iprev, &bl) in bl.iter().enumerate() { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); - } - f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; - } - } - } - } - - // Diagonal elements - { - for ifut in block.len()..nx - block.len() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - let mut f = SimdT::splat(0.0); - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); - } - f *= idx; - unsafe { - // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - for j in simdified..ny { - let mut f = 0.0; - for (id, &d) in diag.iter().enumerate() { - let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); - } - } - unsafe { - *fut_ptr(j, ifut) = idx * f; - } - } - } - } - - // End block - { - // Get blocks and corresponding offsets - // (rev to iterate in ifut increasing order) - for (bl, ifut) in block.iter().zip((0..nx).rev()) { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; - 
let mut f = SimdT::splat(0.0); - for (&bl, iprev) in bl.iter().zip((0..nx).rev()) { - f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); - } - f = if symmetry == Symmetry::Symmetric { - f * idx - } else { - -f * idx - }; - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } - } - - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (&bl, iprev) in bl.iter().zip((0..nx).rev()).rev() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = if symmetry == Symmetry::Symmetric { - f * idx - } else { - -f * idx - }; - } - } - } - } + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + for (prev, mut fut) in prev.outer_iter().zip(fut.outer_iter_mut()) { + let prev = &prev.as_slice().unwrap()[..nx]; + let fut = &mut fut.as_slice_mut().unwrap()[..nx]; + diff_op_1d_slice(matrix, optype, prev, fut) } } #[inline(always)] -fn dotproduct<'a>(u: impl Iterator, v: impl Iterator) -> Float { - u.zip(v).fold(0.0, |acc, (&u, &v)| { - #[cfg(feature = "fast-float")] - { - // We do not care about the order of multiplication nor addition - (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() - } - #[cfg(not(feature = "fast-float"))] - { - acc + u * v - } - }) +/// Dispatch based on strides +pub(crate) fn diff_op_2d( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + match (prev.strides(), fut.strides()) { + ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), + _ => diff_op_2d_fallback(matrix, optype, prev, fut), + } } +/* #[inline(always)] +/// Way to too much overhead with SIMD: +/// output SIMD oriented: +/// |S | = |P0 P1| |P0 P1| +/// |S | = a1|P0 P1| + b1|P0 P1| +/// |S | = |P0 P1| |P0 P1| +/// +/// | S | = |P0 P1| |P0 P1| +/// | S | = a2|P0 P1| + b1|P0 P1| +/// | S | = |P0 P1| |P0 P1| pub(crate) fn diff_op_col_matrix( - block: &Matrix, - block2: &Matrix, - diag: &RowVector, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, fut: ArrayViewMut2, @@ -615,7 +367,7 @@ pub(crate) fn diff_op_col_matrix { let prev = prev.slice(ndarray::s![.., ..N]); let (prevb, prevl) = prev.split_at(Axis(0), simdified); - for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(block.iter_rows()) { + for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(matrix.start.iter_rows()) { let fut = fut.as_slice_mut().unwrap(); let fut = &mut fut[..ny]; @@ -662,7 +414,7 @@ pub(crate) fn diff_op_col_matrix for (fut, prev) in fut.by_ref().zip(prev) { let mut f = SimdT::splat(0.0); - for (&d, prev) in diag.iter().zip(prev.axis_iter(Axis(1))) { + for (&d, prev) in matrix.diag.iter().zip(prev.axis_iter(Axis(1))) { let prev = prev.to_slice().unwrap(); let prev = SimdT::from_slice_unaligned(prev); f = prev.mul_adde(SimdT::splat(d), f); @@ -677,7 +429,7 @@ pub(crate) fn diff_op_col_matrix .zip(prevl.axis_iter(Axis(0))) { let mut f = 0.0; - for (&d, prev) in diag.iter().zip(prev) { + for (&d, prev) in matrix.diag.iter().zip(prev) { f += d * prev; } *fut = idx * f; @@ -687,7 +439,7 @@ pub(crate) fn diff_op_col_matrix // End block { - for (mut fut, &bl) in fut2.axis_iter_mut(Axis(1)).zip(block2.iter_rows()) { + for (mut fut, &bl) in fut2.axis_iter_mut(Axis(1)).zip(matrix.end.iter_rows()) { let fut = fut.as_slice_mut().unwrap(); let fut = &mut fut[..ny]; let mut fut = fut.chunks_exact_mut(SimdT::lanes()); @@ -720,94 +472,51 @@ pub(crate) fn diff_op_col_matrix } } } +*/ 
#[inline(always)] -pub(crate) fn diff_op_row( - block: &'static [&'static [Float]], - diag: &'static [Float], - symmetry: Symmetry, - optype: OperatorType, -) -> impl Fn(ArrayView2, ArrayViewMut2) { - #[inline(always)] - move |prev: ArrayView2, mut fut: ArrayViewMut2| { - assert_eq!(prev.shape(), fut.shape()); - let nx = prev.shape()[1]; - assert!(nx >= 2 * block.len()); - - assert_eq!(prev.strides()[1], 1); - assert_eq!(fut.strides()[1], 1); - - let dx = if optype == OperatorType::H2 { - 1.0 / (nx - 2) as Float - } else { - 1.0 / (nx - 1) as Float - }; - let idx = 1.0 / dx; - - for (prev, mut fut) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) +fn dotproduct<'a>( + u: impl IntoIterator, + v: impl IntoIterator, +) -> Float { + u.into_iter().zip(v.into_iter()).fold(0.0, |acc, (&u, &v)| { + #[cfg(feature = "fast-float")] { - let prev = prev.as_slice().unwrap(); - let fut = fut.as_slice_mut().unwrap(); - assert_eq!(prev.len(), fut.len()); - assert!(prev.len() >= 2 * block.len()); - - for (bl, f) in block.iter().zip(fut.iter_mut()) { - let diff = dotproduct(bl.iter(), prev[..bl.len()].iter()); - *f = diff * idx; - } - - // The window needs to be aligned to the diagonal elements, - // based on the block size - let window_elems_to_skip = block.len() - ((diag.len() - 1) / 2); - - for (window, f) in prev - .windows(diag.len()) - .skip(window_elems_to_skip) - .zip(fut.iter_mut().skip(block.len())) - .take(nx - 2 * block.len()) - { - let diff = dotproduct(diag.iter(), window.iter()); - *f = diff * idx; - } - - for (bl, f) in block.iter().zip(fut.iter_mut().rev()) { - let diff = dotproduct(bl.iter(), prev.iter().rev()); - - *f = idx - * if symmetry == Symmetry::Symmetric { - diff - } else { - -diff - }; - } + // We do not care about the order of multiplication nor addition + (FastFloat::from(acc) + FastFloat::from(u) * FastFloat::from(v)).into() } - } + #[cfg(not(feature = "fast-float"))] + { + acc + u * v + } + }) } #[cfg(feature = "sparse")] -pub(crate) fn sparse_from_block( - block: &[&[Float]], - diag: &[Float], - symmetry: Symmetry, +pub(crate) fn sparse_from_block( + matrix: &BlockMatrix, optype: OperatorType, n: usize, ) -> sprs::CsMat { - assert!(n >= 2 * block.len()); + assert!(n >= 2 * M); let nnz = { - let block_elems = block.iter().fold(0, |acc, x| { - acc + x - .iter() - .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }) - }); - - let diag_elems = diag + let blockstart_elems = matrix + .start .iter() .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); - 2 * block_elems + (n - 2 * block.len()) * diag_elems + let diag_elems = matrix + .diag + .iter() + .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); + + let blockend_elems = matrix + .end + .iter() + .fold(0, |acc, &x| if x != 0.0 { acc + 1 } else { acc }); + + blockstart_elems + (n - 2 * M) * diag_elems + blockend_elems }; let mut mat = sprs::TriMat::with_capacity((n, n), nnz); @@ -819,7 +528,7 @@ pub(crate) fn sparse_from_block( }; let idx = 1.0 / dx; - for (j, bl) in block.iter().enumerate() { + for (j, bl) in matrix.start.iter_rows().enumerate() { for (i, &b) in bl.iter().enumerate() { if b == 0.0 { continue; @@ -828,9 +537,9 @@ pub(crate) fn sparse_from_block( } } - for j in block.len()..n - block.len() { - let half_diag_len = diag.len() / 2; - for (&d, i) in diag.iter().zip(j - half_diag_len..) { + for j in M..n - M { + let half_diag_len = D / 2; + for (&d, i) in matrix.diag.iter().zip(j - half_diag_len..) 
{ if d == 0.0 { continue; } @@ -838,16 +547,12 @@ pub(crate) fn sparse_from_block( } } - for (bl, j) in block.iter().zip((0..n).rev()).rev() { - for (&b, i) in bl.iter().zip((0..n).rev()).rev() { + for (bl, j) in matrix.end.iter_rows().zip(n - M..) { + for (&b, i) in bl.iter().zip(n - N..) { if b == 0.0 { continue; } - if symmetry == Symmetry::AntiSymmetric { - mat.add_triplet(j, i, -b * idx); - } else { - mat.add_triplet(j, i, b * idx); - } + mat.add_triplet(j, i, b * idx); } } @@ -855,17 +560,22 @@ pub(crate) fn sparse_from_block( } #[cfg(feature = "sparse")] -pub(crate) fn h_matrix(diag: &[Float], n: usize, is_h2: bool) -> sprs::CsMat { +pub(crate) fn h_matrix( + hmatrix: &DiagonalMatrix, + n: usize, + is_h2: bool, +) -> sprs::CsMat { let h = if is_h2 { 1.0 / (n - 2) as Float } else { 1.0 / (n - 1) as Float }; - let nmiddle = n - 2 * diag.len(); - let iter = diag + let nmiddle = n - 2 * D; + let iter = hmatrix + .start .iter() - .chain(std::iter::repeat(&1.0).take(nmiddle)) - .chain(diag.iter().rev()) + .chain(std::iter::repeat(&hmatrix.diag).take(nmiddle)) + .chain(hmatrix.end.iter()) .map(|&x| h * x); let mut mat = sprs::TriMat::with_capacity((n, n), n); diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs index 6d66fea..b409bed 100644 --- a/sbp/src/operators/algos/constmatrix.rs +++ b/sbp/src/operators/algos/constmatrix.rs @@ -72,6 +72,10 @@ impl Matrix { ) -> impl ExactSizeIterator + DoubleEndedIterator { self.data.iter() } + #[inline(always)] + pub const fn row(&self, idx: usize) -> &[T; N] { + &self.data[idx] + } } impl ColVector { diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 417b799..167d36b 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -1,4 +1,6 @@ -use super::{SbpOperator1d, SbpOperator2d}; +use super::{ + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, +}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -7,76 +9,61 @@ pub struct SBP4; impl SBP4 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 17.0 / 48.0, 59.0 / 48.0, 43.0 / 48.0, 49.0 / 48.0, - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0, - ]; - - const DIAG_MATRIX: super::RowVector = - super::RowVector::new([[1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0]]); + const DIFF_DIAG: RowVector = RowVector::new([[ + 1.0 / 12.0, -2.0 / 3.0, 0.0, 2.0 / 3.0, -1.0 / 12.0 + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0], - &[-1.0/2.0, 0.0, 1.0/2.0], - &[4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0], - &[3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: super::Matrix = super::Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [-24.0/17.0, 59.0/34.0, -4.0/17.0, -3.0/34.0, 0.0, 0.0], [-1.0/2.0, 0.0, 1.0/2.0, 0.0, 0.0, 0.0], [4.0/43.0, -59.0/86.0, 0.0, 59.0/86.0, -4.0/43.0, 0.0], [3.0/98.0, 0.0, -59.0/98.0, 0.0, 32.0/49.0, -4.0/49.0] ]); - const BLOCKEND_MATRIX: super::Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + const DIFF_BLOCKEND: super::Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); + + const DIFF: BlockMatrix<4, 6, 5> = + BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, 
Self::DIFF_BLOCKEND); #[rustfmt::skip] - const D2DIAG: &'static [Float] = &[ + const D2DIAG: RowVector = RowVector::new([[ -1.0 / 12.0, 4.0 / 3.0, -5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0 - ]; + ]]); #[rustfmt::skip] - const D2BLOCK: &'static [&'static [Float]] = &[ - &[2.0, -5.0, 4.0, -1.0], - &[1.0, -2.0, 1.0], - &[-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0], - &[-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] - ]; + const D2BLOCK: Matrix = Matrix::new([ + [2.0, -5.0, 4.0, -1.0, 0.0, 0.0], + [1.0, -2.0, 1.0, 0.0, 0.0, 0.0], + [-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0, 0.0], + [-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] + ]); + const D2: BlockMatrix<4, 6, 5> = BlockMatrix::new( + Self::D2BLOCK, + Self::D2DIAG, + super::flip_ud(super::flip_lr(Self::D2BLOCK)), + ); } impl SbpOperator1d for SBP4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn d2(&self) -> Option<&dyn super::SbpOperator1d2> { @@ -84,76 +71,19 @@ impl SbpOperator1d for SBP4 { } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &SBP4::BLOCK_MATRIX, - &SBP4::BLOCKEND_MATRIX, - &SBP4::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} -fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { - let optype = super::OperatorType::Normal; - super::diff_op_col_naive_matrix( - &SBP4::BLOCK_MATRIX, - &SBP4::BLOCKEND_MATRIX, - &SBP4::DIAG_MATRIX, - optype, - prev, - fut, - ) -} - impl SbpOperator2d for SBP4 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * SBP4::BLOCK.len()); - - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - //diff_op_row(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); - diff_op_row_local(prev, fut) - } - ([1, _], [1, _]) => { - //diff_op_col(SBP4::BLOCK, SBP4::DIAG, symmetry, optype)(prev, fut); - diff_op_col_local(prev, fut) - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - SBP4.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } } impl super::SbpOperator1d2 for SBP4 { fn diff2(&self, prev: ArrayView1, mut fut: 
ArrayViewMut1) { - super::diff_op_1d( - Self::D2BLOCK, - Self::D2DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut.view_mut(), - ); + super::diff_op_1d(&Self::D2, OperatorType::Normal, prev, fut.view_mut()); let hi = (prev.len() - 1) as Float; fut.map_inplace(|x| *x *= hi) } @@ -162,13 +92,7 @@ impl super::SbpOperator1d2 for SBP4 { } #[cfg(feature = "sparse")] fn diff2_matrix(&self, n: usize) -> sprs::CsMat { - let mut m = super::sparse_from_block( - Self::D2BLOCK, - Self::D2DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ); + let mut m = super::sparse_from_block(&Self::D2, OperatorType::Normal, n); let hi = (n - 1) as Float; m.map_inplace(|v| v * hi); m diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index 32b392c..cfd7b8b 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -1,4 +1,6 @@ -use super::{diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d}; +use super::{ + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, +}; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -7,30 +9,15 @@ pub struct SBP8; impl SBP8 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 2.94890676177879e-01, 1.52572062389771e+00, 2.57452876984127e-01, 1.79811370149912e+00, 4.12708057760141e-01, 1.27848462301587e+00, 9.23295579805997e-01, 1.00933386085916e+00 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 - ]; - #[rustfmt::skip] - const DIAG_MATRIX: RowVector = Matrix::new([[ + const DIFF_DIAG: RowVector = Matrix::new([[ 3.57142857142857e-03, -3.80952380952381e-02, 2.00000000000000e-01, -8.00000000000000e-01, -0.00000000000000e+00, 8.00000000000000e-01, -2.00000000000000e-01, 3.80952380952381e-02, -3.57142857142857e-03 ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02], - &[-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03], - &[3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02], - &[1.28088632226564e-01, -4.19172130036008e-01, -5.57707021445779e-02, 0.00000000000000e+00, 1.24714160903055e-01, 2.81285212519100e-01, -3.94470423942641e-02, -1.96981310738430e-02], - &[-1.82119472519009e-02, 9.09986646154550e-02, -8.51090570277506e-02, -5.43362886365301e-01, 0.00000000000000e+00, 6.37392455438558e-01, -1.02950081118829e-01, 2.98964956216039e-02, -8.65364391190110e-03], - &[-7.93147196245203e-02, 2.22875323171502e-01, -2.07999824391436e-02, -3.95611167748401e-01, -2.05756876210586e-01, 0.00000000000000e+00, 5.45876519966127e-01, -9.42727926638298e-02, 2.97971812952850e-02, -2.79348574643297e-03], - &[2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 
8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03], - &[5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: Matrix = Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [-1.69554360443190e+00, 2.24741246341404e+00, -3.38931922601500e-02, -7.81028168126749e-01, 2.54881486107905e-02, 3.43865227388873e-01, -8.62858162633335e-02, -2.00150583315761e-02, 0.0, 0.0, 0.0, 0.0], [-4.34378988266985e-01, 0.00000000000000e+00, 9.18511925072956e-02, 4.94008626807984e-01, -2.46151762937235e-02, -1.86759403432935e-01, 5.27267838475813e-02, 7.16696483080115e-03, 0.0, 0.0, 0.0, 0.0], [3.88218088704253e-02, -5.44329744454984e-01, 0.00000000000000e+00, 3.89516189693211e-01, 1.36433486528546e-01, 1.03290582800845e-01, -1.79720579323281e-01, 5.59882558852296e-02, 0.0, 0.0, 0.0, 0.0], @@ -40,107 +27,40 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, -1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - const BLOCKEND_MATRIX: super::Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + const DIFF: BlockMatrix<8, 12, 9> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); } impl SbpOperator1d for SBP8 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - /* - let mut flipmatrix = SBP8::BLOCK_MATRIX; - flipmatrix *= &-1.0; - */ - - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &SBP8::BLOCK_MATRIX, - &SBP8::BLOCKEND_MATRIX, - //&flipmatrix, - &SBP8::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} - -fn diff_op_col_local(prev: ndarray::ArrayView2, fut: ndarray::ArrayViewMut2) { - let optype = super::OperatorType::Normal; - super::diff_op_col_matrix( - &SBP8::BLOCK_MATRIX, - &SBP8::BLOCKEND_MATRIX, - &SBP8::DIAG_MATRIX, - optype, - prev, - fut, - ) -} - impl SbpOperator2d for SBP8 { - fn diffxi(&self, 
prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * SBP8::BLOCK.len()); - - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - //diff_op_row(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); - diff_op_row_local(prev, fut); - } - ([1, _], [1, _]) => { - //diff_op_col(SBP8::BLOCK, SBP8::DIAG, symmetry, optype)(prev, fut); - diff_op_col_local(prev, fut) - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - SBP8.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } } #[test] diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index d2a4eb4..b9742bf 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -1,241 +1,66 @@ use super::{ - diff_op_col, diff_op_row, Matrix, RowVector, SbpOperator1d, SbpOperator2d, UpwindOperator1d, - UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; -use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis}; +use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; #[derive(Debug, Copy, Clone)] pub struct Upwind4; -/// Simdtype used in diff_simd_col -#[cfg(feature = "f32")] -type SimdT = packed_simd::f32x8; -#[cfg(not(feature = "f32"))] -type SimdT = packed_simd::f64x8; - -macro_rules! 
diff_simd_col_7_47 { - ($name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => { - #[inline(never)] - fn $name(prev: ArrayView2, mut fut: ArrayViewMut2) { - use std::slice; - assert_eq!(prev.shape(), fut.shape()); - assert_eq!(prev.stride_of(Axis(0)), 1); - assert_eq!(fut.stride_of(Axis(0)), 1); - let ny = prev.len_of(Axis(0)); - let nx = prev.len_of(Axis(1)); - assert!(nx >= 2 * $BLOCK.len()); - assert!(ny >= SimdT::lanes()); - assert!(ny % SimdT::lanes() == 0); - - let dx = 1.0 / (nx - 1) as Float; - let idx = 1.0 / dx; - - for j in (0..ny).step_by(SimdT::lanes()) { - let a = unsafe { - [ - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 0)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 1)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 2)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 3)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 4)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 5)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, 6)) as *const Float, - SimdT::lanes(), - )), - ] - }; - - for (i, bl) in $BLOCK.iter().enumerate() { - let b = idx - * (a[0] * bl[0] - + a[1] * bl[1] - + a[2] * bl[2] - + a[3] * bl[3] - + a[4] * bl[4] - + a[5] * bl[5] - + a[6] * bl[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, i)) as *mut Float, - SimdT::lanes(), - )); - } - } - - let mut a = a; - for i in $BLOCK.len()..nx - $BLOCK.len() { - // Push a onto circular buffer - a = [a[1], a[2], a[3], a[4], a[5], a[6], unsafe { - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, i + 3)) as *const Float, - SimdT::lanes(), - )) - }]; - let b = idx - * (a[0] * $DIAG[0] - + a[1] * $DIAG[1] - + a[2] * $DIAG[2] - + a[3] * $DIAG[3] - + a[4] * $DIAG[4] - + a[5] * $DIAG[5] - + a[6] * $DIAG[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, i)) as *mut Float, - SimdT::lanes(), - )); - } - } - - let a = unsafe { - [ - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 1)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 2)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 3)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 4)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 5)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 6)) as *const Float, - SimdT::lanes(), - )), - SimdT::from_slice_unaligned(slice::from_raw_parts( - prev.uget((j, nx - 7)) as *const Float, - SimdT::lanes(), - )), - ] - }; - - for (i, bl) in $BLOCK.iter().enumerate() { - let idx = if $symmetric { idx } else { -idx }; - let b = idx - * (a[0] * bl[0] - + a[1] * bl[1] - + a[2] * bl[2] - + a[3] * bl[3] - + a[4] * bl[4] - + a[5] * bl[5] - + a[6] * bl[6]); - unsafe { - b.write_to_slice_unaligned(slice::from_raw_parts_mut( - fut.uget_mut((j, nx - 1 - i)) as *mut Float, - SimdT::lanes(), - )); - } - } - } - } - 
}; -} - -diff_simd_col_7_47!(diff_simd_col, Upwind4::BLOCK, Upwind4::DIAG, false); -diff_simd_col_7_47!(diss_simd_col, Upwind4::DISS_BLOCK, Upwind4::DISS_DIAG, true); - impl Upwind4 { #[rustfmt::skip] - pub const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 49.0 / 144.0, 61.0 / 48.0, 41.0 / 48.0, 149.0 / 144.0 - ]; - #[rustfmt::skip] - const DIAG: &'static [Float] = &[ - -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 - ]; - #[rustfmt::skip] - const DIAG_MATRIX : RowVector = RowVector::new([ - [-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0] ]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], - &[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], - &[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], - &[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], - ]; - #[rustfmt::skip] - const BLOCK_MATRIX: Matrix = Matrix::new([ + const DIFF_BLOCK: Matrix = Matrix::new([ [ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0], [-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0], [ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], [ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], ]); - const BLOCKEND_MATRIX: Matrix = - super::flip_sign(super::flip_ud(super::flip_lr(Self::BLOCK_MATRIX))); + #[rustfmt::skip] + const DIFF_DIAG: RowVector = RowVector::new([[ + -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 + ]]); + const DIFF_BLOCKEND: Matrix = + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); + + const DIFF: BlockMatrix<4, 7, 7> = + BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0], - &[ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0], - &[-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], - &[3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], - ]; - + const DISS_BLOCK: Matrix = Matrix::new([ + [-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0], + [ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0], + [-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0], + [3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0], + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 7] = &[ + const DISS_DIAG: RowVector = Matrix::new([[ 1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0 - ]; + ]]); + const DISS_BLOCKEND: Matrix = super::flip_ud(super::flip_lr(Self::DISS_BLOCK)); + + const DISS: BlockMatrix<4, 7, 7> = + BlockMatrix::new(Self::DISS_BLOCK, Self::DISS_DIAG, Self::DISS_BLOCKEND); } impl SbpOperator1d for Upwind4 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d_matrix( - &Self::BLOCK_MATRIX, - &Self::BLOCKEND_MATRIX, - &Self::DIAG_MATRIX, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } 
#[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -243,53 +68,14 @@ impl SbpOperator1d for Upwind4 { } } -fn diff_op_row_local(prev: ndarray::ArrayView2, mut fut: ndarray::ArrayViewMut2) { - for (p, mut f) in prev - .axis_iter(ndarray::Axis(0)) - .zip(fut.axis_iter_mut(ndarray::Axis(0))) - { - super::diff_op_1d_slice_matrix( - &Upwind4::BLOCK_MATRIX, - &Upwind4::BLOCKEND_MATRIX, - &Upwind4::DIAG_MATRIX, - super::OperatorType::Normal, - p.as_slice().unwrap(), - f.as_slice_mut().unwrap(), - ) - } -} - impl SbpOperator2d for Upwind4 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row_local(prev, fut), - ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { - diff_simd_col(prev, fut) - } - ([1, _], [1, _]) => diff_op_col( - Upwind4::BLOCK, - Upwind4::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - )(prev, fut), - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut); } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -385,14 +171,7 @@ fn upwind4_test() { impl UpwindOperator1d for Upwind4 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::Normal, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -401,53 +180,20 @@ impl UpwindOperator1d for Upwind4 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::Normal, n) } } impl UpwindOperator2d for Upwind4 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4::BLOCK.len()); - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => diff_op_row( - Upwind4::DISS_BLOCK, - Upwind4::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - )(prev, fut), - ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => { - diss_simd_col(prev, fut); - } - ([1, _], [1, _]) => diff_op_col( - Upwind4::DISS_BLOCK, - Upwind4::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - )(prev, fut), - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in 
prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } #[test] diff --git a/sbp/src/operators/upwind4h2.rs b/sbp/src/operators/upwind4h2.rs index 0ef5f9b..e1cbf60 100644 --- a/sbp/src/operators/upwind4h2.rs +++ b/sbp/src/operators/upwind4h2.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,49 +10,52 @@ pub struct Upwind4h2; impl Upwind4h2 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<4> = DiagonalMatrix::new([ 0.91e2/0.720e3, 0.325e3/0.384e3, 0.595e3/0.576e3, 0.1909e4/0.1920e4, - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_BLOCK: Matrix = Matrix::new([ + [-3.95604395604396e+00, 5.41758241758242e+00, -1.94505494505495e+00, 4.83516483516484e-01, 0.0, 0.0, 0.0], + [-8.09025641025641e-01, 0.00000000000000e+00, 1.03948717948718e+00, -2.47384615384615e-01, 1.69230769230769e-02, 0.0, 0.0], + [2.37983193277311e-01, -8.51680672268908e-01, 0.00000000000000e+00, 7.35966386554622e-01, -1.36134453781513e-01, 1.38655462184874e-02, 0.0], + [-6.14632442814737e-02, 2.10581456259822e-01, -7.64623712240265e-01, 0.00000000000000e+00, 7.42535358826611e-01, -1.41435306443164e-01, 1.44054478784704e-02], + ]); + #[rustfmt::skip] + const DIFF_DIAG: RowVector = RowVector::new([[ -1.43229166666667e-02, 1.40625000000000e-01, -7.38281250000000e-01, 0.00000000000000e+00, 7.38281250000000e-01, -1.40625000000000e-01, 1.43229166666667e-02 - ]; - #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-3.95604395604396e+00, 5.41758241758242e+00, -1.94505494505495e+00, 4.83516483516484e-01], - &[-8.09025641025641e-01, 0.00000000000000e+00, 1.03948717948718e+00, -2.47384615384615e-01, 1.69230769230769e-02], - &[2.37983193277311e-01, -8.51680672268908e-01, 0.00000000000000e+00, 7.35966386554622e-01, -1.36134453781513e-01, 1.38655462184874e-02], - &[-6.14632442814737e-02, 2.10581456259822e-01, -7.64623712240265e-01, 0.00000000000000e+00, 7.42535358826611e-01, -1.41435306443164e-01, 1.44054478784704e-02], - ]; + ]]); + const DIFF: BlockMatrix<4, 7, 7> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-2.76989342315976e-01, 5.19355016842454e-01, -3.46236677894969e-01, 1.03871003368491e-01], - &[7.75570158484731e-02, -1.62342481638964e-01, 1.47715500579822e-01, -7.98531117124082e-02, 1.69230769230769e-02], - &[-4.23630758836198e-02, 1.21027405937249e-01, -1.91609307039399e-01, 1.82272708078206e-01, -8.31932773109244e-02, 1.38655462184874e-02], - &[1.32037874021759e-02, -6.79734450144910e-02, 1.89370108794365e-01, -2.78654929966754e-01, 2.16081718177056e-01, -8.64326872708224e-02, 1.44054478784704e-02], - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-2.76989342315976e-01, 5.19355016842454e-01, -3.46236677894969e-01, 1.03871003368491e-01, 0.0, 0.0, 0.0], + [7.75570158484731e-02, -1.62342481638964e-01, 
1.47715500579822e-01, -7.98531117124082e-02, 1.69230769230769e-02, 0.0, 0.0], + [-4.23630758836198e-02, 1.21027405937249e-01, -1.91609307039399e-01, 1.82272708078206e-01, -8.31932773109244e-02, 1.38655462184874e-02, 0.0], + [1.32037874021759e-02, -6.79734450144910e-02, 1.89370108794365e-01, -2.78654929966754e-01, 2.16081718177056e-01, -8.64326872708224e-02, 1.44054478784704e-02], + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 7] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 1.43229166666667e-02, -8.59375000000000e-02, 2.14843750000000e-01, -2.86458333333333e-01, 2.14843750000000e-01, -8.59375000000000e-02, 1.43229166666667e-02, - ]; + ]]); + const DISS: BlockMatrix<4, 7, 7> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind4h2 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, OperatorType::H2, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } fn is_h2(&self) -> bool { true @@ -59,17 +63,11 @@ impl SbpOperator1d for Upwind4h2 { #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DIFF, OperatorType::H2, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -78,80 +76,28 @@ impl SbpOperator1d for Upwind4h2 { } impl SbpOperator2d for Upwind4h2 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4h2::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind4h2::BLOCK, Upwind4h2::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind4h2::BLOCK, Upwind4h2::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4h2.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } } impl UpwindOperator2d for Upwind4h2 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind4h2::BLOCK.len()); - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row( - Upwind4h2::DISS_BLOCK, - Upwind4h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col( - Upwind4h2::DISS_BLOCK, - Upwind4h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by 
row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind4h2.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } #[test] @@ -181,14 +127,7 @@ fn upwind4h2_test() { impl UpwindOperator1d for Upwind4h2 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::H2, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -197,12 +136,6 @@ impl UpwindOperator1d for Upwind4h2 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::H2, n) } } diff --git a/sbp/src/operators/upwind9.rs b/sbp/src/operators/upwind9.rs index 596a354..c6d374e 100644 --- a/sbp/src/operators/upwind9.rs +++ b/sbp/src/operators/upwind9.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,72 +10,69 @@ pub struct Upwind9; impl Upwind9 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 1070017.0/3628800.0, 5537111.0/3628800.0, 103613.0/403200.0, 261115.0/145152.0, 298951.0/725760.0, 515677.0/403200.0, 3349879.0/3628800.0, 3662753.0/3628800.0 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_DIAG: RowVector = RowVector::new([[ -1.0/1260.0, 5.0/504.0, -5.0/84.0, 5.0/21.0, -5.0/6.0, 0.0, 5.0/6.0, -5.0/21.0, 5.0/84.0, -5.0/504.0, 1.0/1260.0, - ]; + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-1.69567399396458e+00, 2.29023358159400e+00, -2.16473500425698e-01, -5.05879766354449e-01, -1.01161106778154e-01, 2.59147072064383e-01, 1.93922119400659e-02, -4.95844980755642e-02], - &[-4.42575354959737e-01, 0.00000000000000e+00, 1.91582959381899e-01, 2.82222626681305e-01, 1.12083989713257e-01, -1.51334868892111e-01, -2.23600502721044e-02, 3.03806983474913e-02], - &[2.48392603571843e-01, -1.13758367065272e+00, 0.00000000000000e+00, 1.95334726810969e+00, -1.58879011773212e+00, 3.93797129320378e-01, 2.52140821030291e-01, -1.21304033647356e-01], - &[8.29213105268236e-02, -2.39388470313226e-01, -2.79038666398460e-01, 0.00000000000000e+00, 3.43018053395471e-01, 1.10370852514749e-01, 1.72029988649808e-03, -2.00445645303789e-02, 4.41184918522490e-04], - &[7.24159504343116e-02, -4.15199475743626e-01, 9.91181694804303e-01, -1.49802407438608e+00, 0.00000000000000e+00, 1.30188867830442e+00, -6.03535071819214e-01, 1.73429775718218e-01, -2.40842144699299e-02, 1.92673715759439e-03], - &[-5.97470838462221e-02, 1.80551858630298e-01, -7.91241454636765e-02, -1.55240829877729e-01, -4.19298775383066e-01, 0.00000000000000e+00, 6.42287612546289e-01, -1.48833147569152e-01, 4.65407609802260e-02, -7.75679349670433e-03, 6.20543479736347e-04], - &[-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, 
-3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04], - &[1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], - ]; + const DIFF_BLOCK: Matrix = Matrix::new([ + [-1.69567399396458e+00, 2.29023358159400e+00, -2.16473500425698e-01, -5.05879766354449e-01, -1.01161106778154e-01, 2.59147072064383e-01, 1.93922119400659e-02, -4.95844980755642e-02, 0.0, 0.0, 0.0, 0.0, 0.0], + [-4.42575354959737e-01, 0.00000000000000e+00, 1.91582959381899e-01, 2.82222626681305e-01, 1.12083989713257e-01, -1.51334868892111e-01, -2.23600502721044e-02, 3.03806983474913e-02, 0.0, 0.0, 0.0, 0.0,0.0], + [2.48392603571843e-01, -1.13758367065272e+00, 0.00000000000000e+00, 1.95334726810969e+00, -1.58879011773212e+00, 3.93797129320378e-01, 2.52140821030291e-01, -1.21304033647356e-01, 0.0, 0.0, 0.0, 0.0, 0.0], + [8.29213105268236e-02, -2.39388470313226e-01, -2.79038666398460e-01, 0.00000000000000e+00, 3.43018053395471e-01, 1.10370852514749e-01, 1.72029988649808e-03, -2.00445645303789e-02, 4.41184918522490e-04, 0.0, 0.0, 0.0, 0.0], + [7.24159504343116e-02, -4.15199475743626e-01, 9.91181694804303e-01, -1.49802407438608e+00, 0.00000000000000e+00, 1.30188867830442e+00, -6.03535071819214e-01, 1.73429775718218e-01, -2.40842144699299e-02, 1.92673715759439e-03, 0.0, 0.0, 0.0], + [-5.97470838462221e-02, 1.80551858630298e-01, -7.91241454636765e-02, -1.55240829877729e-01, -4.19298775383066e-01, 0.00000000000000e+00, 6.42287612546289e-01, -1.48833147569152e-01, 4.65407609802260e-02, -7.75679349670433e-03, 6.20543479736347e-04, 0.0, 0.0], + [-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, -3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04, 0.0], + [1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], + ]); + const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-3.99020778658945e-04, 2.05394169917502e-03, -4.24493243399805e-03, 4.38126393542801e-03, -2.18883813216888e-03, 2.98565988131608e-04, 1.38484104084115e-04, -3.94643819928825e-05], - &[3.96913216138553e-04, -2.28230530115522e-03, 5.43069719436758e-03, -6.81086901935894e-03, 4.69064759201504e-03, -1.61429862514855e-03, 1.62083873811316e-04, 2.71310693302277e-05], - &[-4.87084939816571e-03, 3.22464611075207e-02, -9.06094757860846e-02, 1.39830191253413e-01, -1.27675500367419e-01, 6.87310321912961e-02, -2.00917702215270e-02, 2.43991122096699e-03], - &[7.18155125886276e-04, -5.77715378536685e-03, 1.99749582302141e-02, -3.87940986951101e-02, 4.62756436981388e-02, -3.46770570075288e-02, 1.59058082995305e-02, -4.06744078428648e-03, 4.41184918522490e-04], - &[-1.56687484682703e-03, 1.73758484693946e-02, 
-7.96515646886111e-02, 2.02094401829054e-01, -3.16098733124618e-01, 3.17999240131250e-01, -2.06522928911140e-01, 8.37112455598470e-02, -1.92673715759439e-02, 1.92673715759439e-03], - &[6.88352254356072e-05, -1.92595810396278e-03, 1.38098624496279e-02, -4.87746083763075e-02, 1.02417890394006e-01, -1.38292226669620e-01, 1.23829022892659e-01, -7.34723830823462e-02, 2.79244565881356e-02, -6.20543479736347e-03, 6.20543479736347e-04], - &[4.42345367100640e-05, 2.67913080025652e-04, -5.59301314813691e-03, 3.09954862110834e-02, -9.21529346596015e-02, 1.71559035817103e-01, -2.12738289547735e-01, 1.79835101537893e-01, -1.03167905467630e-01, 3.86879645503614e-02, -8.59732545563586e-03, 8.59732545563586e-04], - &[-1.15289127131636e-05, 4.10149803795578e-05, 6.21188131452618e-04, -7.24912245235322e-03, 3.41622279353287e-02, -9.30972311856124e-02, 1.64473506705108e-01, -1.98013074867399e-01, 1.65121699443015e-01, -9.43552568245798e-02, 3.53832213092174e-02, -7.86293806871498e-03, 7.86293806871498e-04] - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-3.99020778658945e-04, 2.05394169917502e-03, -4.24493243399805e-03, 4.38126393542801e-03, -2.18883813216888e-03, 2.98565988131608e-04, 1.38484104084115e-04, -3.94643819928825e-05, 0.0, 0.0, 0.0, 0.0, 0.0], + [3.96913216138553e-04, -2.28230530115522e-03, 5.43069719436758e-03, -6.81086901935894e-03, 4.69064759201504e-03, -1.61429862514855e-03, 1.62083873811316e-04, 2.71310693302277e-05, 0.0, 0.0, 0.0, 0.0, 0.0 ], + [-4.87084939816571e-03, 3.22464611075207e-02, -9.06094757860846e-02, 1.39830191253413e-01, -1.27675500367419e-01, 6.87310321912961e-02, -2.00917702215270e-02, 2.43991122096699e-03, 0.0, 0.0, 0.0, 0.0, 0.0], + [7.18155125886276e-04, -5.77715378536685e-03, 1.99749582302141e-02, -3.87940986951101e-02, 4.62756436981388e-02, -3.46770570075288e-02, 1.59058082995305e-02, -4.06744078428648e-03, 4.41184918522490e-04, 0.0, 0.0, 0.0, 0.0], + [-1.56687484682703e-03, 1.73758484693946e-02, -7.96515646886111e-02, 2.02094401829054e-01, -3.16098733124618e-01, 3.17999240131250e-01, -2.06522928911140e-01, 8.37112455598470e-02, -1.92673715759439e-02, 1.92673715759439e-03, 0.0, 0.0, 0.0], + [6.88352254356072e-05, -1.92595810396278e-03, 1.38098624496279e-02, -4.87746083763075e-02, 1.02417890394006e-01, -1.38292226669620e-01, 1.23829022892659e-01, -7.34723830823462e-02, 2.79244565881356e-02, -6.20543479736347e-03, 6.20543479736347e-04, 0.0, 0.0], + [4.42345367100640e-05, 2.67913080025652e-04, -5.59301314813691e-03, 3.09954862110834e-02, -9.21529346596015e-02, 1.71559035817103e-01, -2.12738289547735e-01, 1.79835101537893e-01, -1.03167905467630e-01, 3.86879645503614e-02, -8.59732545563586e-03, 8.59732545563586e-04, 0.0], + [-1.15289127131636e-05, 4.10149803795578e-05, 6.21188131452618e-04, -7.24912245235322e-03, 3.41622279353287e-02, -9.30972311856124e-02, 1.64473506705108e-01, -1.98013074867399e-01, 1.65121699443015e-01, -9.43552568245798e-02, 3.53832213092174e-02, -7.86293806871498e-03, 7.86293806871498e-04] + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 1.0/1260.0, -1.0/126.0, 1.0/28.0, -2.0/21.0, 1.0/6.0, -1.0/5.0, 1.0/6.0, -2.0/21.0, 1.0/28.0, -1.0/126.0, 1.0/1260.0, - ]; + ]]); + const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind9 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - 
super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::Normal, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::Normal, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) } fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -83,35 +81,14 @@ impl SbpOperator1d for Upwind9 { } impl SbpOperator2d for Upwind9 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::Normal; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9::BLOCK, Upwind9::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9::BLOCK, Upwind9::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -119,14 +96,7 @@ impl SbpOperator2d for Upwind9 { impl UpwindOperator1d for Upwind9 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::Normal, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { @@ -135,46 +105,18 @@ impl UpwindOperator1d for Upwind9 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::Normal, - n, - ) + super::sparse_from_block(&Self::DISS, super::OperatorType::Normal, n) } } impl UpwindOperator2d for Upwind9 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9::BLOCK.len()); - - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::Normal; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9::DISS_BLOCK, Upwind9::DISS_DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9::DISS_BLOCK, Upwind9::DISS_DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::Normal, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - 
&Self - } } #[test] diff --git a/sbp/src/operators/upwind9h2.rs b/sbp/src/operators/upwind9h2.rs index df2e284..f099008 100644 --- a/sbp/src/operators/upwind9h2.rs +++ b/sbp/src/operators/upwind9h2.rs @@ -1,5 +1,6 @@ use super::{ - diff_op_col, diff_op_row, SbpOperator1d, SbpOperator2d, UpwindOperator1d, UpwindOperator2d, + BlockMatrix, DiagonalMatrix, Matrix, OperatorType, RowVector, SbpOperator1d, SbpOperator2d, + UpwindOperator1d, UpwindOperator2d, }; use crate::Float; use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2}; @@ -9,57 +10,60 @@ pub struct Upwind9h2; impl Upwind9h2 { #[rustfmt::skip] - const HBLOCK: &'static [Float] = &[ + const H: DiagonalMatrix<8> = DiagonalMatrix::new([ 0.13528301e8/0.97297200e8, 0.189103813e9/0.232243200e9, 0.125336387e9/0.116121600e9, 0.24412231e8/0.25804800e8, 0.11976149e8/0.11612160e8, 0.27510113e8/0.27869184e8, 0.142384289e9/0.141926400e9, 0.3018054133e10/0.3019161600e10 - ]; + ]); #[rustfmt::skip] - const DIAG: &'static [Float] = &[ + const DIFF_DIAG: RowVector = RowVector::new([[ -7.93650793650794e-04, 9.92063492063492e-03, -5.95238095238095e-02, 2.38095238095238e-01, -8.33333333333333e-01, 0.00000000000000e+00, 8.33333333333333e-01, -2.38095238095238e-01, 5.95238095238095e-02, -9.92063492063492e-03, 7.93650793650794e-04 - ]; + ]]); #[rustfmt::skip] - const BLOCK: &'static [&'static [Float]] = &[ - &[-3.59606132359119e+00, 4.89833536312447e+00, -1.88949383693789e+00, 5.47742030095865e-01, 1.98323891961440e-01, -1.81919774755160e-01, 9.75536252790282e-03, 1.33182875745630e-02], - &[-8.36438764514407e-01, 0.00000000000000e+00, 1.13444534067338e+00, -2.98589126517177e-01, -5.64158604558766e-02, 6.76718494319307e-02, -7.11434545077044e-03, -3.55909316707803e-03], - &[2.43402051702672e-01, -8.55808694890095e-01, 0.00000000000000e+00, 7.10023434316597e-01, -8.34706840757189e-02, -2.34098045062645e-02, 9.87675724649922e-03, -6.13059793689851e-04], - &[-8.05029896098980e-02, 2.56994775122436e-01, -8.10083655220697e-01, 0.00000000000000e+00, 7.86933049074895e-01, -1.76732206311904e-01, 2.52232469669152e-02, -2.67114375632805e-03, 8.38923734582063e-04], - &[-2.67370674924706e-02, 4.45404208225958e-02, 8.73562441688813e-02, -7.21839392529084e-01, 0.00000000000000e+00, 7.74603212862825e-01, -2.00161722480411e-01, 5.10878939438559e-02, -9.61911880020865e-03, 7.69529504016692e-04], - &[2.56244589039153e-02, -5.58209474181657e-02, 2.55972803937942e-02, 1.69377044771715e-01, -8.09310829929974e-01, 0.00000000000000e+00, 8.33417128898551e-01, -2.39938756878570e-01, 6.03007337701593e-02, -1.00501222950266e-02, 8.04009783602125e-04], - &[-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04], - &[-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] - ]; + const DIFF_BLOCK: Matrix = Matrix::new([ + [-3.59606132359119e+00, 4.89833536312447e+00, -1.88949383693789e+00, 5.47742030095865e-01, 1.98323891961440e-01, -1.81919774755160e-01, 9.75536252790282e-03, 1.33182875745630e-02, 0.0, 0.0, 0.0, 0.0, 0.0], + [-8.36438764514407e-01, 0.00000000000000e+00, 1.13444534067338e+00, -2.98589126517177e-01, 
-5.64158604558766e-02, 6.76718494319307e-02, -7.11434545077044e-03, -3.55909316707803e-03, 0.0, 0.0, 0.0, 0.0, 0.0], + [2.43402051702672e-01, -8.55808694890095e-01, 0.00000000000000e+00, 7.10023434316597e-01, -8.34706840757189e-02, -2.34098045062645e-02, 9.87675724649922e-03, -6.13059793689851e-04, 0.0, 0.0, 0.0, 0.0, 0.0], + [-8.05029896098980e-02, 2.56994775122436e-01, -8.10083655220697e-01, 0.00000000000000e+00, 7.86933049074895e-01, -1.76732206311904e-01, 2.52232469669152e-02, -2.67114375632805e-03, 8.38923734582063e-04, 0.0, 0.0, 0.0, 0.0], + [-2.67370674924706e-02, 4.45404208225958e-02, 8.73562441688813e-02, -7.21839392529084e-01, 0.00000000000000e+00, 7.74603212862825e-01, -2.00161722480411e-01, 5.10878939438559e-02, -9.61911880020865e-03, 7.69529504016692e-04, 0.0, 0.0, 0.0], + [2.56244589039153e-02, -5.58209474181657e-02, 2.55972803937942e-02, 1.69377044771715e-01, -8.09310829929974e-01, 0.00000000000000e+00, 8.33417128898551e-01, -2.39938756878570e-01, 6.03007337701593e-02, -1.00501222950266e-02, 8.04009783602125e-04, 0.0, 0.0], + [-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04, 0.0], + [-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] + ]); + const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DIFF_BLOCK, + Self::DIFF_DIAG, + super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), + ); #[rustfmt::skip] - const DISS_BLOCK: &'static [&'static [Float]] = &[ - &[-9.67684432055993e-03, 2.46709290489728e-02, -3.59750445192099e-02, 3.68391266633410e-02, -2.15642540957128e-02, 6.31810399460400e-03, -5.50815969583136e-04, -6.12008018523449e-05], - &[4.21280289799980e-03, -1.11651391003720e-02, 1.78139108670387e-02, -2.04287733194061e-02, 1.39229889120575e-02, -5.16220078628933e-03, 8.08589544070426e-04, -2.17901509897605e-06], - &[-4.63425679136442e-03, 1.34385494509234e-02, -2.60541352669482e-02, 3.74164435120410e-02, -3.36394001786921e-02, 1.82199905150995e-02, -5.42550576921788e-03, 6.78314528158575e-04], - &[5.41433680102586e-03, -1.75829845731036e-02, 4.26893646894443e-02, -7.75663958780684e-02, 9.06986463369770e-02, -6.74666360933304e-02, 3.07997352073168e-02, -7.82499022484376e-03, 8.38923734582063e-04], - &[-2.90718839510249e-03, 1.09922241766815e-02, -3.52053141560315e-02, 8.31962208882433e-02, -1.27244413706703e-01, 1.27078393791315e-01, -8.23947157593937e-02, 3.34105586971409e-02, -7.69529504016692e-03, 7.69529504016692e-04], - &[8.89941714023592e-04, -4.25818033750240e-03, 1.99225160493104e-02, -6.46588399513875e-02, 1.32772390609268e-01, -1.78508501143865e-01, 1.59999344843642e-01, -9.51030239931668e-02, 3.61804402620956e-02, -8.04009783602125e-03, 8.04009783602125e-04], - &[-7.63397185185940e-05, 6.56275990492236e-04, -5.83721252683346e-03, 2.90439093207059e-02, -8.47041434795334e-02, 1.57429980520300e-01, -1.95480070025582e-01, 1.65419875422483e-01, -9.49318221478776e-02, 3.55994333054541e-02, -7.91098517898980e-03, 7.91098517898980e-04], - &[-8.51254383837181e-06, -1.77491211003807e-06, 7.32410586432031e-04, -7.40542710012764e-03, 3.44704736857857e-02, -9.39121496781828e-02, 
1.66014456295034e-01, -1.99926171069111e-01, 1.66727824560197e-01, -9.52730426058266e-02, 3.57273909771850e-02, -7.93942021715222e-03, 7.93942021715222e-04] - ]; + const DISS_BLOCK: Matrix = Matrix::new([ + [-9.67684432055993e-03, 2.46709290489728e-02, -3.59750445192099e-02, 3.68391266633410e-02, -2.15642540957128e-02, 6.31810399460400e-03, -5.50815969583136e-04, -6.12008018523449e-05, 0.0, 0.0, 0.0, 0.0, 0.0], + [4.21280289799980e-03, -1.11651391003720e-02, 1.78139108670387e-02, -2.04287733194061e-02, 1.39229889120575e-02, -5.16220078628933e-03, 8.08589544070426e-04, -2.17901509897605e-06, 0.0, 0.0, 0.0, 0.0, 0.0], + [-4.63425679136442e-03, 1.34385494509234e-02, -2.60541352669482e-02, 3.74164435120410e-02, -3.36394001786921e-02, 1.82199905150995e-02, -5.42550576921788e-03, 6.78314528158575e-04, 0.0, 0.0, 0.0, 0.0, 0.0], + [5.41433680102586e-03, -1.75829845731036e-02, 4.26893646894443e-02, -7.75663958780684e-02, 9.06986463369770e-02, -6.74666360933304e-02, 3.07997352073168e-02, -7.82499022484376e-03, 8.38923734582063e-04, 0.0, 0.0, 0.0, 0.0], + [-2.90718839510249e-03, 1.09922241766815e-02, -3.52053141560315e-02, 8.31962208882433e-02, -1.27244413706703e-01, 1.27078393791315e-01, -8.23947157593937e-02, 3.34105586971409e-02, -7.69529504016692e-03, 7.69529504016692e-04, 0.0, 0.0, 0.0], + [8.89941714023592e-04, -4.25818033750240e-03, 1.99225160493104e-02, -6.46588399513875e-02, 1.32772390609268e-01, -1.78508501143865e-01, 1.59999344843642e-01, -9.51030239931668e-02, 3.61804402620956e-02, -8.04009783602125e-03, 8.04009783602125e-04, 0.0, 0.0], + [-7.63397185185940e-05, 6.56275990492236e-04, -5.83721252683346e-03, 2.90439093207059e-02, -8.47041434795334e-02, 1.57429980520300e-01, -1.95480070025582e-01, 1.65419875422483e-01, -9.49318221478776e-02, 3.55994333054541e-02, -7.91098517898980e-03, 7.91098517898980e-04, 0.0], + [-8.51254383837181e-06, -1.77491211003807e-06, 7.32410586432031e-04, -7.40542710012764e-03, 3.44704736857857e-02, -9.39121496781828e-02, 1.66014456295034e-01, -1.99926171069111e-01, 1.66727824560197e-01, -9.52730426058266e-02, 3.57273909771850e-02, -7.93942021715222e-03, 7.93942021715222e-04] + ]); #[rustfmt::skip] - const DISS_DIAG: &'static [Float; 11] = &[ + const DISS_DIAG: RowVector = RowVector::new([[ 7.93650793650794e-04, -7.93650793650794e-03, 3.57142857142857e-02, -9.52380952380952e-02, 1.66666666666667e-01, -2.00000000000000e-01, 1.66666666666667e-01, -9.52380952380952e-02, 3.57142857142857e-02, -7.93650793650794e-03, 7.93650793650794e-04 - ]; + ]]); + const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + Self::DISS_BLOCK, + Self::DISS_DIAG, + super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), + ); } impl SbpOperator1d for Upwind9h2 { fn diff(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DIFF, super::OperatorType::H2, prev, fut) } fn h(&self) -> &'static [Float] { - Self::HBLOCK + &Self::H.start } fn is_h2(&self) -> bool { true @@ -67,17 +71,11 @@ impl SbpOperator1d for Upwind9h2 { #[cfg(feature = "sparse")] fn diff_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::BLOCK, - Self::DIAG, - super::Symmetry::AntiSymmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DIFF, super::OperatorType::H2, n) } #[cfg(feature = "sparse")] fn h_matrix(&self, n: usize) -> sprs::CsMat { - super::h_matrix(Self::HBLOCK, n, self.is_h2()) + super::h_matrix(&Self::H, n, self.is_h2()) 
} fn upwind(&self) -> Option<&dyn UpwindOperator1d> { @@ -86,35 +84,14 @@ impl SbpOperator1d for Upwind9h2 { } impl SbpOperator2d for Upwind9h2 { - fn diffxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn diffxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9h2::BLOCK.len()); - let symmetry = super::Symmetry::AntiSymmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row(Upwind9h2::BLOCK, Upwind9h2::DIAG, symmetry, optype)(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col(Upwind9h2::BLOCK, Upwind9h2::DIAG, symmetry, optype)(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9h2.diff(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DIFF, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn SbpOperator1d { &Self } - fn op_eta(&self) -> &dyn SbpOperator1d { - &Self - } fn upwind(&self) -> Option> { Some(Box::new(Self)) } @@ -147,14 +124,7 @@ fn upwind9h2_test() { impl UpwindOperator1d for Upwind9h2 { fn diss(&self, prev: ArrayView1, fut: ArrayViewMut1) { - super::diff_op_1d( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - prev, - fut, - ) + super::diff_op_1d(&Self::DISS, super::OperatorType::H2, prev, fut) } fn as_sbp(&self) -> &dyn SbpOperator1d { self @@ -162,54 +132,17 @@ impl UpwindOperator1d for Upwind9h2 { #[cfg(feature = "sparse")] fn diss_matrix(&self, n: usize) -> sprs::CsMat { - super::sparse_from_block( - Self::DISS_BLOCK, - Self::DISS_DIAG, - super::Symmetry::Symmetric, - super::OperatorType::H2, - n, - ) + super::sparse_from_block(&Self::DISS, OperatorType::H2, n) } } impl UpwindOperator2d for Upwind9h2 { - fn dissxi(&self, prev: ArrayView2, mut fut: ArrayViewMut2) { + fn dissxi(&self, prev: ArrayView2, fut: ArrayViewMut2) { assert_eq!(prev.shape(), fut.shape()); - assert!(prev.shape()[1] >= 2 * Upwind9h2::BLOCK.len()); - let symmetry = super::Symmetry::Symmetric; - let optype = super::OperatorType::H2; - - match (prev.strides(), fut.strides()) { - ([_, 1], [_, 1]) => { - diff_op_row( - Upwind9h2::DISS_BLOCK, - Upwind9h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([1, _], [1, _]) => { - diff_op_col( - Upwind9h2::DISS_BLOCK, - Upwind9h2::DISS_DIAG, - symmetry, - optype, - )(prev, fut); - } - ([_, _], [_, _]) => { - // Fallback, work row by row - for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) { - Upwind9h2.diss(r0, r1); - } - } - _ => unreachable!("Should only be two elements in the strides vectors"), - } + super::diff_op_2d(&Self::DISS, OperatorType::H2, prev, fut) } fn op_xi(&self) -> &dyn UpwindOperator1d { &Self } - fn op_eta(&self) -> &dyn UpwindOperator1d { - &Self - } } From 299b4f8083a76d98655dfc7d03d9b6ca50e18ee4 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:13:10 +0100 Subject: [PATCH 25/35] ensure FastFloat flag works --- sbp/src/operators/algos.rs | 30 +++++++++++++----------------- sbp/src/operators/traditional4.rs | 4 ++-- sbp/src/operators/traditional8.rs | 2 +- sbp/src/operators/upwind4.rs | 4 ++-- sbp/src/operators/upwind4h2.rs | 4 ++-- sbp/src/operators/upwind9.rs | 4 ++-- sbp/src/operators/upwind9h2.rs | 4 ++-- 7 files changed, 24 insertions(+), 28 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 
5a608a6..d66e74c 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -31,18 +31,14 @@ impl DiagonalMatrix { } #[derive(Clone, Debug, PartialEq)] -pub(crate) struct BlockMatrix { - pub start: Matrix, - pub diag: RowVector, - pub end: Matrix, +pub(crate) struct BlockMatrix { + pub start: Matrix, + pub diag: RowVector, + pub end: Matrix, } -impl BlockMatrix { - pub const fn new( - start: Matrix, - diag: RowVector, - end: Matrix, - ) -> Self { +impl BlockMatrix { + pub const fn new(start: Matrix, diag: RowVector, end: Matrix) -> Self { Self { start, diag, end } } } @@ -57,7 +53,7 @@ pub(crate) enum OperatorType { #[inline(always)] /// Works on all 1d vectors pub(crate) fn diff_op_1d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -106,7 +102,7 @@ pub(crate) fn diff_op_1d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: &[Float], fut: &mut [Float], @@ -181,7 +177,7 @@ pub(crate) fn diff_op_1d_slice( #[inline(always)] /// Will always work on 1d, delegated based on slicedness pub(crate) fn diff_op_1d( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView1, mut fut: ArrayViewMut1, @@ -201,7 +197,7 @@ pub(crate) fn diff_op_1d( #[allow(unused)] /// 2D diff fallback for when matrices are not slicable pub(crate) fn diff_op_2d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -275,7 +271,7 @@ pub(crate) fn diff_op_2d_fallback( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, mut fut: ArrayViewMut2, @@ -292,7 +288,7 @@ pub(crate) fn diff_op_2d_sliceable( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, fut: ArrayViewMut2, @@ -494,7 +490,7 @@ fn dotproduct<'a>( #[cfg(feature = "sparse")] pub(crate) fn sparse_from_block( - matrix: &BlockMatrix, + matrix: &BlockMatrix, optype: OperatorType, n: usize, ) -> sprs::CsMat { diff --git a/sbp/src/operators/traditional4.rs b/sbp/src/operators/traditional4.rs index 167d36b..e2b59df 100644 --- a/sbp/src/operators/traditional4.rs +++ b/sbp/src/operators/traditional4.rs @@ -27,7 +27,7 @@ impl SBP4 { const DIFF_BLOCKEND: super::Matrix = super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); - const DIFF: BlockMatrix<4, 6, 5> = + const DIFF: BlockMatrix = BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] @@ -41,7 +41,7 @@ impl SBP4 { [-4.0/43.0, 59.0/43.0, -110.0/43.0, 59.0/43.0, -4.0/43.0, 0.0], [-1.0/49.0, 0.0, 59.0/49.0, -118.0/49.0, 64.0/49.0, -4.0/49.0] ]); - const D2: BlockMatrix<4, 6, 5> = BlockMatrix::new( + const D2: BlockMatrix = BlockMatrix::new( Self::D2BLOCK, Self::D2DIAG, super::flip_ud(super::flip_lr(Self::D2BLOCK)), diff --git a/sbp/src/operators/traditional8.rs b/sbp/src/operators/traditional8.rs index cfd7b8b..d1dcf4c 100644 --- a/sbp/src/operators/traditional8.rs +++ b/sbp/src/operators/traditional8.rs @@ -27,7 +27,7 @@ impl SBP8 { [2.75587615266177e-02, -8.71295642560637e-02, 5.01135077563584e-02, 7.68229253600969e-02, 4.60181213406519e-02, -7.55873581663580e-01, 0.00000000000000e+00, 8.21713248844682e-01, -2.16615355227872e-01, 4.12600676624518e-02, -3.86813134335486e-03, 0.0], [5.84767272160451e-03, -1.08336661209337e-02, -1.42810403117803e-02, 3.50919361287023e-02, -1.22244235731112e-02, 1.19411743193552e-01, -7.51668243727123e-01, 0.00000000000000e+00, 7.92601963555477e-01, 
-1.98150490888869e-01, 3.77429506454989e-02, -3.53840162301552e-03], ]); - const DIFF: BlockMatrix<8, 12, 9> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), diff --git a/sbp/src/operators/upwind4.rs b/sbp/src/operators/upwind4.rs index b9742bf..0126a9d 100644 --- a/sbp/src/operators/upwind4.rs +++ b/sbp/src/operators/upwind4.rs @@ -27,7 +27,7 @@ impl Upwind4 { const DIFF_BLOCKEND: Matrix = super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))); - const DIFF: BlockMatrix<4, 7, 7> = + const DIFF: BlockMatrix = BlockMatrix::new(Self::DIFF_BLOCK, Self::DIFF_DIAG, Self::DIFF_BLOCKEND); #[rustfmt::skip] @@ -43,7 +43,7 @@ impl Upwind4 { ]]); const DISS_BLOCKEND: Matrix = super::flip_ud(super::flip_lr(Self::DISS_BLOCK)); - const DISS: BlockMatrix<4, 7, 7> = + const DISS: BlockMatrix = BlockMatrix::new(Self::DISS_BLOCK, Self::DISS_DIAG, Self::DISS_BLOCKEND); } diff --git a/sbp/src/operators/upwind4h2.rs b/sbp/src/operators/upwind4h2.rs index e1cbf60..2a270b3 100644 --- a/sbp/src/operators/upwind4h2.rs +++ b/sbp/src/operators/upwind4h2.rs @@ -24,7 +24,7 @@ impl Upwind4h2 { const DIFF_DIAG: RowVector = RowVector::new([[ -1.43229166666667e-02, 1.40625000000000e-01, -7.38281250000000e-01, 0.00000000000000e+00, 7.38281250000000e-01, -1.40625000000000e-01, 1.43229166666667e-02 ]]); - const DIFF: BlockMatrix<4, 7, 7> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -42,7 +42,7 @@ impl Upwind4h2 { const DISS_DIAG: RowVector = RowVector::new([[ 1.43229166666667e-02, -8.59375000000000e-02, 2.14843750000000e-01, -2.86458333333333e-01, 2.14843750000000e-01, -8.59375000000000e-02, 1.43229166666667e-02, ]]); - const DISS: BlockMatrix<4, 7, 7> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), diff --git a/sbp/src/operators/upwind9.rs b/sbp/src/operators/upwind9.rs index c6d374e..192b577 100644 --- a/sbp/src/operators/upwind9.rs +++ b/sbp/src/operators/upwind9.rs @@ -28,7 +28,7 @@ impl Upwind9 { [-6.19425252179959e-03, 3.69595678895333e-02, -7.01892820620398e-02, -3.35233082197107e-03, 2.69304373763091e-01, -8.89857974743355e-01, 0.00000000000000e+00, 8.66656645522330e-01, -2.57919763669076e-01, 6.44799409172690e-02, -1.07466568195448e-02, 8.59732545563586e-04, 0.0], [1.44853491014330e-02, -4.59275574977554e-02, 3.08833474560615e-02, 3.57240610228828e-02, -7.07760049349999e-02, 1.88587240076292e-01, -7.92626447113877e-01, 0.00000000000000e+00, 8.25608497215073e-01, -2.35888142061449e-01, 5.89720355153623e-02, -9.82867258589373e-03, 7.86293806871498e-04], ]); - const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -50,7 +50,7 @@ impl Upwind9 { const DISS_DIAG: RowVector = RowVector::new([[ 1.0/1260.0, -1.0/126.0, 1.0/28.0, -2.0/21.0, 1.0/6.0, -1.0/5.0, 1.0/6.0, -2.0/21.0, 1.0/28.0, -1.0/126.0, 1.0/1260.0, ]]); - const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), diff --git a/sbp/src/operators/upwind9h2.rs b/sbp/src/operators/upwind9h2.rs index f099008..9eb6830 100644 --- 
a/sbp/src/operators/upwind9h2.rs +++ b/sbp/src/operators/upwind9h2.rs @@ -28,7 +28,7 @@ impl Upwind9h2 { [-1.35203347497451e-03, 5.77422023528126e-03, -1.06262408660325e-02, -2.37853245409338e-02, 2.05772021953463e-01, -8.20033622612654e-01, 0.00000000000000e+00, 8.31345778788959e-01, -2.37329555369694e-01, 5.93323888424235e-02, -9.88873147373725e-03, 7.91098517898980e-04, 0.0], [-1.85246767034256e-03, 2.89905176240824e-03, 6.61951741227212e-04, 2.52792141498726e-03, -5.27086038822990e-02, 2.36934258275492e-01, -8.34333946306806e-01, 0.00000000000000e+00, 8.33639122800983e-01, -2.38182606514566e-01, 5.95456516286416e-02, -9.92427527144027e-03, 7.93942021715222e-04] ]); - const DIFF: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DIFF: BlockMatrix = BlockMatrix::new( Self::DIFF_BLOCK, Self::DIFF_DIAG, super::flip_sign(super::flip_ud(super::flip_lr(Self::DIFF_BLOCK))), @@ -50,7 +50,7 @@ impl Upwind9h2 { const DISS_DIAG: RowVector = RowVector::new([[ 7.93650793650794e-04, -7.93650793650794e-03, 3.57142857142857e-02, -9.52380952380952e-02, 1.66666666666667e-01, -2.00000000000000e-01, 1.66666666666667e-01, -9.52380952380952e-02, 3.57142857142857e-02, -7.93650793650794e-03, 7.93650793650794e-04 ]]); - const DISS: BlockMatrix<8, 13, 11> = BlockMatrix::new( + const DISS: BlockMatrix = BlockMatrix::new( Self::DISS_BLOCK, Self::DISS_DIAG, super::flip_ud(super::flip_lr(Self::DISS_BLOCK)), From b15ea57e6d68497f45f4c9b06ac7eb59976bb598 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:21:06 +0100 Subject: [PATCH 26/35] inline for perf --- sbp/src/operators/algos.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index d66e74c..9d24e9d 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -270,6 +270,7 @@ pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, From 74d99a4a181dd1d7f87882167f9c1c79f7074122 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Mon, 1 Feb 2021 23:58:13 +0100 Subject: [PATCH 27/35] try ndarray transmute --- sbp/Cargo.toml | 1 + sbp/src/operators/algos.rs | 25 ++++++---- sbp/src/operators/algos/fastfloat.rs | 71 +++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/sbp/Cargo.toml b/sbp/Cargo.toml index 9b7834f..9cb7a50 100644 --- a/sbp/Cargo.toml +++ b/sbp/Cargo.toml @@ -11,6 +11,7 @@ packed_simd = { version = "0.3.3", package = "packed_simd_2" } rayon = { version = "1.3.0", optional = true } sprs = { version = "0.9.0", optional = true, default-features = false } serde = { version = "1.0.115", optional = true, default-features = false, features = ["derive"] } +num-traits = "0.2.14" [features] # Use f32 as precision, default is f64 diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 9d24e9d..a6485a8 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -1,5 +1,6 @@ use super::*; use ndarray::s; +use num_traits::Zero; pub(crate) mod constmatrix; pub(crate) use constmatrix::{flip_lr, flip_sign, flip_ud, ColVector, Matrix, RowVector}; @@ -194,14 +195,24 @@ pub(crate) fn diff_op_1d( } #[inline(always)] -#[allow(unused)] /// 2D diff fallback for when matrices are not slicable pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, prev: ArrayView2, - mut fut: ArrayViewMut2, + fut: ArrayViewMut2, ) { + #[cfg(feature = "fast-float")] + let (matrix, prev, mut fut) = unsafe { + ( + std::mem::transmute::<_, &BlockMatrix>(matrix), + std::mem::transmute::<_, 
ArrayView2>(prev), + std::mem::transmute::<_, ArrayViewMut2>(fut), + ) + }; + #[cfg(not(feature = "fast-float"))] + let mut fut = fut; + assert_eq!(prev.shape(), fut.shape()); let nx = prev.shape()[1]; let ny = prev.shape()[0]; @@ -214,8 +225,7 @@ pub(crate) fn diff_op_2d_fallback for FastFloat { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: Float) -> Self::Output { + unsafe { Self(fmul_fast(self.0, o)) } + } +} + +impl core::ops::Mul for Float { + type Output = FastFloat; + #[inline(always)] + fn mul(self, o: FastFloat) -> Self::Output { + unsafe { FastFloat(fmul_fast(self, o.0)) } + } +} + +impl core::ops::Add for FastFloat { + type Output = FastFloat; #[inline(always)] fn add(self, o: FastFloat) -> Self::Output { unsafe { Self(fadd_fast(self.0, o.0)) } } } +impl core::ops::Add for FastFloat { + type Output = FastFloat; + #[inline(always)] + fn add(self, o: Float) -> Self::Output { + unsafe { Self(fadd_fast(self.0, o)) } + } +} + +impl core::ops::Add for Float { + type Output = FastFloat; + #[inline(always)] + fn add(self, o: FastFloat) -> Self::Output { + unsafe { FastFloat(fadd_fast(self, o.0)) } + } +} + +impl core::ops::Sub for FastFloat { + type Output = Self; + #[inline(always)] + fn sub(self, o: FastFloat) -> Self::Output { + unsafe { Self(fadd_fast(self.0, -o.0)) } + } +} + +impl core::ops::Div for FastFloat { + type Output = Self; + #[inline(always)] + fn div(self, o: FastFloat) -> Self::Output { + Self(self.0 / o.0) + } +} + impl core::ops::MulAssign for FastFloat { #[inline(always)] fn mul_assign(&mut self, o: FastFloat) { @@ -51,3 +99,22 @@ impl From for Float { f.0 } } + +mod numt { + use super::{FastFloat, Float}; + use num_traits::identities::{One, Zero}; + + impl One for FastFloat { + fn one() -> Self { + Self(Float::one()) + } + } + impl Zero for FastFloat { + fn zero() -> Self { + Self(Float::zero()) + } + fn is_zero(&self) -> bool { + self.0.is_zero() + } + } +} From c709cf465e8c85947a6ee9022050771cb59db8f3 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Tue, 2 Feb 2021 00:11:41 +0100 Subject: [PATCH 28/35] remove ndarray transmute --- sbp/src/operators/algos.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index a6485a8..8340632 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -200,8 +200,9 @@ pub(crate) fn diff_op_2d_fallback, optype: OperatorType, prev: ArrayView2, - fut: ArrayViewMut2, + mut fut: ArrayViewMut2, ) { + /* Does not increase the perf... #[cfg(feature = "fast-float")] let (matrix, prev, mut fut) = unsafe { ( @@ -212,6 +213,7 @@ pub(crate) fn diff_op_2d_fallback Date: Tue, 2 Feb 2021 00:33:37 +0100 Subject: [PATCH 29/35] specialise on contigous ny --- sbp/src/operators/algos.rs | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 8340632..2a12fc3 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -281,6 +281,105 @@ pub(crate) fn diff_op_2d_fallback( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + /* Does not increase the perf... 
+ #[cfg(feature = "fast-float")] + let (matrix, prev, mut fut) = unsafe { + ( + std::mem::transmute::<_, &BlockMatrix>(matrix), + std::mem::transmute::<_, ArrayView2>(prev), + std::mem::transmute::<_, ArrayViewMut2>(fut), + ) + }; + #[cfg(not(feature = "fast-float"))] + let mut fut = fut; + */ + + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + let ny = prev.shape()[0]; + assert!(nx >= 2 * M); + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + fut.fill(0.0.into()); + let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut(( + ndarray::s![.., ..M], + ndarray::s![.., M..nx - M], + ndarray::s![.., nx - M..], + )); + + // First block + for (bl, mut fut) in matrix + .start + .iter_rows() + .zip(fut0.axis_iter_mut(ndarray::Axis(1))) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl.is_zero() { + continue; + } + let prev = &prev.as_slice().unwrap()[..ny]; + for (fut, prev) in fut.iter_mut().zip(prev) { + *fut = *fut + idx * prev * bl; + } + } + } + + let window_elems_to_skip = M - ((D - 1) / 2); + + // Diagonal entries + for (mut fut, id) in futmid + .axis_iter_mut(ndarray::Axis(1)) + .zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip)) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&d, id) in matrix.diag.iter().zip(id.axis_iter(ndarray::Axis(1))) { + if d.is_zero() { + continue; + } + let id = id.as_slice().unwrap(); + for (fut, id) in fut.iter_mut().zip(id) { + *fut = *fut + idx * id * d; + } + } + } + + // End block + let prev = prev.slice(ndarray::s!(.., nx - N..)); + for (bl, mut fut) in matrix + .end + .iter_rows() + .zip(futn.axis_iter_mut(ndarray::Axis(1))) + { + let fut = &mut fut.as_slice_mut().unwrap()[..ny]; + for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { + if bl.is_zero() { + continue; + } + let prev = &prev.as_slice().unwrap()[..ny]; + for (fut, prev) in fut.iter_mut().zip(prev) { + *fut = *fut + idx * prev * bl; + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_2d_sliceable( matrix: &BlockMatrix, @@ -308,6 +407,7 @@ pub(crate) fn diff_op_2d( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), + ([1, _], [1, _]) => diff_op_2d_sliceable_y(matrix, optype, prev, fut), _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 7ec426b5a8e8c79149a99c88e96a3fb7ce0374d0 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 3 Feb 2021 08:31:33 +0100 Subject: [PATCH 30/35] add more fast intrinsics --- sbp/src/operators/algos/fastfloat.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos/fastfloat.rs b/sbp/src/operators/algos/fastfloat.rs index efad00b..f4e4e8b 100644 --- a/sbp/src/operators/algos/fastfloat.rs +++ b/sbp/src/operators/algos/fastfloat.rs @@ -4,7 +4,7 @@ use super::*; #[derive(Copy, Clone, Debug, PartialEq, Default)] pub(crate) struct FastFloat(Float); -use core::intrinsics::{fadd_fast, fmul_fast}; +use core::intrinsics::{fadd_fast, fdiv_fast, fmul_fast, fsub_fast}; impl core::ops::Mul for FastFloat { type Output = Self; @@ -58,7 +58,7 @@ impl core::ops::Sub for FastFloat { type Output = Self; #[inline(always)] fn sub(self, o: FastFloat) -> Self::Output { - unsafe { Self(fadd_fast(self.0, -o.0)) } + unsafe { 
Self(fsub_fast(self.0, o.0)) } } } @@ -66,7 +66,7 @@ impl core::ops::Div for FastFloat { type Output = Self; #[inline(always)] fn div(self, o: FastFloat) -> Self::Output { - Self(self.0 / o.0) + unsafe { Self(fdiv_fast(self.0, o.0)) } } } From cf4d8f1e9ba7fefb1992f9363217d1eb47df9e1b Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 3 Feb 2021 08:41:11 +0100 Subject: [PATCH 31/35] add Zero for constmatrix --- sbp/src/operators/algos/constmatrix.rs | 34 ++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/sbp/src/operators/algos/constmatrix.rs b/sbp/src/operators/algos/constmatrix.rs index b409bed..7203d64 100644 --- a/sbp/src/operators/algos/constmatrix.rs +++ b/sbp/src/operators/algos/constmatrix.rs @@ -1,4 +1,7 @@ #![allow(unused)] + +use num_traits::identities::Zero; + /// A row-major matrix #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[repr(C)] @@ -16,6 +19,17 @@ impl Default for Matrix Zero for Matrix { + fn zero() -> Self { + Self { + data: [[T::zero(); N]; M], + } + } + fn is_zero(&self) -> bool { + self == &Self::zero() + } +} + impl core::ops::Index<(usize, usize)> for Matrix { type Output = T; #[inline(always)] @@ -123,9 +137,9 @@ macro_rules! impl_op_mul_mul { T: Copy + Default + core::ops::Add + core::ops::Mul, { type Output = Matrix; - fn mul(self, rhs: $rhs) -> Self::Output { + fn mul(self, lhs: $rhs) -> Self::Output { let mut out = Matrix::default(); - out.matmul_into(&self, &rhs); + out.matmul_into(&self, &lhs); out } } @@ -147,6 +161,22 @@ where } } +impl core::ops::Add> for Matrix +where + T: Copy + Zero + core::ops::Add + PartialEq, +{ + type Output = Self; + fn add(self, lhs: Self) -> Self::Output { + let mut out = Matrix::zero(); + for i in 0..M { + for j in 0..N { + out[(i, j)] = self[(i, j)] + lhs[(i, j)]; + } + } + out + } +} + #[cfg(test)] mod tests { use super::{super::*, *}; From 87c055f81e31ed40189a0a1066d0ce3599d3c89e Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Tue, 9 Feb 2021 21:44:35 +0100 Subject: [PATCH 32/35] Add back simd column algo --- sbp/src/operators/algos.rs | 171 ++++++++++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 2a12fc3..3ec4aff 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -283,6 +283,7 @@ pub(crate) fn diff_op_2d_fallback( matrix: &BlockMatrix, optype: OperatorType, @@ -380,6 +381,174 @@ pub(crate) fn diff_op_2d_sliceable_y( + matrix: &BlockMatrix, + optype: OperatorType, + prev: ArrayView2, + mut fut: ArrayViewMut2, +) { + assert_eq!(prev.shape(), fut.shape()); + let nx = prev.shape()[1]; + assert!(nx >= 2 * M); + + assert_eq!(prev.strides()[0], 1); + assert_eq!(fut.strides()[0], 1); + + let dx = if optype == OperatorType::H2 { + 1.0 / (nx - 2) as Float + } else { + 1.0 / (nx - 1) as Float + }; + let idx = 1.0 / dx; + + #[cfg(not(feature = "f32"))] + type SimdT = packed_simd::f64x8; + #[cfg(feature = "f32")] + type SimdT = packed_simd::f32x16; + + let ny = prev.shape()[0]; + // How many elements that can be simdified + let simdified = SimdT::lanes() * (ny / SimdT::lanes()); + + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let fut_base_ptr = fut.as_mut_ptr(); + let fut_stride = fut.strides()[1]; + let fut_ptr = |j, i| { + debug_assert!(j < ny && i < nx); + unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) } + }; + + let prev_base_ptr = prev.as_ptr(); + let prev_stride = prev.strides()[1]; + let prev_ptr = |j, i| 
{ + debug_assert!(j < ny && i < nx); + unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } + }; + + // Not algo necessary, but gives performance increase + assert_eq!(fut_stride, prev_stride); + + // First block + { + for (ifut, &bl) in matrix.start.iter_rows().enumerate() { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (iprev, &bl) in bl.iter().enumerate() { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f *= idx; + + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } + + // Diagonal elements + { + for ifut in M..nx - M { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (id, &d) in matrix.diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); + } + f *= idx; + unsafe { + // puts simd along stride 1, j never goes past end of slice + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + for j in simdified..ny { + let mut f = 0.0; + for (id, &d) in matrix.diag.iter().enumerate() { + let offset = ifut - half_diag_width + id; + unsafe { + f += d * *prev_ptr(j, offset); + } + } + unsafe { + *fut_ptr(j, ifut) = idx * f; + } + } + } + } + + // End block + { + // Get blocks and corresponding offsets + // (rev to iterate in ifut increasing order) + for (bl, ifut) in matrix.end.iter_rows().zip(nx - M..) { + for j in (0..simdified).step_by(SimdT::lanes()) { + let index_to_simd = |i| unsafe { + // j never moves past end of slice due to step_by and + // rounding down + SimdT::from_slice_unaligned(std::slice::from_raw_parts( + prev_ptr(j, i), + SimdT::lanes(), + )) + }; + let mut f = SimdT::splat(0.0); + for (&bl, iprev) in bl.iter().zip(nx - N..) { + f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); + } + f = f * idx; + unsafe { + f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( + fut_ptr(j, ifut), + SimdT::lanes(), + )); + } + } + + for j in simdified..ny { + unsafe { + let mut f = 0.0; + for (&bl, iprev) in bl.iter().zip(nx - N..) 
{ + f += bl * *prev_ptr(j, iprev); + } + *fut_ptr(j, ifut) = f * idx; + } + } + } + } +} + #[inline(always)] pub(crate) fn diff_op_2d_sliceable( matrix: &BlockMatrix, @@ -407,7 +576,7 @@ pub(crate) fn diff_op_2d( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), - ([1, _], [1, _]) => diff_op_2d_sliceable_y(matrix, optype, prev, fut), + ([1, _], [1, _]) => diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut), _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 8a6dc60edf9834f3f6cc3f65ecd617bb58cb5ecd Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 10 Feb 2021 19:02:48 +0100 Subject: [PATCH 33/35] remove some unsafe from simd --- sbp/src/operators/algos.rs | 166 ++++++++++++++----------------------- 1 file changed, 64 insertions(+), 102 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 3ec4aff..72750e9 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -390,10 +390,16 @@ pub(crate) fn diff_op_2d_sliceable_y_simd= 2 * M); - assert_eq!(prev.strides()[0], 1); - assert_eq!(fut.strides()[0], 1); + assert_eq!(prev.strides(), fut.strides()); + assert_eq!(prev.strides(), &[1, ny as isize]); + + let prev = prev.as_slice_memory_order().unwrap(); + let fut = fut.as_slice_memory_order_mut().unwrap(); + let prev = &prev[..nx * ny]; + let fut = &mut fut[..nx * ny]; let dx = if optype == OperatorType::H2 { 1.0 / (nx - 2) as Float @@ -407,103 +413,88 @@ pub(crate) fn diff_op_2d_sliceable_y_simd( + matrix: &Matrix, + idx: Float, + ny: usize, + simdified: usize, + prev: &[Float], + fut: &mut [Float], + ) { + assert_eq!(prev.len(), N * ny); + assert_eq!(fut.len(), M * ny); + let prev = &prev[..N * ny]; + let fut = &mut fut[..M * ny]; - let prev_base_ptr = prev.as_ptr(); - let prev_stride = prev.strides()[1]; - let prev_ptr = |j, i| { - debug_assert!(j < ny && i < nx); - unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) } - }; + let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; - // Not algo necessary, but gives performance increase - assert_eq!(fut_stride, prev_stride); - - // First block - { - for (ifut, &bl) in matrix.start.iter_rows().enumerate() { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + for (j, fut) in fut.by_ref().enumerate() { + let index_to_simd = + |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); let mut f = SimdT::splat(0.0); for (iprev, &bl) in bl.iter().enumerate() { f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f); } f *= idx; - - unsafe { - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); - } + f.write_to_slice_unaligned(fut); } - for j in simdified..ny { - unsafe { - let mut f = 0.0; - for (iprev, bl) in bl.iter().enumerate() { - f += bl * *prev_ptr(j, iprev); - } - *fut_ptr(j, ifut) = f * idx; + for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { + let mut f = 0.0; + for (iprev, bl) in bl.iter().enumerate() { + f += bl * prevcol(iprev)[j]; } + *fut = f * idx; } } } + // First block + { + let prev = &prev[..N * ny]; + block_multiply(&matrix.start, 
idx, ny, simdified, prev, fut0); + } + // Diagonal elements { - for ifut in M..nx - M { - for j in (0..simdified).step_by(SimdT::lanes()) { - let index_to_simd = |i| unsafe { - // j never moves past end of slice due to step_by and - // rounding down - SimdT::from_slice_unaligned(std::slice::from_raw_parts( - prev_ptr(j, i), - SimdT::lanes(), - )) - }; + let half_diag_width = (D - 1) / 2; + assert!(half_diag_width <= M); + + let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; + for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { + let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + for (j, fut) in fut.by_ref().enumerate() { + let index_to_simd = + |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); let mut f = SimdT::splat(0.0); for (id, &d) in matrix.diag.iter().enumerate() { let offset = ifut - half_diag_width + id; f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); } f *= idx; - unsafe { + { // puts simd along stride 1, j never goes past end of slice - f.write_to_slice_unaligned(std::slice::from_raw_parts_mut( - fut_ptr(j, ifut), - SimdT::lanes(), - )); + f.write_to_slice_unaligned(fut); } } - for j in simdified..ny { + for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0; for (id, &d) in matrix.diag.iter().enumerate() { let offset = ifut - half_diag_width + id; - unsafe { - f += d * *prev_ptr(j, offset); + { + f += d * prevcol(offset)[j]; } } - unsafe { - *fut_ptr(j, ifut) = idx * f; + { + *fut = idx * f; } } } @@ -511,41 +502,8 @@ pub(crate) fn diff_op_2d_sliceable_y_simd( assert_eq!(prev.shape(), fut.shape()); match (prev.strides(), fut.strides()) { ([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut), - ([1, _], [1, _]) => diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut), + ([1, _], [1, _]) + if prev.as_slice_memory_order().is_some() && fut.as_slice_memory_order().is_some() => + { + diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut) + } _ => diff_op_2d_fallback(matrix, optype, prev, fut), } } From 02175d1734c96a577f37eed797acf78dcf0265d6 Mon Sep 17 00:00:00 2001 From: Magnus Ulimoen Date: Wed, 10 Feb 2021 19:29:26 +0100 Subject: [PATCH 34/35] use some unsafe... 
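
The diff below replaces the bounds-checked `prevcol` slicing closure in the
diagonal loop of `diff_op_2d_sliceable_y_simd` with one built on a raw pointer
(and switches the per-column chunking to `array_chunks_mut`), so each column
lookup becomes a plain offset load instead of a range-checked slice. A minimal,
standalone sketch of the two access patterns follows; the names here are
illustrative only and are not part of the crate:

    // Sketch: column i of a buffer holding nx columns of length ny,
    // stored column-major (stride 1 within a column).
    fn column_checked(buf: &[f64], ny: usize, i: usize) -> &[f64] {
        // Bounds-checked slicing; the range check is not always hoisted
        // out of the hot loop by the optimiser.
        &buf[i * ny..(i + 1) * ny]
    }

    /// Safety: caller must guarantee `(i + 1) * ny <= buf.len()`,
    /// which the asserts at the top of the real function establish.
    unsafe fn column_unchecked(buf: &[f64], ny: usize, i: usize) -> &[f64] {
        std::slice::from_raw_parts(buf.as_ptr().add(i * ny), ny)
    }

    fn main() {
        let ny = 3;
        let buf: Vec<f64> = (0..12).map(f64::from).collect(); // 4 columns of 3
        assert_eq!(column_checked(&buf, ny, 2), unsafe {
            column_unchecked(&buf, ny, 2)
        });
    }

Both closures return the same column; the unchecked variant merely moves the
bounds proof to the caller, which the surrounding asserts already provide.
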
--- sbp/src/operators/algos.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 72750e9..78844dd 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -433,6 +433,14 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { + unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) } + } + }; + */ let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { @@ -468,9 +476,16 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; + let prevcol = { + let prev_ptr = prev.as_ptr(); + move |i: usize| -> &[Float] { + unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) } + } + }; + //let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] }; + for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) { - let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); for (j, fut) in fut.by_ref().enumerate() { let index_to_simd = |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); @@ -481,7 +496,6 @@ pub(crate) fn diff_op_2d_sliceable_y_simd Date: Wed, 10 Feb 2021 21:15:21 +0100 Subject: [PATCH 35/35] remove iterator inhibiting optimisation --- sbp/src/operators/algos.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sbp/src/operators/algos.rs b/sbp/src/operators/algos.rs index 78844dd..d14997a 100644 --- a/sbp/src/operators/algos.rs +++ b/sbp/src/operators/algos.rs @@ -444,7 +444,7 @@ pub(crate) fn diff_op_2d_sliceable_y_simd &[Float] { &prev[i * ny..(i + 1) * ny] }; for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) { - let mut fut = fut.chunks_exact_mut(SimdT::lanes()); + let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>(); for (j, fut) in fut.by_ref().enumerate() { let index_to_simd = |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); @@ -487,17 +487,23 @@ pub(crate) fn diff_op_2d_sliceable_y_simd(); for (j, fut) in fut.by_ref().enumerate() { - let index_to_simd = - |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); + //let index_to_simd = + // |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]); + let index_to_simd = |i: usize| unsafe { + let prev = std::slice::from_raw_parts( + prev.as_ptr().add(i * ny + SimdT::lanes() * j), + SimdT::lanes(), + ); + SimdT::from_slice_unaligned_unchecked(prev) + }; let mut f = SimdT::splat(0.0); - for (id, &d) in matrix.diag.iter().enumerate() { + // direct iter does not optimize well here + for (id, &d) in matrix.diag.row(0).iter().enumerate() { let offset = ifut - half_diag_width + id; f = index_to_simd(offset).mul_adde(SimdT::splat(d), f); } f *= idx; - { - f.write_to_slice_unaligned(fut); - } + f.write_to_slice_unaligned(fut); } for (j, fut) in (simdified..ny).zip(fut.into_remainder()) { let mut f = 0.0; @@ -507,9 +513,7 @@ pub(crate) fn diff_op_2d_sliceable_y_simd