SummationByParts/sbp/src/operators/algos.rs

820 lines
25 KiB
Rust
Raw Normal View History

2021-01-18 21:37:24 +00:00
use super::*;
2021-02-01 22:08:55 +00:00
use ndarray::s;
2021-02-01 22:58:13 +00:00
use num_traits::Zero;
2021-01-18 21:37:24 +00:00
2021-03-15 18:31:41 +00:00
pub(crate) use constmatrix::{ColVector, Matrix, RowVector};
2021-01-28 19:59:11 +00:00
2021-02-01 22:08:55 +00:00
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct DiagonalMatrix<const B: usize> {
    pub start: [Float; B],
    pub diag: Float,
    pub end: [Float; B],
}

impl<const B: usize> DiagonalMatrix<B> {
    /// Builds a diagonal norm from a boundary `block`: `start` is the block
    /// itself, the interior weight is `1.0`, and `end` is `block` reversed.
    pub const fn new(block: [Float; B]) -> Self {
        // `for` loops are not allowed in `const fn`; reverse manually.
        let mut end = block;
        let mut k = 0;
        while k < B {
            end[k] = block[B - 1 - k];
            k += 1;
        }
        Self {
            start: block,
            diag: 1.0,
            end,
        }
    }
}
#[derive(Clone, Debug, PartialEq)]
2021-02-01 22:13:10 +00:00
pub(crate) struct BlockMatrix<T, const M: usize, const N: usize, const D: usize> {
pub start: Matrix<T, M, N>,
pub diag: RowVector<T, D>,
pub end: Matrix<T, M, N>,
2021-02-01 22:08:55 +00:00
}
2021-02-01 22:13:10 +00:00
impl<T, const M: usize, const N: usize, const D: usize> BlockMatrix<T, M, N, D> {
pub const fn new(start: Matrix<T, M, N>, diag: RowVector<T, D>, end: Matrix<T, M, N>) -> Self {
2021-02-01 22:08:55 +00:00
Self { start, diag, end }
}
}
#[derive(PartialEq, Copy, Clone)]
/// Operator classification, controlling how the grid spacing `dx` is derived
/// from the number of grid points `n` in the difference routines below.
pub(crate) enum OperatorType {
    /// Standard operator: dx = 1 / (n - 1).
    Normal,
    /// H2 operator: dx = 1 / (n - 2).
    H2,
    // TODO: D2
}
2021-01-28 19:59:11 +00:00
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// Works on all 1d vectors
pub(crate) fn diff_op_1d_fallback<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-28 19:59:11 +00:00
optype: OperatorType,
prev: ArrayView1<Float>,
mut fut: ArrayViewMut1<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[0];
assert!(nx >= 2 * M);
assert!(nx >= N);
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
let idx = 1.0 / dx;
2021-02-01 22:08:55 +00:00
let (futstart, futmid, futend) =
fut.multi_slice_mut((s![..M], s![M..nx - 2 * M], s![nx - 2 * M..]));
for (bl, f) in matrix.start.iter_rows().zip(futstart) {
let diff = dotproduct(bl.iter(), prev.iter());
2021-01-28 19:59:11 +00:00
*f = diff * idx;
}
// The window needs to be aligned to the diagonal elements,
// based on the block size
let window_elems_to_skip = M - ((D - 1) / 2);
for (window, f) in prev
.windows(D)
.into_iter()
.skip(window_elems_to_skip)
2021-02-01 22:08:55 +00:00
.zip(futmid)
2021-01-28 19:59:11 +00:00
{
2021-02-01 22:08:55 +00:00
let diff = dotproduct(matrix.diag.row(0), window);
2021-01-28 19:59:11 +00:00
*f = diff * idx;
}
2021-01-31 13:54:15 +00:00
let prev = prev.slice(ndarray::s![nx - N..]);
2021-02-01 22:08:55 +00:00
for (bl, f) in matrix.end.iter_rows().zip(futend) {
let diff = dotproduct(bl, prev);
2021-01-29 16:36:05 +00:00
*f = diff * idx;
2021-01-28 19:59:11 +00:00
}
}
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// diff op in 1d for slices
pub(crate) fn diff_op_1d_slice<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-28 19:59:11 +00:00
optype: OperatorType,
prev: &[Float],
fut: &mut [Float],
) {
use std::convert::TryInto;
2021-03-23 18:21:38 +00:00
#[inline(never)]
/// This prevents code bloat, both start and end block gives
/// a matrix multiplication with the same matrix sizes
2021-03-23 18:21:38 +00:00
fn dedup_matmul<const M: usize, const N: usize>(
c: &mut ColVector<Float, M>,
a: &Matrix<Float, M, N>,
b: &ColVector<Float, N>,
) {
c.matmul_float_into(a, b)
}
2021-01-28 21:31:54 +00:00
2021-01-28 19:59:11 +00:00
assert_eq!(prev.len(), fut.len());
let nx = prev.len();
assert!(nx >= 2 * M);
assert!(nx >= N);
let prev = &prev[..nx];
let fut = &mut fut[..nx];
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
2021-01-28 23:08:31 +00:00
let idx = 1.0 / dx;
2021-01-28 19:59:11 +00:00
2021-01-29 16:59:35 +00:00
// Help aliasing analysis
let (futb1, fut) = fut.split_at_mut(M);
let (fut, futb2) = fut.split_at_mut(nx - 2 * M);
2021-01-28 19:59:11 +00:00
{
2021-01-29 16:47:13 +00:00
let prev = ColVector::<_, N>::map_to_col(prev.array_windows::<N>().next().unwrap());
2021-01-29 16:59:35 +00:00
let fut = ColVector::<_, M>::map_to_col_mut(futb1.try_into().unwrap());
2021-01-28 19:59:11 +00:00
2021-03-23 18:21:38 +00:00
dedup_matmul(fut, &matrix.start, prev);
2021-01-29 15:42:49 +00:00
*fut *= idx;
2021-01-28 19:59:11 +00:00
}
{
// The window needs to be aligned to the diagonal elements,
// based on the block size
let window_elems_to_skip = M - ((D - 1) / 2);
// The compiler is pretty clever right here. It is able to
// inline the entire computation into a very tight loop.
// It seems the "optimal" way is to broadcast each element of diag
// into separate SIMD vectors.
// Then an unroll of SIMD::lanes items from fut and prev:
// f[0] = diag[0] * in[0] + diag[1] * in[1] + diag[2] * in[3] + ...
// f[1] = diag[0] * in[1] + diag[1] * in[2] + diag[2] * in[4] + ...
// \-- fma's along the vertical axis, then horizontal
//
// The resulting inner loop performs:
// one mul, D-1 fma,
// one mul (by idx),
// one store
// two integer adds (input and output offsets)
// one test + jump
//
// The compiler is clever enough to combine two such unrollings and merge
// these computations to prevent stalling
for (window, f) in prev[window_elems_to_skip..]
.array_windows::<D>()
.zip(fut.array_chunks_mut::<1>())
{
let fut = ColVector::<_, 1>::map_to_col_mut(f);
let prev = ColVector::<_, D>::map_to_col(window);
fut.matmul_float_into(&matrix.diag, prev);
*fut *= idx;
}
2021-01-28 19:59:11 +00:00
}
{
2021-01-28 23:08:31 +00:00
let prev = prev.array_windows::<N>().next_back().unwrap();
2021-01-28 20:27:25 +00:00
let prev = ColVector::<_, N>::map_to_col(prev);
2021-01-29 16:59:35 +00:00
let fut = ColVector::<_, M>::map_to_col_mut(futb2.try_into().unwrap());
2021-01-28 19:59:11 +00:00
2021-03-23 18:21:38 +00:00
dedup_matmul(fut, &matrix.end, prev);
2021-01-29 15:42:49 +00:00
*fut *= idx;
2021-01-28 19:59:11 +00:00
}
}
2021-01-18 21:37:24 +00:00
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// Will always work on 1d, delegated based on slicedness
pub(crate) fn diff_op_1d<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-18 21:37:24 +00:00
optype: OperatorType,
prev: ArrayView1<Float>,
mut fut: ArrayViewMut1<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[0];
2021-02-01 22:08:55 +00:00
assert!(nx >= 2 * M);
2021-01-18 21:37:24 +00:00
2021-02-01 22:08:55 +00:00
if let Some((prev, fut)) = prev.as_slice().zip(fut.as_slice_mut()) {
diff_op_1d_slice(matrix, optype, prev, fut)
2021-01-18 21:37:24 +00:00
} else {
2021-02-01 22:08:55 +00:00
diff_op_1d_fallback(matrix, optype, prev, fut)
2021-01-18 21:37:24 +00:00
}
2021-01-31 14:40:28 +00:00
}
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// 2D diff fallback for when matrices are not slicable
pub(crate) fn diff_op_2d_fallback<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-31 14:40:28 +00:00
optype: OperatorType,
prev: ArrayView2<Float>,
2021-02-01 23:11:41 +00:00
mut fut: ArrayViewMut2<Float>,
2021-01-31 14:40:28 +00:00
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[1];
let ny = prev.shape()[0];
assert!(nx >= 2 * M);
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
let idx = 1.0 / dx;
2021-02-01 22:58:13 +00:00
fut.fill(0.0.into());
2021-01-31 14:40:28 +00:00
let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut((
ndarray::s![.., ..M],
ndarray::s![.., M..nx - M],
ndarray::s![.., nx - M..],
));
// First block
2021-02-01 22:08:55 +00:00
for (bl, mut fut) in matrix
.start
.iter_rows()
.zip(fut0.axis_iter_mut(ndarray::Axis(1)))
{
2021-01-31 14:40:28 +00:00
debug_assert_eq!(fut.len(), prev.shape()[0]);
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) {
2021-02-01 22:58:13 +00:00
if bl.is_zero() {
2021-01-31 14:40:28 +00:00
continue;
}
debug_assert_eq!(prev.len(), fut.len());
fut.scaled_add(idx * bl, &prev);
}
}
let window_elems_to_skip = M - ((D - 1) / 2);
// Diagonal entries
for (mut fut, id) in futmid
.axis_iter_mut(ndarray::Axis(1))
.zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip))
{
2021-02-01 22:08:55 +00:00
for (&d, id) in matrix.diag.iter().zip(id.axis_iter(ndarray::Axis(1))) {
2021-02-01 22:58:13 +00:00
if d.is_zero() {
2021-01-31 14:40:28 +00:00
continue;
}
fut.scaled_add(idx * d, &id)
}
}
// End block
let prev = prev.slice(ndarray::s!(.., nx - N..));
2021-02-01 22:08:55 +00:00
for (bl, mut fut) in matrix
.end
2021-01-31 14:40:28 +00:00
.iter_rows()
.zip(futn.axis_iter_mut(ndarray::Axis(1)))
{
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) {
2021-02-01 22:58:13 +00:00
if bl.is_zero() {
2021-01-31 14:40:28 +00:00
continue;
}
fut.scaled_add(idx * bl, &prev);
}
}
2021-01-18 21:37:24 +00:00
}
2021-02-01 23:33:37 +00:00
#[inline(always)]
/// 2D diff when first axis is contiguous
2021-02-09 20:44:35 +00:00
#[allow(unused)]
2021-02-01 23:33:37 +00:00
pub(crate) fn diff_op_2d_sliceable_y<const M: usize, const N: usize, const D: usize>(
matrix: &BlockMatrix<Float, M, N, D>,
optype: OperatorType,
prev: ArrayView2<Float>,
mut fut: ArrayViewMut2<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[1];
let ny = prev.shape()[0];
assert!(nx >= 2 * M);
assert_eq!(prev.strides()[0], 1);
assert_eq!(fut.strides()[0], 1);
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
let idx = 1.0 / dx;
fut.fill(0.0.into());
let (mut fut0, mut futmid, mut futn) = fut.multi_slice_mut((
ndarray::s![.., ..M],
ndarray::s![.., M..nx - M],
ndarray::s![.., nx - M..],
));
// First block
for (bl, mut fut) in matrix
.start
.iter_rows()
.zip(fut0.axis_iter_mut(ndarray::Axis(1)))
{
let fut = &mut fut.as_slice_mut().unwrap()[..ny];
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) {
if bl.is_zero() {
continue;
}
let prev = &prev.as_slice().unwrap()[..ny];
for (fut, prev) in fut.iter_mut().zip(prev) {
*fut = *fut + idx * prev * bl;
}
}
}
let window_elems_to_skip = M - ((D - 1) / 2);
// Diagonal entries
for (mut fut, id) in futmid
.axis_iter_mut(ndarray::Axis(1))
.zip(prev.windows((ny, D)).into_iter().skip(window_elems_to_skip))
{
let fut = &mut fut.as_slice_mut().unwrap()[..ny];
for (&d, id) in matrix.diag.iter().zip(id.axis_iter(ndarray::Axis(1))) {
if d.is_zero() {
continue;
}
let id = id.as_slice().unwrap();
for (fut, id) in fut.iter_mut().zip(id) {
*fut = *fut + idx * id * d;
}
}
}
// End block
let prev = prev.slice(ndarray::s!(.., nx - N..));
for (bl, mut fut) in matrix
.end
.iter_rows()
.zip(futn.axis_iter_mut(ndarray::Axis(1)))
{
let fut = &mut fut.as_slice_mut().unwrap()[..ny];
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) {
if bl.is_zero() {
continue;
}
let prev = &prev.as_slice().unwrap()[..ny];
for (fut, prev) in fut.iter_mut().zip(prev) {
*fut = *fut + idx * prev * bl;
}
}
}
}
2021-02-09 20:44:35 +00:00
#[inline(always)]
pub(crate) fn diff_op_2d_sliceable_y_simd<const M: usize, const N: usize, const D: usize>(
matrix: &BlockMatrix<Float, M, N, D>,
optype: OperatorType,
prev: ArrayView2<Float>,
mut fut: ArrayViewMut2<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[1];
2021-02-10 18:02:48 +00:00
let ny = prev.shape()[0];
2021-02-09 20:44:35 +00:00
assert!(nx >= 2 * M);
2021-02-10 18:02:48 +00:00
assert_eq!(prev.strides(), fut.strides());
assert_eq!(prev.strides(), &[1, ny as isize]);
let prev = prev.as_slice_memory_order().unwrap();
let fut = fut.as_slice_memory_order_mut().unwrap();
let prev = &prev[..nx * ny];
let fut = &mut fut[..nx * ny];
2021-02-09 20:44:35 +00:00
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
let idx = 1.0 / dx;
#[cfg(not(feature = "f32"))]
type SimdT = packed_simd::f64x8;
#[cfg(feature = "f32")]
type SimdT = packed_simd::f32x16;
// How many elements that can be simdified
let simdified = SimdT::lanes() * (ny / SimdT::lanes());
2021-02-10 18:02:48 +00:00
let (fut0, futmid) = fut.split_at_mut(M * ny);
let (futmid, futn) = futmid.split_at_mut((nx - 2 * M) * ny);
#[inline(always)]
fn block_multiply<const M: usize, const N: usize>(
matrix: &Matrix<Float, M, N>,
idx: Float,
ny: usize,
simdified: usize,
prev: &[Float],
fut: &mut [Float],
) {
assert_eq!(prev.len(), N * ny);
assert_eq!(fut.len(), M * ny);
let prev = &prev[..N * ny];
let fut = &mut fut[..M * ny];
2021-02-10 18:29:26 +00:00
/*
let prevcol = {
let prev_ptr = prev.as_ptr();
move |i: usize| -> &[Float] {
unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) }
}
};
*/
2021-02-10 18:02:48 +00:00
let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
for (&bl, fut) in matrix.iter_rows().zip(fut.chunks_exact_mut(ny)) {
let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>();
2021-02-10 18:02:48 +00:00
for (j, fut) in fut.by_ref().enumerate() {
let index_to_simd =
|i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]);
2021-02-09 20:44:35 +00:00
let mut f = SimdT::splat(0.0);
for (iprev, &bl) in bl.iter().enumerate() {
f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f);
}
f *= idx;
2021-02-10 18:02:48 +00:00
f.write_to_slice_unaligned(fut);
2021-02-09 20:44:35 +00:00
}
2021-02-10 18:02:48 +00:00
for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
let mut f = 0.0;
for (iprev, bl) in bl.iter().enumerate() {
f += bl * prevcol(iprev)[j];
2021-02-09 20:44:35 +00:00
}
2021-02-10 18:02:48 +00:00
*fut = f * idx;
2021-02-09 20:44:35 +00:00
}
}
}
2021-02-10 18:02:48 +00:00
// First block
{
let prev = &prev[..N * ny];
block_multiply(&matrix.start, idx, ny, simdified, prev, fut0);
}
2021-02-09 20:44:35 +00:00
// Diagonal elements
{
2021-02-10 18:02:48 +00:00
let half_diag_width = (D - 1) / 2;
assert!(half_diag_width <= M);
2021-02-10 18:29:26 +00:00
let prevcol = {
let prev_ptr = prev.as_ptr();
move |i: usize| -> &[Float] {
unsafe { std::slice::from_raw_parts(prev_ptr.add(i * ny), ny) }
}
};
//let prevcol = |i: usize| -> &[Float] { &prev[i * ny..(i + 1) * ny] };
2021-02-10 18:02:48 +00:00
for (fut, ifut) in futmid.chunks_exact_mut(ny).zip(M..nx - M) {
2021-02-10 18:29:26 +00:00
let mut fut = fut.array_chunks_mut::<{ SimdT::lanes() }>();
2021-02-10 18:02:48 +00:00
for (j, fut) in fut.by_ref().enumerate() {
//let index_to_simd =
// |i| SimdT::from_slice_unaligned(&prevcol(i)[SimdT::lanes() * j..]);
let index_to_simd = |i: usize| unsafe {
let prev = std::slice::from_raw_parts(
prev.as_ptr().add(i * ny + SimdT::lanes() * j),
SimdT::lanes(),
);
SimdT::from_slice_unaligned_unchecked(prev)
};
2021-02-09 20:44:35 +00:00
let mut f = SimdT::splat(0.0);
// direct iter does not optimize well here
for (id, &d) in matrix.diag.row(0).iter().enumerate() {
2021-02-09 20:44:35 +00:00
let offset = ifut - half_diag_width + id;
f = index_to_simd(offset).mul_adde(SimdT::splat(d), f);
}
f *= idx;
f.write_to_slice_unaligned(fut);
2021-02-09 20:44:35 +00:00
}
2021-02-10 18:02:48 +00:00
for (j, fut) in (simdified..ny).zip(fut.into_remainder()) {
2021-02-09 20:44:35 +00:00
let mut f = 0.0;
for (id, &d) in matrix.diag.iter().enumerate() {
let offset = ifut - half_diag_width + id;
2021-02-10 18:02:48 +00:00
{
f += d * prevcol(offset)[j];
2021-02-09 20:44:35 +00:00
}
}
*fut = idx * f;
2021-02-09 20:44:35 +00:00
}
}
}
// End block
{
2021-02-10 18:02:48 +00:00
let prev = &prev[((nx - N) * ny)..];
block_multiply(&matrix.end, idx, ny, simdified, prev, futn);
2021-02-09 20:44:35 +00:00
}
}
2021-02-01 22:21:06 +00:00
#[inline(always)]
2021-02-01 22:08:55 +00:00
pub(crate) fn diff_op_2d_sliceable<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-18 21:37:24 +00:00
optype: OperatorType,
2021-02-01 22:08:55 +00:00
prev: ArrayView2<Float>,
mut fut: ArrayViewMut2<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[1];
for (prev, mut fut) in prev.outer_iter().zip(fut.outer_iter_mut()) {
let prev = &prev.as_slice().unwrap()[..nx];
let fut = &mut fut.as_slice_mut().unwrap()[..nx];
diff_op_1d_slice(matrix, optype, prev, fut)
}
2021-01-18 21:37:24 +00:00
}
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// Dispatch based on strides
pub(crate) fn diff_op_2d<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:13:10 +00:00
matrix: &BlockMatrix<Float, M, N, D>,
2021-01-18 21:37:24 +00:00
optype: OperatorType,
2021-02-01 22:08:55 +00:00
prev: ArrayView2<Float>,
fut: ArrayViewMut2<Float>,
) {
assert_eq!(prev.shape(), fut.shape());
match (prev.strides(), fut.strides()) {
([_, 1], [_, 1]) => diff_op_2d_sliceable(matrix, optype, prev, fut),
2021-02-10 18:02:48 +00:00
([1, _], [1, _])
if prev.as_slice_memory_order().is_some() && fut.as_slice_memory_order().is_some() =>
{
diff_op_2d_sliceable_y_simd(matrix, optype, prev, fut)
}
2021-02-01 22:08:55 +00:00
_ => diff_op_2d_fallback(matrix, optype, prev, fut),
2021-01-18 21:37:24 +00:00
}
}
2021-02-01 22:08:55 +00:00
/*
2021-01-18 21:37:24 +00:00
#[inline(always)]
2021-02-01 22:08:55 +00:00
/// Way to too much overhead with SIMD:
/// output SIMD oriented:
/// |S | = |P0 P1| |P0 P1|
/// |S | = a1|P0 P1| + b1|P0 P1|
/// |S | = |P0 P1| |P0 P1|
///
/// | S | = |P0 P1| |P0 P1|
/// | S | = a2|P0 P1| + b1|P0 P1|
/// | S | = |P0 P1| |P0 P1|
2021-01-29 16:36:05 +00:00
pub(crate) fn diff_op_col_matrix<const M: usize, const N: usize, const D: usize>(
2021-02-01 22:08:55 +00:00
matrix: &BlockMatrix<M, N, D>,
2021-01-29 16:36:05 +00:00
optype: OperatorType,
prev: ArrayView2<Float>,
2021-01-30 14:58:03 +00:00
fut: ArrayViewMut2<Float>,
2021-01-29 16:36:05 +00:00
) {
assert_eq!(prev.shape(), fut.shape());
let nx = prev.shape()[1];
assert!(nx >= 2 * M);
assert_eq!(prev.strides()[0], 1);
assert_eq!(fut.strides()[0], 1);
let dx = if optype == OperatorType::H2 {
1.0 / (nx - 2) as Float
} else {
1.0 / (nx - 1) as Float
};
let idx = 1.0 / dx;
#[cfg(not(feature = "f32"))]
type SimdT = packed_simd::f64x8;
#[cfg(feature = "f32")]
type SimdT = packed_simd::f32x16;
let ny = prev.shape()[0];
// How many elements that can be simdified
let simdified = SimdT::lanes() * (ny / SimdT::lanes());
let half_diag_width = (D - 1) / 2;
assert!(half_diag_width <= M);
let fut_stride = fut.strides()[1];
let prev_base_ptr = prev.as_ptr();
let prev_stride = prev.strides()[1];
let prev_ptr = |j, i| {
debug_assert!(j < ny && i < nx);
unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) }
};
// Not algo necessary, but gives performance increase
assert_eq!(fut_stride, prev_stride);
2021-01-30 14:58:03 +00:00
use ndarray::Axis;
let (mut fut1, fut) = fut.split_at(Axis(1), M);
let (mut fut, mut fut2) = fut.split_at(Axis(1), nx - 2 * M);
2021-01-29 16:36:05 +00:00
// First block
{
2021-01-30 14:58:03 +00:00
let prev = prev.slice(ndarray::s![.., ..N]);
let (prevb, prevl) = prev.split_at(Axis(0), simdified);
2021-02-01 22:08:55 +00:00
for (mut fut, &bl) in fut1.axis_iter_mut(Axis(1)).zip(matrix.start.iter_rows()) {
2021-01-30 14:58:03 +00:00
let fut = fut.as_slice_mut().unwrap();
let fut = &mut fut[..ny];
let mut fut = fut.chunks_exact_mut(SimdT::lanes());
let mut prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes());
for (fut, prev) in fut.by_ref().zip(prev.by_ref()) {
2021-01-29 16:36:05 +00:00
let mut f = SimdT::splat(0.0);
2021-01-30 14:58:03 +00:00
for (&bl, prev) in bl.iter().zip(prev.axis_iter(Axis(1))) {
let prev = prev.to_slice().unwrap();
let prev = SimdT::from_slice_unaligned(prev);
f = prev.mul_adde(SimdT::splat(bl), f);
2021-01-29 16:36:05 +00:00
}
f *= idx;
2021-01-30 14:58:03 +00:00
f.write_to_slice_unaligned(fut);
2021-01-29 16:36:05 +00:00
}
2021-01-30 14:58:03 +00:00
for (fut, prev) in fut
.into_remainder()
.iter_mut()
.zip(prevl.axis_iter(Axis(0)))
{
let mut f = 0.0;
for (bl, prev) in bl.iter().zip(prev.iter()) {
f += bl * prev;
2021-01-29 16:36:05 +00:00
}
2021-01-30 14:58:03 +00:00
*fut = f * idx;
2021-01-29 16:36:05 +00:00
}
}
}
// Diagonal elements
{
2021-01-30 14:58:03 +00:00
let window_elems_to_skip = M - ((D - 1) / 2);
let prev = prev.slice(ndarray::s![.., window_elems_to_skip..]);
let prev = prev.windows((ny, D));
for (mut fut, prev) in fut.axis_iter_mut(Axis(1)).zip(prev) {
let fut = fut.as_slice_mut().unwrap();
let fut = &mut fut[..ny];
let mut fut = fut.chunks_exact_mut(SimdT::lanes());
let (prevb, prevl) = prev.split_at(Axis(0), simdified);
let prev = prevb.axis_chunks_iter(Axis(0), SimdT::lanes());
for (fut, prev) in fut.by_ref().zip(prev) {
2021-01-29 16:36:05 +00:00
let mut f = SimdT::splat(0.0);
2021-02-01 22:08:55 +00:00
for (&d, prev) in matrix.diag.iter().zip(prev.axis_iter(Axis(1))) {
2021-01-30 14:58:03 +00:00
let prev = prev.to_slice().unwrap();
let prev = SimdT::from_slice_unaligned(prev);
f = prev.mul_adde(SimdT::splat(d), f);
2021-01-29 16:36:05 +00:00
}
f *= idx;
2021-01-30 14:58:03 +00:00
f.write_to_slice_unaligned(fut);
2021-01-29 16:36:05 +00:00
}
2021-01-30 14:58:03 +00:00
for (fut, prev) in fut
.into_remainder()
.into_iter()
.zip(prevl.axis_iter(Axis(0)))
{
2021-01-29 16:36:05 +00:00
let mut f = 0.0;
2021-02-01 22:08:55 +00:00
for (&d, prev) in matrix.diag.iter().zip(prev) {
2021-01-30 14:58:03 +00:00
f += d * prev;
2021-01-29 16:36:05 +00:00
}
2021-01-30 14:58:03 +00:00
*fut = idx * f;
2021-01-29 16:36:05 +00:00
}
}
}
// End block
{
2021-02-01 22:08:55 +00:00
for (mut fut, &bl) in fut2.axis_iter_mut(Axis(1)).zip(matrix.end.iter_rows()) {
2021-01-30 14:58:03 +00:00
let fut = fut.as_slice_mut().unwrap();
let fut = &mut fut[..ny];
let mut fut = fut.chunks_exact_mut(SimdT::lanes());
for (fut, j) in fut.by_ref().zip((0..simdified).step_by(SimdT::lanes())) {
2021-01-29 16:36:05 +00:00
let index_to_simd = |i| unsafe {
// j never moves past end of slice due to step_by and
// rounding down
SimdT::from_slice_unaligned(std::slice::from_raw_parts(
prev_ptr(j, i),
SimdT::lanes(),
))
};
let mut f = SimdT::splat(0.0);
for (iprev, &bl) in (nx - N..nx).zip(bl.iter()) {
f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f);
}
f *= idx;
2021-01-30 14:58:03 +00:00
f.write_to_slice_unaligned(fut);
2021-01-29 16:36:05 +00:00
}
2021-01-30 14:58:03 +00:00
for (fut, j) in fut.into_remainder().into_iter().zip(simdified..ny) {
2021-01-29 16:36:05 +00:00
unsafe {
let mut f = 0.0;
for (iprev, bl) in (nx - N..nx).zip(bl.iter()) {
f += bl * *prev_ptr(j, iprev);
}
2021-01-30 14:58:03 +00:00
*fut = f * idx;
2021-01-29 16:36:05 +00:00
}
}
}
}
}
2021-02-01 22:08:55 +00:00
*/
2021-01-29 16:36:05 +00:00
2021-01-18 21:37:24 +00:00
#[inline(always)]
2021-02-01 22:08:55 +00:00
fn dotproduct<'a>(
u: impl IntoIterator<Item = &'a Float>,
v: impl IntoIterator<Item = &'a Float>,
) -> Float {
2021-03-23 18:21:38 +00:00
u.into_iter()
.zip(v.into_iter())
.fold(0.0, |acc, (&u, &v)| Float::mul_add(u, v, acc))
2021-01-18 21:37:24 +00:00
}
#[cfg(feature = "sparse")]
/// Assembles the block-banded operator into an `n x n` sparse CSR matrix,
/// with every entry scaled by `1/dx`.
pub(crate) fn sparse_from_block<const M: usize, const N: usize, const D: usize>(
    matrix: &BlockMatrix<Float, M, N, D>,
    optype: OperatorType,
    n: usize,
) -> sprs::CsMat<Float> {
    assert!(n >= 2 * M);

    // Count the structural non-zeros so the triplet buffer is sized once.
    let nnz = {
        let blockstart_elems = matrix.start.iter().filter(|&&x| x != 0.0).count();
        let diag_elems = matrix.diag.iter().filter(|&&x| x != 0.0).count();
        let blockend_elems = matrix.end.iter().filter(|&&x| x != 0.0).count();

        blockstart_elems + (n - 2 * M) * diag_elems + blockend_elems
    };

    let mut mat = sprs::TriMat::with_capacity((n, n), nnz);

    // H2 operators have n - 2 grid intervals, standard ones n - 1.
    let dx = if optype == OperatorType::H2 {
        1.0 / (n - 2) as Float
    } else {
        1.0 / (n - 1) as Float
    };
    let idx = 1.0 / dx;

    // Start block occupies rows 0..M, columns 0..N.
    for (j, bl) in matrix.start.iter_rows().enumerate() {
        for (i, &b) in bl.iter().enumerate() {
            if b == 0.0 {
                continue;
            }
            mat.add_triplet(j, i, b * idx);
        }
    }
    // Interior rows: the stencil is placed around the diagonal.
    // NOTE(review): half-width here is D / 2, while the dense paths use
    // (D - 1) / 2 — equal only for odd D; assumes odd stencil length, confirm.
    for j in M..n - M {
        let half_diag_len = D / 2;
        for (&d, i) in matrix.diag.iter().zip(j - half_diag_len..) {
            if d == 0.0 {
                continue;
            }
            mat.add_triplet(j, i, d * idx);
        }
    }
    // End block occupies the last M rows and the last N columns.
    for (bl, j) in matrix.end.iter_rows().zip(n - M..) {
        for (&b, i) in bl.iter().zip(n - N..) {
            if b == 0.0 {
                continue;
            }
            mat.add_triplet(j, i, b * idx);
        }
    }
    mat.to_csr()
}
#[cfg(feature = "sparse")]
/// Assembles the diagonal norm matrix `H` as an `n x n` sparse CSR matrix,
/// scaled by the grid spacing.
pub(crate) fn h_matrix<const D: usize>(
    hmatrix: &DiagonalMatrix<D>,
    n: usize,
    is_h2: bool,
) -> sprs::CsMat<Float> {
    // H2 operators have n - 2 grid intervals, standard ones n - 1.
    let h = if is_h2 {
        1.0 / (n - 2) as Float
    } else {
        1.0 / (n - 1) as Float
    };
    // D start weights + (n - 2*D) interior weights + D end weights = n.
    let nmiddle = n - 2 * D;
    let diagonal = hmatrix
        .start
        .iter()
        .chain(std::iter::repeat(&hmatrix.diag).take(nmiddle))
        .chain(hmatrix.end.iter());

    let mut mat = sprs::TriMat::with_capacity((n, n), n);
    for (i, &x) in diagonal.enumerate() {
        mat.add_triplet(i, i, h * x);
    }
    mat.to_csr()
}