gather simd into diffxi
This commit is contained in:
parent
dd3e7cfb31
commit
0106ba5fbd
|
@ -11,5 +11,59 @@ pub trait UpwindOperator: SbpOperator {
|
||||||
fn disseta(prev: ArrayView2<f32>, fut: ArrayViewMut2<f32>);
|
fn disseta(prev: ArrayView2<f32>, fut: ArrayViewMut2<f32>);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! diff_op_1d {
|
||||||
|
($self: ty, $name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => {
|
||||||
|
impl $self {
|
||||||
|
fn $name(prev: ArrayView1<f32>, mut fut: ArrayViewMut1<f32>) {
|
||||||
|
assert_eq!(prev.shape(), fut.shape());
|
||||||
|
let nx = prev.shape()[0];
|
||||||
|
assert!(nx >= 2 * $BLOCK.len());
|
||||||
|
|
||||||
|
let dx = 1.0 / (nx - 1) as f32;
|
||||||
|
let idx = 1.0 / dx;
|
||||||
|
|
||||||
|
let block = ::ndarray::arr2($BLOCK);
|
||||||
|
let diag = ::ndarray::arr1($DIAG);
|
||||||
|
|
||||||
|
|
||||||
|
let first_elems = prev.slice(::ndarray::s!(..block.len_of(::ndarray::Axis(1))));
|
||||||
|
for (bl, f) in block.outer_iter().zip(&mut fut) {
|
||||||
|
let diff = first_elems.dot(&bl);
|
||||||
|
*f = diff * idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The window needs to be aligned to the diagonal elements,
|
||||||
|
// based on the block size
|
||||||
|
let window_elems_to_skip =
|
||||||
|
block.len_of(::ndarray::Axis(0)) - ((diag.len() - 1) / 2);
|
||||||
|
|
||||||
|
for (window, f) in prev
|
||||||
|
.windows(diag.len())
|
||||||
|
.into_iter()
|
||||||
|
.skip(window_elems_to_skip)
|
||||||
|
.zip(fut.iter_mut().skip(block.len_of(::ndarray::Axis(0))))
|
||||||
|
.take(nx - 2 * block.len_of(::ndarray::Axis(0)))
|
||||||
|
{
|
||||||
|
let diff = diag.dot(&window);
|
||||||
|
*f = diff * idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
let last_elems = prev.slice(::ndarray::s!(nx - block.len_of(::ndarray::Axis(1))..;-1));
|
||||||
|
for (bl, f) in block.outer_iter()
|
||||||
|
.zip(&mut fut.slice_mut(s![nx - block.len_of(::ndarray::Axis(0))..;-1]))
|
||||||
|
{
|
||||||
|
let diff = if $symmetric {
|
||||||
|
bl.dot(&last_elems)
|
||||||
|
} else {
|
||||||
|
-bl.dot(&last_elems)
|
||||||
|
};
|
||||||
|
*f = diff * idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
mod upwind4;
|
mod upwind4;
|
||||||
pub use upwind4::Upwind4;
|
pub use upwind4::Upwind4;
|
||||||
|
|
|
@ -1,11 +1,21 @@
|
||||||
use super::{SbpOperator, UpwindOperator};
|
use super::{SbpOperator, UpwindOperator};
|
||||||
use ndarray::{arr1, arr2, s, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
|
use crate::diff_op_1d;
|
||||||
|
use ndarray::{s, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis};
|
||||||
/// Simdtype used in diffeta_simd
|
|
||||||
type SimdT = packed_simd::f32x8;
|
|
||||||
|
|
||||||
pub struct Upwind4 {}
|
pub struct Upwind4 {}
|
||||||
|
|
||||||
|
/// Simdtype used in diff_simd_col
|
||||||
|
type SimdT = packed_simd::f32x8;
|
||||||
|
|
||||||
|
diff_op_1d!(Upwind4, diff_1d, Upwind4::BLOCK, Upwind4::DIAG, false);
|
||||||
|
diff_op_1d!(
|
||||||
|
Upwind4,
|
||||||
|
diss_1d,
|
||||||
|
Upwind4::DISS_BLOCK,
|
||||||
|
Upwind4::DISS_DIAG,
|
||||||
|
true
|
||||||
|
);
|
||||||
|
|
||||||
impl Upwind4 {
|
impl Upwind4 {
|
||||||
#[rustfmt::skip]
|
#[rustfmt::skip]
|
||||||
const HBLOCK: &'static [f32] = &[
|
const HBLOCK: &'static [f32] = &[
|
||||||
|
@ -23,125 +33,69 @@ impl Upwind4 {
|
||||||
[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
|
[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
|
||||||
];
|
];
|
||||||
|
|
||||||
const DISS_BLOCK: [[f32; 7]; 4] = [
|
#[rustfmt::skip]
|
||||||
[
|
const DISS_BLOCK: &'static [[f32; 7]; 4] = &[
|
||||||
-3.0 / 49.0,
|
[-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0],
|
||||||
9.0 / 49.0,
|
[ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0],
|
||||||
-9.0 / 49.0,
|
[-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0],
|
||||||
3.0 / 49.0,
|
[3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
|
||||||
0.0,
|
|
||||||
0.0,
|
|
||||||
0.0,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
3.0 / 61.0,
|
|
||||||
-11.0 / 61.0,
|
|
||||||
15.0 / 61.0,
|
|
||||||
-9.0 / 61.0,
|
|
||||||
2.0 / 61.0,
|
|
||||||
0.0,
|
|
||||||
0.0,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
-3.0 / 41.0,
|
|
||||||
15.0 / 41.0,
|
|
||||||
-29.0 / 41.0,
|
|
||||||
27.0 / 41.0,
|
|
||||||
-12.0 / 41.0,
|
|
||||||
2.0 / 41.0,
|
|
||||||
0.0,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
3.0 / 149.0,
|
|
||||||
-27.0 / 149.0,
|
|
||||||
81.0 / 149.0,
|
|
||||||
-117.0 / 149.0,
|
|
||||||
90.0 / 149.0,
|
|
||||||
-36.0 / 149.0,
|
|
||||||
6.0 / 149.0,
|
|
||||||
],
|
|
||||||
];
|
];
|
||||||
|
|
||||||
const DISS_DIAG: [f32; 7] = [
|
#[rustfmt::skip]
|
||||||
1.0 / 24.0,
|
const DISS_DIAG: &'static [f32; 7] = &[
|
||||||
-1.0 / 4.0,
|
1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
|
||||||
5.0 / 8.0,
|
|
||||||
-5.0 / 6.0,
|
|
||||||
5.0 / 8.0,
|
|
||||||
-1.0 / 4.0,
|
|
||||||
1.0 / 24.0,
|
|
||||||
];
|
];
|
||||||
|
|
||||||
#[inline(never)]
|
#[inline(never)]
|
||||||
fn diff_simd(prev: &[f32], fut: &mut [f32]) {
|
fn diff_simd_row(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
|
||||||
use packed_simd::{f32x8, u32x8};
|
use packed_simd::{f32x8, u32x8};
|
||||||
assert_eq!(prev.len(), fut.len());
|
assert_eq!(prev.shape(), fut.shape());
|
||||||
assert!(prev.len() >= 2 * Self::BLOCK.len());
|
assert!(prev.len_of(Axis(1)) >= 2 * Self::BLOCK.len());
|
||||||
let nx = prev.len();
|
assert!(prev.len() >= f32x8::lanes());
|
||||||
|
// The prev array must have contigous last dimension
|
||||||
|
assert_eq!(prev.strides()[1], 1);
|
||||||
|
|
||||||
|
let nx = prev.len_of(Axis(1));
|
||||||
let dx = 1.0 / (nx - 1) as f32;
|
let dx = 1.0 / (nx - 1) as f32;
|
||||||
let idx = 1.0 / dx;
|
let idx = 1.0 / dx;
|
||||||
|
|
||||||
let first_elems = unsafe { f32x8::from_slice_unaligned_unchecked(prev) };
|
for j in 0..prev.len_of(Axis(0)) {
|
||||||
let block = [
|
//use std::slice;
|
||||||
f32x8::new(
|
//let prev = unsafe { slice::from_raw_parts(prev.slice(s![j, ..]).as_ptr(), nx) };
|
||||||
Self::BLOCK[0][0],
|
let prev = prev.slice(s![j, ..]);
|
||||||
Self::BLOCK[0][1],
|
let prev = prev.as_slice_memory_order().unwrap();
|
||||||
Self::BLOCK[0][2],
|
|
||||||
Self::BLOCK[0][3],
|
|
||||||
Self::BLOCK[0][4],
|
|
||||||
Self::BLOCK[0][5],
|
|
||||||
Self::BLOCK[0][6],
|
|
||||||
0.0,
|
|
||||||
),
|
|
||||||
f32x8::new(
|
|
||||||
Self::BLOCK[1][0],
|
|
||||||
Self::BLOCK[1][1],
|
|
||||||
Self::BLOCK[1][2],
|
|
||||||
Self::BLOCK[1][3],
|
|
||||||
Self::BLOCK[1][4],
|
|
||||||
Self::BLOCK[1][5],
|
|
||||||
Self::BLOCK[1][6],
|
|
||||||
0.0,
|
|
||||||
),
|
|
||||||
f32x8::new(
|
|
||||||
Self::BLOCK[2][0],
|
|
||||||
Self::BLOCK[2][1],
|
|
||||||
Self::BLOCK[2][2],
|
|
||||||
Self::BLOCK[2][3],
|
|
||||||
Self::BLOCK[2][4],
|
|
||||||
Self::BLOCK[2][5],
|
|
||||||
Self::BLOCK[2][6],
|
|
||||||
0.0,
|
|
||||||
),
|
|
||||||
f32x8::new(
|
|
||||||
Self::BLOCK[3][0],
|
|
||||||
Self::BLOCK[3][1],
|
|
||||||
Self::BLOCK[3][2],
|
|
||||||
Self::BLOCK[3][3],
|
|
||||||
Self::BLOCK[3][4],
|
|
||||||
Self::BLOCK[3][5],
|
|
||||||
Self::BLOCK[3][6],
|
|
||||||
0.0,
|
|
||||||
),
|
|
||||||
];
|
|
||||||
unsafe {
|
|
||||||
*fut.get_unchecked_mut(0) = idx * (block[0] * first_elems).sum();
|
|
||||||
*fut.get_unchecked_mut(1) = idx * (block[1] * first_elems).sum();
|
|
||||||
*fut.get_unchecked_mut(2) = idx * (block[2] * first_elems).sum();
|
|
||||||
*fut.get_unchecked_mut(3) = idx * (block[3] * first_elems).sum()
|
|
||||||
};
|
|
||||||
|
|
||||||
let diag = f32x8::new(
|
let first_elems = unsafe { f32x8::from_slice_unaligned_unchecked(prev) };
|
||||||
Self::DIAG[0],
|
let block = {
|
||||||
Self::DIAG[1],
|
let bl = Self::BLOCK;
|
||||||
Self::DIAG[2],
|
[
|
||||||
Self::DIAG[3],
|
f32x8::new(
|
||||||
Self::DIAG[4],
|
bl[0][0], bl[0][1], bl[0][2], bl[0][3], bl[0][4], bl[0][5], bl[0][6], 0.0,
|
||||||
Self::DIAG[5],
|
),
|
||||||
Self::DIAG[6],
|
f32x8::new(
|
||||||
0.0,
|
bl[1][0], bl[1][1], bl[1][2], bl[1][3], bl[1][4], bl[1][5], bl[1][6], 0.0,
|
||||||
);
|
),
|
||||||
|
f32x8::new(
|
||||||
|
bl[2][0], bl[2][1], bl[2][2], bl[2][3], bl[2][4], bl[2][5], bl[2][6], 0.0,
|
||||||
|
),
|
||||||
|
f32x8::new(
|
||||||
|
bl[3][0], bl[3][1], bl[3][2], bl[3][3], bl[3][4], bl[3][5], bl[3][6], 0.0,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
};
|
||||||
|
fut[(j, 0)] = idx * (block[0] * first_elems).sum();
|
||||||
|
fut[(j, 1)] = idx * (block[1] * first_elems).sum();
|
||||||
|
fut[(j, 2)] = idx * (block[2] * first_elems).sum();
|
||||||
|
fut[(j, 3)] = idx * (block[3] * first_elems).sum();
|
||||||
|
|
||||||
|
let diag = {
|
||||||
|
let diag = Self::DIAG;
|
||||||
|
f32x8::new(
|
||||||
|
diag[0], diag[1], diag[2], diag[3], diag[4], diag[5], diag[6], 0.0,
|
||||||
|
)
|
||||||
|
};
|
||||||
for (f, p) in fut
|
for (f, p) in fut
|
||||||
|
.slice_mut(s![j, ..])
|
||||||
.iter_mut()
|
.iter_mut()
|
||||||
.skip(block.len())
|
.skip(block.len())
|
||||||
.zip(
|
.zip(
|
||||||
|
@ -156,38 +110,64 @@ impl Upwind4 {
|
||||||
|
|
||||||
let last_elems = unsafe { f32x8::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
|
let last_elems = unsafe { f32x8::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
|
||||||
.shuffle1_dyn(u32x8::new(7, 6, 5, 4, 3, 2, 1, 0));
|
.shuffle1_dyn(u32x8::new(7, 6, 5, 4, 3, 2, 1, 0));
|
||||||
unsafe {
|
fut[(j, nx - 4)] = -idx * (block[3] * last_elems).sum();
|
||||||
*fut.get_unchecked_mut(nx - 4) = -idx * (block[3] * last_elems).sum();
|
fut[(j, nx - 3)] = -idx * (block[2] * last_elems).sum();
|
||||||
*fut.get_unchecked_mut(nx - 3) = -idx * (block[2] * last_elems).sum();
|
fut[(j, nx - 2)] = -idx * (block[1] * last_elems).sum();
|
||||||
*fut.get_unchecked_mut(nx - 2) = -idx * (block[1] * last_elems).sum();
|
fut[(j, nx - 1)] = -idx * (block[0] * last_elems).sum();
|
||||||
*fut.get_unchecked_mut(nx - 1) = -idx * (block[0] * last_elems).sum();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(never)]
|
#[inline(never)]
|
||||||
fn diffeta_simd(prev: &[f32], fut: &mut [f32], nx: usize, ny: usize) {
|
fn diff_simd_col(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
|
||||||
assert!(ny >= 2 * Self::BLOCK.len());
|
use std::slice;
|
||||||
assert!(nx >= SimdT::lanes());
|
assert_eq!(prev.shape(), fut.shape());
|
||||||
assert!(nx % SimdT::lanes() == 0);
|
assert_eq!(prev.stride_of(Axis(0)), 1);
|
||||||
assert_eq!(prev.len(), fut.len());
|
assert_eq!(prev.stride_of(Axis(0)), 1);
|
||||||
assert_eq!(prev.len(), nx * ny);
|
let ny = prev.len_of(Axis(0));
|
||||||
|
let nx = prev.len_of(Axis(1));
|
||||||
|
assert!(nx >= 2 * Self::BLOCK.len());
|
||||||
|
assert!(ny >= SimdT::lanes());
|
||||||
|
assert!(ny % SimdT::lanes() == 0);
|
||||||
|
|
||||||
let dy = 1.0 / (ny - 1) as f32;
|
let dx = 1.0 / (nx - 1) as f32;
|
||||||
let idy = 1.0 / dy;
|
let idx = 1.0 / dx;
|
||||||
|
|
||||||
for j in (0..nx).step_by(SimdT::lanes()) {
|
for j in (0..ny).step_by(SimdT::lanes()) {
|
||||||
let a = [
|
let a = unsafe {
|
||||||
SimdT::from_slice_unaligned(&prev[0 * nx + j..]),
|
[
|
||||||
SimdT::from_slice_unaligned(&prev[1 * nx + j..]),
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
SimdT::from_slice_unaligned(&prev[2 * nx + j..]),
|
prev.slice(s![j.., 0]).as_ptr(),
|
||||||
SimdT::from_slice_unaligned(&prev[3 * nx + j..]),
|
SimdT::lanes(),
|
||||||
SimdT::from_slice_unaligned(&prev[4 * nx + j..]),
|
)),
|
||||||
SimdT::from_slice_unaligned(&prev[5 * nx + j..]),
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
SimdT::from_slice_unaligned(&prev[6 * nx + j..]),
|
prev.slice(s![j.., 1]).as_ptr(),
|
||||||
];
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., 2]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., 3]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., 4]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., 5]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., 6]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
]
|
||||||
|
};
|
||||||
|
|
||||||
for (i, bl) in Self::BLOCK.iter().enumerate() {
|
for (i, bl) in Self::BLOCK.iter().enumerate() {
|
||||||
let b = idy
|
let b = idx
|
||||||
* (a[0] * bl[0]
|
* (a[0] * bl[0]
|
||||||
+ a[1] * bl[1]
|
+ a[1] * bl[1]
|
||||||
+ a[2] * bl[2]
|
+ a[2] * bl[2]
|
||||||
|
@ -195,22 +175,24 @@ impl Upwind4 {
|
||||||
+ a[4] * bl[4]
|
+ a[4] * bl[4]
|
||||||
+ a[5] * bl[5]
|
+ a[5] * bl[5]
|
||||||
+ a[6] * bl[6]);
|
+ a[6] * bl[6]);
|
||||||
b.write_to_slice_unaligned(&mut fut[i * nx + j..]);
|
unsafe {
|
||||||
|
b.write_to_slice_unaligned(slice::from_raw_parts_mut(
|
||||||
|
fut.slice_mut(s![j.., i]).as_mut_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut a = a;
|
let mut a = a;
|
||||||
for i in Self::BLOCK.len()..ny - Self::BLOCK.len() {
|
for i in Self::BLOCK.len()..nx - Self::BLOCK.len() {
|
||||||
// Push a onto circular buffer
|
// Push a onto circular buffer
|
||||||
a = [
|
a = [a[1], a[2], a[3], a[4], a[5], a[6], unsafe {
|
||||||
a[1],
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
a[2],
|
prev.slice(s![j.., i + 3]).as_ptr(),
|
||||||
a[3],
|
SimdT::lanes(),
|
||||||
a[4],
|
))
|
||||||
a[5],
|
}];
|
||||||
a[6],
|
let b = idx
|
||||||
SimdT::from_slice_unaligned(&prev[nx * (i + 3) + j..]),
|
|
||||||
];
|
|
||||||
let b = idy
|
|
||||||
* (a[0] * Self::DIAG[0]
|
* (a[0] * Self::DIAG[0]
|
||||||
+ a[1] * Self::DIAG[1]
|
+ a[1] * Self::DIAG[1]
|
||||||
+ a[2] * Self::DIAG[2]
|
+ a[2] * Self::DIAG[2]
|
||||||
|
@ -218,21 +200,49 @@ impl Upwind4 {
|
||||||
+ a[4] * Self::DIAG[4]
|
+ a[4] * Self::DIAG[4]
|
||||||
+ a[5] * Self::DIAG[5]
|
+ a[5] * Self::DIAG[5]
|
||||||
+ a[6] * Self::DIAG[6]);
|
+ a[6] * Self::DIAG[6]);
|
||||||
b.write_to_slice_unaligned(&mut fut[nx * i + j..]);
|
unsafe {
|
||||||
|
b.write_to_slice_unaligned(slice::from_raw_parts_mut(
|
||||||
|
fut.slice_mut(s![j.., i]).as_mut_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let a = [
|
let a = unsafe {
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 1) * nx + j..]),
|
[
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 2) * nx + j..]),
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 3) * nx + j..]),
|
prev.slice(s![j.., nx - 1]).as_ptr(),
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 4) * nx + j..]),
|
SimdT::lanes(),
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 5) * nx + j..]),
|
)),
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 6) * nx + j..]),
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
SimdT::from_slice_unaligned(&prev[(ny - 7) * nx + j..]),
|
prev.slice(s![j.., nx - 2]).as_ptr(),
|
||||||
];
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., nx - 3]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., nx - 4]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., nx - 5]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., nx - 6]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
SimdT::from_slice_unaligned(slice::from_raw_parts(
|
||||||
|
prev.slice(s![j.., nx - 7]).as_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
)),
|
||||||
|
]
|
||||||
|
};
|
||||||
|
|
||||||
for (i, bl) in Self::BLOCK.iter().enumerate() {
|
for (i, bl) in Self::BLOCK.iter().enumerate() {
|
||||||
let b = -idy
|
let b = -idx
|
||||||
* (a[0] * bl[0]
|
* (a[0] * bl[0]
|
||||||
+ a[1] * bl[1]
|
+ a[1] * bl[1]
|
||||||
+ a[2] * bl[2]
|
+ a[2] * bl[2]
|
||||||
|
@ -240,50 +250,16 @@ impl Upwind4 {
|
||||||
+ a[4] * bl[4]
|
+ a[4] * bl[4]
|
||||||
+ a[5] * bl[5]
|
+ a[5] * bl[5]
|
||||||
+ a[6] * bl[6]);
|
+ a[6] * bl[6]);
|
||||||
b.write_to_slice_unaligned(&mut fut[(ny - 1 - i) * nx + j..]);
|
unsafe {
|
||||||
|
b.write_to_slice_unaligned(slice::from_raw_parts_mut(
|
||||||
|
fut.slice_mut(s![j.., nx - 1 - i]).as_mut_ptr(),
|
||||||
|
SimdT::lanes(),
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn diff(prev: ArrayView1<f32>, mut fut: ArrayViewMut1<f32>) {
|
|
||||||
assert_eq!(prev.shape(), fut.shape());
|
|
||||||
let nx = prev.shape()[0];
|
|
||||||
assert!(nx >= 2 * Self::BLOCK.len());
|
|
||||||
|
|
||||||
if let (Some(p), Some(f)) = (prev.as_slice(), fut.as_slice_mut()) {
|
|
||||||
Self::diff_simd(p, f);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let dx = 1.0 / (nx - 1) as f32;
|
|
||||||
let idx = 1.0 / dx;
|
|
||||||
|
|
||||||
let diag = arr1(Self::DIAG);
|
|
||||||
let block = arr2(Self::BLOCK);
|
|
||||||
|
|
||||||
let first_elems = prev.slice(s!(..7));
|
|
||||||
for (bl, f) in block.outer_iter().zip(&mut fut) {
|
|
||||||
let diff = first_elems.dot(&bl);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (window, f) in prev
|
|
||||||
.windows(diag.len())
|
|
||||||
.into_iter()
|
|
||||||
.skip(1)
|
|
||||||
.zip(fut.iter_mut().skip(4))
|
|
||||||
.take(nx - 8)
|
|
||||||
{
|
|
||||||
let diff = diag.dot(&window);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
let last_elems = prev.slice(s!(nx - 7..;-1));
|
|
||||||
for (bl, f) in block.outer_iter().zip(&mut fut.slice_mut(s![nx - 4..;-1])) {
|
|
||||||
let diff = -bl.dot(&last_elems);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#[inline(never)]
|
#[inline(never)]
|
||||||
fn diss_simd(prev: &[f32], fut: &mut [f32]) {
|
fn diss_simd(prev: &[f32], fut: &mut [f32]) {
|
||||||
use packed_simd::{f32x8, u32x8};
|
use packed_simd::{f32x8, u32x8};
|
||||||
|
@ -456,69 +432,32 @@ impl Upwind4 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn diss(prev: ArrayView1<f32>, mut fut: ArrayViewMut1<f32>) {
|
|
||||||
assert_eq!(prev.shape(), fut.shape());
|
|
||||||
let nx = prev.shape()[0];
|
|
||||||
assert!(nx >= 2 * Self::DISS_BLOCK.len());
|
|
||||||
|
|
||||||
if let (Some(p), Some(f)) = (prev.as_slice(), fut.as_slice_mut()) {
|
|
||||||
Self::diss_simd(p, f);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let dx = 1.0 / (nx - 1) as f32;
|
|
||||||
let idx = 1.0 / dx;
|
|
||||||
|
|
||||||
let diag = arr1(&Self::DISS_DIAG);
|
|
||||||
let block = arr2(&Self::DISS_BLOCK);
|
|
||||||
|
|
||||||
let first_elems = prev.slice(s!(..7));
|
|
||||||
for (bl, f) in block.outer_iter().zip(&mut fut) {
|
|
||||||
let diff = first_elems.dot(&bl);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (window, f) in prev
|
|
||||||
.windows(diag.len())
|
|
||||||
.into_iter()
|
|
||||||
.skip(1)
|
|
||||||
.zip(fut.iter_mut().skip(4))
|
|
||||||
.take(nx - 8)
|
|
||||||
{
|
|
||||||
let diff = diag.dot(&window);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
let last_elems = prev.slice(s!(nx - 7..;-1));
|
|
||||||
for (bl, f) in block.outer_iter().zip(&mut fut.slice_mut(s![nx - 4..;-1])) {
|
|
||||||
let diff = bl.dot(&last_elems);
|
|
||||||
*f = diff * idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SbpOperator for Upwind4 {
|
impl SbpOperator for Upwind4 {
|
||||||
fn diffxi(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
|
fn diffxi(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
|
||||||
assert_eq!(prev.shape(), fut.shape());
|
assert_eq!(prev.shape(), fut.shape());
|
||||||
assert!(prev.shape()[1] >= 2 * Self::BLOCK.len());
|
assert!(prev.shape()[1] >= 2 * Self::BLOCK.len());
|
||||||
|
|
||||||
|
match (prev.strides(), fut.strides()) {
|
||||||
|
([_, 1], [_, _]) => {
|
||||||
|
Self::diff_simd_row(prev, fut);
|
||||||
|
}
|
||||||
|
([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => {
|
||||||
|
Self::diff_simd_col(prev, fut);
|
||||||
|
}
|
||||||
|
([_, _], [_, _]) => {
|
||||||
|
// Fallback, work row by row
|
||||||
for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
|
for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
|
||||||
Self::diff(r0, r1)
|
Self::diff_1d(r0, r1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => unreachable!("Should only be two elements in the strides vectors"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn diffeta(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
|
fn diffeta(prev: ArrayView2<f32>, fut: ArrayViewMut2<f32>) {
|
||||||
assert_eq!(prev.shape(), fut.shape());
|
// transpose then use diffxi
|
||||||
assert!(prev.shape()[0] >= 2 * Self::BLOCK.len());
|
|
||||||
let nx = prev.shape()[1];
|
|
||||||
let ny = prev.shape()[0];
|
|
||||||
if nx >= SimdT::lanes() && nx % SimdT::lanes() == 0 {
|
|
||||||
if let (Some(p), Some(f)) = (prev.as_slice(), fut.as_slice_mut()) {
|
|
||||||
Self::diffeta_simd(p, f, nx, ny);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// diffeta = transpose then use diffxi
|
|
||||||
Self::diffxi(prev.reversed_axes(), fut.reversed_axes());
|
Self::diffxi(prev.reversed_axes(), fut.reversed_axes());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -541,7 +480,7 @@ fn upwind4_test() {
|
||||||
target[i] = 1.0;
|
target[i] = 1.0;
|
||||||
}
|
}
|
||||||
res.fill(0.0);
|
res.fill(0.0);
|
||||||
Upwind4::diff(source.view(), res.view_mut());
|
Upwind4::diff_1d(source.view(), res.view_mut());
|
||||||
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
|
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
|
||||||
{
|
{
|
||||||
let source = source.to_owned().insert_axis(ndarray::Axis(0));
|
let source = source.to_owned().insert_axis(ndarray::Axis(0));
|
||||||
|
@ -567,7 +506,7 @@ fn upwind4_test() {
|
||||||
target[i] = 2.0 * x;
|
target[i] = 2.0 * x;
|
||||||
}
|
}
|
||||||
res.fill(0.0);
|
res.fill(0.0);
|
||||||
Upwind4::diff(source.view(), res.view_mut());
|
Upwind4::diff_1d(source.view(), res.view_mut());
|
||||||
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
|
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
|
||||||
{
|
{
|
||||||
let source = source.to_owned().insert_axis(ndarray::Axis(0));
|
let source = source.to_owned().insert_axis(ndarray::Axis(0));
|
||||||
|
@ -593,7 +532,7 @@ fn upwind4_test() {
|
||||||
target[i] = 3.0 * x * x;
|
target[i] = 3.0 * x * x;
|
||||||
}
|
}
|
||||||
res.fill(0.0);
|
res.fill(0.0);
|
||||||
Upwind4::diff(source.view(), res.view_mut());
|
Upwind4::diff_1d(source.view(), res.view_mut());
|
||||||
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);
|
approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -620,7 +559,7 @@ impl UpwindOperator for Upwind4 {
|
||||||
assert_eq!(prev.shape(), fut.shape());
|
assert_eq!(prev.shape(), fut.shape());
|
||||||
assert!(prev.shape()[1] >= 2 * Self::DISS_BLOCK.len());
|
assert!(prev.shape()[1] >= 2 * Self::DISS_BLOCK.len());
|
||||||
for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
|
for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
|
||||||
Self::diss(r0, r1)
|
Self::diss_1d(r0, r1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue