macrofy diff/diss operators

This commit is contained in:
Magnus Ulimoen 2019-12-14 01:29:07 +01:00
parent 3a919e9950
commit 195bbc3ef2
1 changed files with 276 additions and 401 deletions

View File

@ -16,41 +16,14 @@ diff_op_1d!(
true
);
impl Upwind4 {
#[rustfmt::skip]
const HBLOCK: &'static [f32] = &[
49.0 / 144.0, 61.0 / 48.0, 41.0 / 48.0, 149.0 / 144.0
];
#[rustfmt::skip]
const DIAG: &'static [f32] = &[
-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
];
#[rustfmt::skip]
const BLOCK: &'static [[f32; 7]] = &[
[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0],
[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0],
[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0],
[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
];
#[rustfmt::skip]
const DISS_BLOCK: &'static [[f32; 7]; 4] = &[
[-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0],
[ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0],
[-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0],
[3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
];
#[rustfmt::skip]
const DISS_DIAG: &'static [f32; 7] = &[
1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
];
macro_rules! diff_simd_row_7_47 {
($self: ident, $name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => {
impl $self {
#[inline(never)]
fn diff_simd_row(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
fn $name(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
use packed_simd::{f32x8, u32x8};
assert_eq!(prev.shape(), fut.shape());
assert!(prev.len_of(Axis(1)) >= 2 * Self::BLOCK.len());
assert!(prev.len_of(Axis(1)) >= 2 * $BLOCK.len());
assert!(prev.len() >= f32x8::lanes());
// The prev array must have contigous last dimension
assert_eq!(prev.strides()[1], 1);
@ -61,25 +34,31 @@ impl Upwind4 {
for j in 0..prev.len_of(Axis(0)) {
use std::slice;
let prev = unsafe { slice::from_raw_parts(prev.uget((j, 0)) as *const f32, nx) };
let fut = unsafe { slice::from_raw_parts_mut(fut.uget_mut((j, 0)) as *mut f32, nx) };
let prev =
unsafe { slice::from_raw_parts(prev.uget((j, 0)) as *const f32, nx) };
let fut =
unsafe { slice::from_raw_parts_mut(fut.uget_mut((j, 0)) as *mut f32, nx) };
//let mut fut = fut.slice_mut(s![j, ..]);
let first_elems = unsafe { f32x8::from_slice_unaligned_unchecked(prev) };
let block = {
let bl = Self::BLOCK;
let bl = $BLOCK;
[
f32x8::new(
bl[0][0], bl[0][1], bl[0][2], bl[0][3], bl[0][4], bl[0][5], bl[0][6], 0.0,
bl[0][0], bl[0][1], bl[0][2], bl[0][3], bl[0][4], bl[0][5],
bl[0][6], 0.0,
),
f32x8::new(
bl[1][0], bl[1][1], bl[1][2], bl[1][3], bl[1][4], bl[1][5], bl[1][6], 0.0,
bl[1][0], bl[1][1], bl[1][2], bl[1][3], bl[1][4], bl[1][5],
bl[1][6], 0.0,
),
f32x8::new(
bl[2][0], bl[2][1], bl[2][2], bl[2][3], bl[2][4], bl[2][5], bl[2][6], 0.0,
bl[2][0], bl[2][1], bl[2][2], bl[2][3], bl[2][4], bl[2][5],
bl[2][6], 0.0,
),
f32x8::new(
bl[3][0], bl[3][1], bl[3][2], bl[3][3], bl[3][4], bl[3][5], bl[3][6], 0.0,
bl[3][0], bl[3][1], bl[3][2], bl[3][3], bl[3][4], bl[3][5],
bl[3][6], 0.0,
),
]
};
@ -89,7 +68,7 @@ impl Upwind4 {
fut[3] = idx * (block[3] * first_elems).sum();
let diag = {
let diag = Self::DIAG;
let diag = $DIAG;
f32x8::new(
diag[0], diag[1], diag[2], diag[3], diag[4], diag[5], diag[6], 0.0,
)
@ -107,24 +86,47 @@ impl Upwind4 {
*f = idx * (p * diag).sum();
}
let last_elems = unsafe { f32x8::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
let last_elems =
unsafe { f32x8::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
.shuffle1_dyn(u32x8::new(7, 6, 5, 4, 3, 2, 1, 0));
if $symmetric {
fut[nx - 4] = idx * (block[3] * last_elems).sum();
fut[nx - 3] = idx * (block[2] * last_elems).sum();
fut[nx - 2] = idx * (block[1] * last_elems).sum();
fut[nx - 1] = idx * (block[0] * last_elems).sum();
} else {
fut[nx - 4] = -idx * (block[3] * last_elems).sum();
fut[nx - 3] = -idx * (block[2] * last_elems).sum();
fut[nx - 2] = -idx * (block[1] * last_elems).sum();
fut[nx - 1] = -idx * (block[0] * last_elems).sum();
}
}
}
}
};
}
diff_simd_row_7_47!(Upwind4, diff_simd_row, Upwind4::BLOCK, Upwind4::DIAG, false);
diff_simd_row_7_47!(
Upwind4,
diss_simd_row,
Upwind4::DISS_BLOCK,
Upwind4::DISS_DIAG,
true
);
macro_rules! diff_simd_col_7_47 {
($self: ident, $name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => {
impl $self {
#[inline(never)]
fn diff_simd_col(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
fn $name(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
use std::slice;
assert_eq!(prev.shape(), fut.shape());
assert_eq!(prev.stride_of(Axis(0)), 1);
assert_eq!(prev.stride_of(Axis(0)), 1);
let ny = prev.len_of(Axis(0));
let nx = prev.len_of(Axis(1));
assert!(nx >= 2 * Self::BLOCK.len());
assert!(nx >= 2 * $BLOCK.len());
assert!(ny >= SimdT::lanes());
assert!(ny % SimdT::lanes() == 0);
@ -165,7 +167,7 @@ impl Upwind4 {
]
};
for (i, bl) in Self::BLOCK.iter().enumerate() {
for (i, bl) in $BLOCK.iter().enumerate() {
let b = idx
* (a[0] * bl[0]
+ a[1] * bl[1]
@ -183,7 +185,7 @@ impl Upwind4 {
}
let mut a = a;
for i in Self::BLOCK.len()..nx - Self::BLOCK.len() {
for i in $BLOCK.len()..nx - $BLOCK.len() {
// Push a onto circular buffer
a = [a[1], a[2], a[3], a[4], a[5], a[6], unsafe {
SimdT::from_slice_unaligned(slice::from_raw_parts(
@ -192,13 +194,13 @@ impl Upwind4 {
))
}];
let b = idx
* (a[0] * Self::DIAG[0]
+ a[1] * Self::DIAG[1]
+ a[2] * Self::DIAG[2]
+ a[3] * Self::DIAG[3]
+ a[4] * Self::DIAG[4]
+ a[5] * Self::DIAG[5]
+ a[6] * Self::DIAG[6]);
* (a[0] * $DIAG[0]
+ a[1] * $DIAG[1]
+ a[2] * $DIAG[2]
+ a[3] * $DIAG[3]
+ a[4] * $DIAG[4]
+ a[5] * $DIAG[5]
+ a[6] * $DIAG[6]);
unsafe {
b.write_to_slice_unaligned(slice::from_raw_parts_mut(
fut.uget_mut((j, i)) as *mut f32,
@ -240,8 +242,9 @@ impl Upwind4 {
]
};
for (i, bl) in Self::BLOCK.iter().enumerate() {
let b = -idx
for (i, bl) in $BLOCK.iter().enumerate() {
let idx = if $symmetric { idx } else { -idx };
let b = idx
* (a[0] * bl[0]
+ a[1] * bl[1]
+ a[2] * bl[2]
@ -258,179 +261,48 @@ impl Upwind4 {
}
}
}
#[inline(never)]
fn diss_simd(prev: &[f32], fut: &mut [f32]) {
use packed_simd::{f32x8, u32x8};
assert_eq!(prev.len(), fut.len());
assert!(prev.len() >= 2 * Self::DISS_BLOCK.len());
let nx = prev.len();
let dx = 1.0 / (nx - 1) as f32;
let idx = 1.0 / dx;
let first_elems = unsafe { f32x8::from_slice_unaligned_unchecked(prev) };
let block = [
f32x8::new(
Self::DISS_BLOCK[0][0],
Self::DISS_BLOCK[0][1],
Self::DISS_BLOCK[0][2],
Self::DISS_BLOCK[0][3],
Self::DISS_BLOCK[0][4],
Self::DISS_BLOCK[0][5],
Self::DISS_BLOCK[0][6],
0.0,
),
f32x8::new(
Self::DISS_BLOCK[1][0],
Self::DISS_BLOCK[1][1],
Self::DISS_BLOCK[1][2],
Self::DISS_BLOCK[1][3],
Self::DISS_BLOCK[1][4],
Self::DISS_BLOCK[1][5],
Self::DISS_BLOCK[1][6],
0.0,
),
f32x8::new(
Self::DISS_BLOCK[2][0],
Self::DISS_BLOCK[2][1],
Self::DISS_BLOCK[2][2],
Self::DISS_BLOCK[2][3],
Self::DISS_BLOCK[2][4],
Self::DISS_BLOCK[2][5],
Self::DISS_BLOCK[2][6],
0.0,
),
f32x8::new(
Self::DISS_BLOCK[3][0],
Self::DISS_BLOCK[3][1],
Self::DISS_BLOCK[3][2],
Self::DISS_BLOCK[3][3],
Self::DISS_BLOCK[3][4],
Self::DISS_BLOCK[3][5],
Self::DISS_BLOCK[3][6],
0.0,
),
];
unsafe {
*fut.get_unchecked_mut(0) = idx * (block[0] * first_elems).sum();
*fut.get_unchecked_mut(1) = idx * (block[1] * first_elems).sum();
*fut.get_unchecked_mut(2) = idx * (block[2] * first_elems).sum();
*fut.get_unchecked_mut(3) = idx * (block[3] * first_elems).sum()
}
};
}
let diag = f32x8::new(
Self::DISS_DIAG[0],
Self::DISS_DIAG[1],
Self::DISS_DIAG[2],
Self::DISS_DIAG[3],
Self::DISS_DIAG[4],
Self::DISS_DIAG[5],
Self::DISS_DIAG[6],
0.0,
diff_simd_col_7_47!(Upwind4, diff_simd_col, Upwind4::BLOCK, Upwind4::DIAG, false);
diff_simd_col_7_47!(
Upwind4,
diss_simd_col,
Upwind4::DISS_BLOCK,
Upwind4::DISS_DIAG,
true
);
for (f, p) in fut
.iter_mut()
.skip(block.len())
.zip(
prev.windows(f32x8::lanes())
.map(f32x8::from_slice_unaligned)
.skip(1),
)
.take(nx - 2 * block.len())
{
*f = idx * (p * diag).sum();
}
let last_elems = unsafe { f32x8::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
.shuffle1_dyn(u32x8::new(7, 6, 5, 4, 3, 2, 1, 0));
unsafe {
*fut.get_unchecked_mut(nx - 4) = idx * (block[3] * last_elems).sum();
*fut.get_unchecked_mut(nx - 3) = idx * (block[2] * last_elems).sum();
*fut.get_unchecked_mut(nx - 2) = idx * (block[1] * last_elems).sum();
*fut.get_unchecked_mut(nx - 1) = idx * (block[0] * last_elems).sum();
}
}
#[inline(never)]
fn disseta_simd(prev: &[f32], fut: &mut [f32], nx: usize, ny: usize) {
assert!(ny >= 2 * Self::DISS_BLOCK.len());
assert!(nx >= SimdT::lanes());
assert!(nx % SimdT::lanes() == 0);
assert_eq!(prev.len(), fut.len());
assert_eq!(prev.len(), nx * ny);
let dy = 1.0 / (ny - 1) as f32;
let idy = 1.0 / dy;
for j in (0..nx).step_by(SimdT::lanes()) {
let a = [
SimdT::from_slice_unaligned(&prev[0 * nx + j..]),
SimdT::from_slice_unaligned(&prev[1 * nx + j..]),
SimdT::from_slice_unaligned(&prev[2 * nx + j..]),
SimdT::from_slice_unaligned(&prev[3 * nx + j..]),
SimdT::from_slice_unaligned(&prev[4 * nx + j..]),
SimdT::from_slice_unaligned(&prev[5 * nx + j..]),
SimdT::from_slice_unaligned(&prev[6 * nx + j..]),
impl Upwind4 {
#[rustfmt::skip]
const HBLOCK: &'static [f32] = &[
49.0 / 144.0, 61.0 / 48.0, 41.0 / 48.0, 149.0 / 144.0
];
#[rustfmt::skip]
const DIAG: &'static [f32] = &[
-1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
];
#[rustfmt::skip]
const BLOCK: &'static [[f32; 7]] = &[
[ -72.0 / 49.0, 187.0 / 98.0, -20.0 / 49.0, -3.0 / 98.0, 0.0, 0.0, 0.0],
[-187.0 / 366.0, 0.0, 69.0 / 122.0, -16.0 / 183.0, 2.0 / 61.0, 0.0, 0.0],
[ 20.0 / 123.0, -69.0 / 82.0, 0.0, 227.0 / 246.0, -12.0 / 41.0, 2.0 / 41.0, 0.0],
[ 3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0, 0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
];
for (i, bl) in Self::DISS_BLOCK.iter().enumerate() {
let b = idy
* (a[0] * bl[0]
+ a[1] * bl[1]
+ a[2] * bl[2]
+ a[3] * bl[3]
+ a[4] * bl[4]
+ a[5] * bl[5]
+ a[6] * bl[6]);
b.write_to_slice_unaligned(&mut fut[i * nx + j..]);
}
let mut a = a;
for i in Self::DISS_BLOCK.len()..ny - Self::DISS_BLOCK.len() {
// Push a onto circular buffer
a = [
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
SimdT::from_slice_unaligned(&prev[nx * (i + 3) + j..]),
];
let b = idy
* (a[0] * Self::DISS_DIAG[0]
+ a[1] * Self::DISS_DIAG[1]
+ a[2] * Self::DISS_DIAG[2]
+ a[3] * Self::DISS_DIAG[3]
+ a[4] * Self::DISS_DIAG[4]
+ a[5] * Self::DISS_DIAG[5]
+ a[6] * Self::DISS_DIAG[6]);
b.write_to_slice_unaligned(&mut fut[nx * i + j..]);
}
let a = [
SimdT::from_slice_unaligned(&prev[(ny - 1) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 2) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 3) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 4) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 5) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 6) * nx + j..]),
SimdT::from_slice_unaligned(&prev[(ny - 7) * nx + j..]),
#[rustfmt::skip]
const DISS_BLOCK: &'static [[f32; 7]; 4] = &[
[-3.0 / 49.0, 9.0 / 49.0, -9.0 / 49.0, 3.0 / 49.0, 0.0, 0.0, 0.0],
[ 3.0 / 61.0, -11.0 / 61.0, 15.0 / 61.0, -9.0 / 61.0, 2.0 / 61.0, 0.0, 0.0],
[-3.0 / 41.0, 15.0 / 41.0, -29.0 / 41.0, 27.0 / 41.0, -12.0 / 41.0, 2.0 / 41.0, 0.0],
[3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
];
for (i, bl) in Self::DISS_BLOCK.iter().enumerate() {
let b = idy
* (a[0] * bl[0]
+ a[1] * bl[1]
+ a[2] * bl[2]
+ a[3] * bl[3]
+ a[4] * bl[4]
+ a[5] * bl[5]
+ a[6] * bl[6]);
b.write_to_slice_unaligned(&mut fut[(ny - 1 - i) * nx + j..]);
}
}
}
#[rustfmt::skip]
const DISS_DIAG: &'static [f32; 7] = &[
1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
];
}
impl SbpOperator for Upwind4 {
@ -556,24 +428,27 @@ fn upwind4_test() {
impl UpwindOperator for Upwind4 {
fn dissxi(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
assert_eq!(prev.shape(), fut.shape());
assert!(prev.shape()[1] >= 2 * Self::DISS_BLOCK.len());
assert!(prev.shape()[1] >= 2 * Self::BLOCK.len());
match (prev.strides(), fut.strides()) {
([_, 1], [_, _]) => {
Self::diss_simd_row(prev, fut);
}
([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => {
Self::diss_simd_col(prev, fut);
}
([_, _], [_, _]) => {
// Fallback, work row by row
for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
Self::diss_1d(r0, r1)
Self::diss_1d(r0, r1);
}
}
_ => unreachable!("Should only be two elements in the strides vectors"),
}
}
fn disseta(prev: ArrayView2<f32>, mut fut: ArrayViewMut2<f32>) {
assert_eq!(prev.shape(), fut.shape());
assert!(prev.shape()[0] >= 2 * Self::DISS_BLOCK.len());
let nx = prev.shape()[1];
let ny = prev.shape()[0];
if nx >= SimdT::lanes() && nx % SimdT::lanes() == 0 {
if let (Some(p), Some(f)) = (prev.as_slice(), fut.as_slice_mut()) {
Self::disseta_simd(p, f, nx, ny);
return;
}
}
// diffeta = transpose then use diffxi
fn disseta(prev: ArrayView2<f32>, fut: ArrayViewMut2<f32>) {
// diffeta = transpose then use dissxi
Self::dissxi(prev.reversed_axes(), fut.reversed_axes());
}
}