simdify blocks in diff_op_col

This commit is contained in:
Magnus Ulimoen 2020-05-01 19:08:55 +02:00
parent cbf6042055
commit 52bd4f3f8f
1 changed file with 92 additions and 33 deletions

View File

@ -293,42 +293,68 @@ fn diff_op_col_simd(
// How many elements that can be simdified // How many elements that can be simdified
let simdified = SimdT::lanes() * (ny / SimdT::lanes()); let simdified = SimdT::lanes() * (ny / SimdT::lanes());
let half_diag_width = (diag.len() - 1) / 2;
assert!(half_diag_width <= block.len());
let fut_base_ptr = fut.as_mut_ptr();
let fut_stride = fut.strides()[1];
let fut_ptr = |j, i| {
debug_assert!(j < ny && i < nx);
unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) }
};
let prev_base_ptr = prev.as_ptr();
let prev_stride = prev.strides()[1];
let prev_ptr = |j, i| {
debug_assert!(j < ny && i < nx);
unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) }
};
// Not algorithmically necessary, but gives a performance increase
assert_eq!(fut_stride, prev_stride);
// First block // First block
{ {
for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1))) { for (ifut, &bl) in block.iter().enumerate() {
fut.fill(0.0); for j in (0..simdified).step_by(SimdT::lanes()) {
debug_assert_eq!(fut.len(), prev.shape()[0]); let index_to_simd = |i| unsafe {
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1))) { // j never moves past end of slice due to step_by and
debug_assert_eq!(prev.len(), fut.len()); // rounding down
fut.scaled_add(idx * bl, &prev); SimdT::from_slice_unaligned(std::slice::from_raw_parts(
prev_ptr(j, i),
SimdT::lanes(),
))
};
let mut f = SimdT::splat(0.0);
for (iprev, &bl) in bl.iter().enumerate() {
f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f);
}
f = f * idx;
unsafe {
f.write_to_slice_unaligned(std::slice::from_raw_parts_mut(
fut_ptr(j, ifut),
SimdT::lanes(),
));
}
}
for j in simdified..ny {
unsafe {
let mut f = 0.0;
for (iprev, bl) in bl.iter().enumerate() {
f += bl * *prev_ptr(j, iprev);
}
*fut_ptr(j, ifut) = f * idx;
}
} }
} }
} }
// Diagonal elements // Diagonal elements
{ {
let half_diag_width = (diag.len() - 1) / 2;
assert!(half_diag_width <= block.len());
let fut_base_ptr = fut.as_mut_ptr();
let fut_stride = fut.strides()[1];
let fut_ptr = |j, i| {
debug_assert!(j < ny && i < nx);
unsafe { fut_base_ptr.offset(fut_stride * i as isize + j as isize) }
};
let prev_base_ptr = prev.as_ptr();
let prev_stride = prev.strides()[1];
let prev_ptr = |j, i| {
debug_assert!(j < ny && i < nx);
unsafe { prev_base_ptr.offset(prev_stride * i as isize + j as isize) }
};
assert_eq!(fut_stride, prev_stride);
for ifut in block.len()..nx - block.len() { for ifut in block.len()..nx - block.len() {
for j in (0..simdified).step_by(SimdT::lanes()) { for j in (0..simdified).step_by(SimdT::lanes()) {
let index_to_simd = |(j, i)| unsafe { let index_to_simd = |i| unsafe {
// j never moves past end of slice due to step_by and // j never moves past end of slice due to step_by and
// rounding down // rounding down
SimdT::from_slice_unaligned(std::slice::from_raw_parts( SimdT::from_slice_unaligned(std::slice::from_raw_parts(
@ -339,7 +365,7 @@ fn diff_op_col_simd(
let mut f = SimdT::splat(0.0); let mut f = SimdT::splat(0.0);
for (id, &d) in diag.iter().enumerate() { for (id, &d) in diag.iter().enumerate() {
let offset = ifut - half_diag_width + id; let offset = ifut - half_diag_width + id;
f = index_to_simd((j, offset)).mul_adde(SimdT::splat(d), f); f = index_to_simd(offset).mul_adde(SimdT::splat(d), f);
} }
f = f * idx; f = f * idx;
unsafe { unsafe {
@ -367,13 +393,46 @@ fn diff_op_col_simd(
// End block // End block
{ {
for (bl, mut fut) in block.iter().zip(fut.axis_iter_mut(ndarray::Axis(1)).rev()) { // Get blocks and corresponding offsets
fut.fill(0.0); // (rev to iterate in ifut increasing order)
for (&bl, prev) in bl.iter().zip(prev.axis_iter(ndarray::Axis(1)).rev()) { for (bl, ifut) in block.iter().zip((0..nx).rev()) {
if symmetry == Symmetry::Symmetric { for j in (0..simdified).step_by(SimdT::lanes()) {
fut.scaled_add(idx * bl, &prev); let index_to_simd = |i| unsafe {
// j never moves past end of slice due to step_by and
// rounding down
SimdT::from_slice_unaligned(std::slice::from_raw_parts(
prev_ptr(j, i),
SimdT::lanes(),
))
};
let mut f = SimdT::splat(0.0);
for (&bl, iprev) in bl.iter().zip((0..nx).rev()) {
f = index_to_simd(iprev).mul_adde(SimdT::splat(bl), f);
}
f = if symmetry == Symmetry::Symmetric {
f * idx
} else { } else {
fut.scaled_add(-idx * bl, &prev); -f * idx
};
unsafe {
f.write_to_slice_unaligned(std::slice::from_raw_parts_mut(
fut_ptr(j, ifut),
SimdT::lanes(),
));
}
}
for j in simdified..ny {
unsafe {
let mut f = 0.0;
for (&bl, iprev) in bl.iter().zip((0..nx).rev()).rev() {
f += bl * *prev_ptr(j, iprev);
}
*fut_ptr(j, ifut) = if symmetry == Symmetry::Symmetric {
f * idx
} else {
-f * idx
};
} }
} }
} }