use super::{SbpOperator, UpwindOperator};
use crate::Float;
use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Axis};

#[derive(Debug)]
pub struct Upwind4 {}

/// Simdtype used in diff_simd_col and diff_simd_row
#[cfg(feature = "f32")]
type SimdT = packed_simd::f32x8;
#[cfg(not(feature = "f32"))]
type SimdT = packed_simd::f64x8;

macro_rules! diff_simd_row_7_47 {
    ($name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => {
        #[inline(never)]
        fn $name(prev: ArrayView2<Float>, mut fut: ArrayViewMut2<Float>) {
            assert_eq!(prev.shape(), fut.shape());
            assert!(prev.len_of(Axis(1)) >= 2 * $BLOCK.len());
            assert!(prev.len() >= SimdT::lanes());
            // The prev and fut array must have contiguous last dimension
            assert_eq!(prev.strides()[1], 1);
            assert_eq!(fut.strides()[1], 1);

            let nx = prev.len_of(Axis(1));
            let dx = 1.0 / (nx - 1) as Float;
            let idx = 1.0 / dx;

            for j in 0..prev.len_of(Axis(0)) {
                use std::slice;
                let prev = unsafe { slice::from_raw_parts(prev.uget((j, 0)) as *const Float, nx) };
                let fut =
                    unsafe { slice::from_raw_parts_mut(fut.uget_mut((j, 0)) as *mut Float, nx) };

                let first_elems = unsafe { SimdT::from_slice_unaligned_unchecked(prev) };
                let block = {
                    let bl = $BLOCK;
                    [
                        SimdT::new(
                            bl[0][0], bl[0][1], bl[0][2], bl[0][3], bl[0][4], bl[0][5], bl[0][6],
                            0.0,
                        ),
                        SimdT::new(
                            bl[1][0], bl[1][1], bl[1][2], bl[1][3], bl[1][4], bl[1][5], bl[1][6],
                            0.0,
                        ),
                        SimdT::new(
                            bl[2][0], bl[2][1], bl[2][2], bl[2][3], bl[2][4], bl[2][5], bl[2][6],
                            0.0,
                        ),
                        SimdT::new(
                            bl[3][0], bl[3][1], bl[3][2], bl[3][3], bl[3][4], bl[3][5], bl[3][6],
                            0.0,
                        ),
                    ]
                };
                fut[0] = idx * (block[0] * first_elems).sum();
                fut[1] = idx * (block[1] * first_elems).sum();
                fut[2] = idx * (block[2] * first_elems).sum();
                fut[3] = idx * (block[3] * first_elems).sum();

                let diag = {
                    let diag = $DIAG;
                    SimdT::new(
                        diag[0], diag[1], diag[2], diag[3], diag[4], diag[5], diag[6], 0.0,
                    )
                };
                for i in 4..nx - 4 {
                    unsafe {
                        let prev = SimdT::from_slice_unaligned_unchecked(&prev[i - 3..]);
                        *fut.get_unchecked_mut(i) = idx * (prev * diag).sum();
                    }
                }

                let last_elems = unsafe { SimdT::from_slice_unaligned_unchecked(&prev[nx - 8..]) }
                    .shuffle1_dyn([7, 6, 5, 4, 3, 2, 1, 0].into());
                if $symmetric {
                    fut[nx - 4] = idx * (block[3] * last_elems).sum();
                    fut[nx - 3] = idx * (block[2] * last_elems).sum();
                    fut[nx - 2] = idx * (block[1] * last_elems).sum();
                    fut[nx - 1] = idx * (block[0] * last_elems).sum();
                } else {
                    fut[nx - 4] = -idx * (block[3] * last_elems).sum();
                    fut[nx - 3] = -idx * (block[2] * last_elems).sum();
                    fut[nx - 2] = -idx * (block[1] * last_elems).sum();
                    fut[nx - 1] = -idx * (block[0] * last_elems).sum();
                }
            }
        }
    };
}

diff_simd_row_7_47!(diff_simd_row, Upwind4::BLOCK, Upwind4::DIAG, false);
diff_simd_row_7_47!(diss_simd_row, Upwind4::DISS_BLOCK, Upwind4::DISS_DIAG, true);

macro_rules! diff_simd_col_7_47 {
    ($name: ident, $BLOCK: expr, $DIAG: expr, $symmetric: expr) => {
        #[inline(never)]
        fn $name(prev: ArrayView2<Float>, mut fut: ArrayViewMut2<Float>) {
            use std::slice;
            assert_eq!(prev.shape(), fut.shape());
            assert_eq!(prev.stride_of(Axis(0)), 1);
            assert_eq!(fut.stride_of(Axis(0)), 1);
            let ny = prev.len_of(Axis(0));
            let nx = prev.len_of(Axis(1));
            assert!(nx >= 2 * $BLOCK.len());
            assert!(ny >= SimdT::lanes());
            assert!(ny % SimdT::lanes() == 0);

            let dx = 1.0 / (nx - 1) as Float;
            let idx = 1.0 / dx;

            for j in (0..ny).step_by(SimdT::lanes()) {
                let a = unsafe {
                    [
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 0)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 1)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 2)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 3)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 4)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 5)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, 6)) as *const Float,
                            SimdT::lanes(),
                        )),
                    ]
                };

                for (i, bl) in $BLOCK.iter().enumerate() {
                    let b = idx
                        * (a[0] * bl[0]
                            + a[1] * bl[1]
                            + a[2] * bl[2]
                            + a[3] * bl[3]
                            + a[4] * bl[4]
                            + a[5] * bl[5]
                            + a[6] * bl[6]);
                    unsafe {
                        b.write_to_slice_unaligned(slice::from_raw_parts_mut(
                            fut.uget_mut((j, i)) as *mut Float,
                            SimdT::lanes(),
                        ));
                    }
                }

                let mut a = a;
                for i in $BLOCK.len()..nx - $BLOCK.len() {
                    // Push a onto circular buffer
                    a = [a[1], a[2], a[3], a[4], a[5], a[6], unsafe {
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, i + 3)) as *const Float,
                            SimdT::lanes(),
                        ))
                    }];
                    let b = idx
                        * (a[0] * $DIAG[0]
                            + a[1] * $DIAG[1]
                            + a[2] * $DIAG[2]
                            + a[3] * $DIAG[3]
                            + a[4] * $DIAG[4]
                            + a[5] * $DIAG[5]
                            + a[6] * $DIAG[6]);
                    unsafe {
                        b.write_to_slice_unaligned(slice::from_raw_parts_mut(
                            fut.uget_mut((j, i)) as *mut Float,
                            SimdT::lanes(),
                        ));
                    }
                }

                let a = unsafe {
                    [
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 1)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 2)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 3)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 4)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 5)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 6)) as *const Float,
                            SimdT::lanes(),
                        )),
                        SimdT::from_slice_unaligned(slice::from_raw_parts(
                            prev.uget((j, nx - 7)) as *const Float,
                            SimdT::lanes(),
                        )),
                    ]
                };

                for (i, bl) in $BLOCK.iter().enumerate() {
                    let idx = if $symmetric { idx } else { -idx };
                    let b = idx
                        * (a[0] * bl[0]
                            + a[1] * bl[1]
                            + a[2] * bl[2]
                            + a[3] * bl[3]
                            + a[4] * bl[4]
                            + a[5] * bl[5]
                            + a[6] * bl[6]);
                    unsafe {
                        b.write_to_slice_unaligned(slice::from_raw_parts_mut(
                            fut.uget_mut((j, nx - 1 - i)) as *mut Float,
                            SimdT::lanes(),
                        ));
                    }
                }
            }
        }
    };
}

diff_simd_col_7_47!(diff_simd_col, Upwind4::BLOCK, Upwind4::DIAG, false);
diff_simd_col_7_47!(diss_simd_col, Upwind4::DISS_BLOCK, Upwind4::DISS_DIAG, true);

impl Upwind4 {
    #[rustfmt::skip]
    const HBLOCK: &'static [Float] = &[
        49.0 / 144.0, 61.0 / 48.0, 41.0 / 48.0, 149.0 / 144.0
    ];
    #[rustfmt::skip]
    const DIAG: &'static [Float] = &[
        -1.0 / 24.0, 1.0 / 4.0, -7.0 / 8.0, 0.0, 7.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
    ];
    #[rustfmt::skip]
    const BLOCK: &'static [[Float; 7]] = &[
        [  -72.0 / 49.0, 187.0 / 98.0,   -20.0 / 49.0,   -3.0 / 98.0,           0.0,           0.0,         0.0],
        [-187.0 / 366.0,          0.0,   69.0 / 122.0, -16.0 / 183.0,    2.0 / 61.0,           0.0,         0.0],
        [  20.0 / 123.0, -69.0 / 82.0,            0.0, 227.0 / 246.0,  -12.0 / 41.0,    2.0 / 41.0,         0.0],
        [   3.0 / 298.0, 16.0 / 149.0, -227.0 / 298.0,           0.0, 126.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
    ];

    #[rustfmt::skip]
    const DISS_BLOCK: &'static [[Float; 7]; 4] = &[
        [-3.0 / 49.0,    9.0 / 49.0,  -9.0 / 49.0,     3.0 / 49.0,          0.0,           0.0,         0.0],
        [ 3.0 / 61.0,  -11.0 / 61.0,  15.0 / 61.0,    -9.0 / 61.0,   2.0 / 61.0,           0.0,         0.0],
        [-3.0 / 41.0,   15.0 / 41.0, -29.0 / 41.0,    27.0 / 41.0, -12.0 / 41.0,    2.0 / 41.0,         0.0],
        [3.0 / 149.0, -27.0 / 149.0, 81.0 / 149.0, -117.0 / 149.0, 90.0 / 149.0, -36.0 / 149.0, 6.0 / 149.0],
    ];

    #[rustfmt::skip]
    const DISS_DIAG: &'static [Float; 7] = &[
        1.0 / 24.0, -1.0 / 4.0, 5.0 / 8.0, -5.0 / 6.0, 5.0 / 8.0, -1.0 / 4.0, 1.0 / 24.0
    ];
}

impl SbpOperator for Upwind4 {
    fn diff1d(prev: ArrayView1<Float>, fut: ArrayViewMut1<Float>) {
        super::diff_op_1d(
            ndarray::arr2(Self::BLOCK).view(),
            ndarray::arr1(Self::DIAG).view(),
            false,
            false,
            prev,
            fut,
        )
    }
    fn diffxi(prev: ArrayView2<Float>, mut fut: ArrayViewMut2<Float>) {
        assert_eq!(prev.shape(), fut.shape());
        assert!(prev.shape()[1] >= 2 * Self::BLOCK.len());

        match (prev.strides(), fut.strides()) {
            ([_, 1], [_, 1]) => {
                diff_simd_row(prev, fut);
            }
            ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => {
                diff_simd_col(prev, fut);
            }
            ([_, _], [_, _]) => {
                // Fallback, work row by row
                for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
                    Self::diff1d(r0, r1);
                }
            }
            _ => unreachable!("Should only be two elements in the strides vectors"),
        }
    }

    fn h() -> &'static [Float] {
        Self::HBLOCK
    }
}

#[test]
fn upwind4_test() {
    use ndarray::prelude::*;
    let nx = 20;
    let dx = 1.0 / (nx - 1) as Float;
    let mut source: ndarray::Array1<Float> = ndarray::Array1::zeros(nx);
    let mut res = ndarray::Array1::zeros(nx);
    let mut target = ndarray::Array1::zeros(nx);

    for i in 0..nx {
        source[i] = i as Float * dx;
        target[i] = 1.0;
    }
    res.fill(0.0);
    Upwind4::diff1d(source.view(), res.view_mut());
    approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
    {
        let source = source.to_owned().insert_axis(ndarray::Axis(0));
        let mut res = res.to_owned().insert_axis(ndarray::Axis(0));
        let target = target.to_owned().insert_axis(ndarray::Axis(0));
        res.fill(0.0);
        Upwind4::diffxi(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);
    }

    {
        let source = Array2::from_shape_fn((nx, 8), |(i, _)| source[i]);
        let target = Array2::from_shape_fn((nx, 8), |(i, _)| target[i]);
        let mut res = Array2::zeros((nx, 8));
        res.fill(0.0);
        Upwind4::diffeta(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res.to_owned(), &target.to_owned(), epsilon = 1e-2);
    }

    for i in 0..nx {
        let x = i as Float * dx;
        source[i] = x * x;
        target[i] = 2.0 * x;
    }
    res.fill(0.0);
    Upwind4::diff1d(source.view(), res.view_mut());
    approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-4);
    {
        let source = source.to_owned().insert_axis(ndarray::Axis(0));
        let mut res = res.to_owned().insert_axis(ndarray::Axis(0));
        let target = target.to_owned().insert_axis(ndarray::Axis(0));
        res.fill(0.0);
        Upwind4::diffxi(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);
    }

    {
        let source = Array2::from_shape_fn((nx, 8), |(i, _)| source[i]);
        let target = Array2::from_shape_fn((nx, 8), |(i, _)| target[i]);
        let mut res = Array2::zeros((nx, 8));
        res.fill(0.0);
        Upwind4::diffeta(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res.to_owned(), &target.to_owned(), epsilon = 1e-2);
    }

    for i in 0..nx {
        let x = i as Float * dx;
        source[i] = x * x * x;
        target[i] = 3.0 * x * x;
    }
    res.fill(0.0);
    Upwind4::diff1d(source.view(), res.view_mut());
    approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);

    {
        let source = source.to_owned().insert_axis(ndarray::Axis(0));
        let mut res = res.to_owned().insert_axis(ndarray::Axis(0));
        let target = target.to_owned().insert_axis(ndarray::Axis(0));
        res.fill(0.0);
        Upwind4::diffxi(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res, &target, epsilon = 1e-2);
    }

    {
        let source = Array2::from_shape_fn((nx, 8), |(i, _)| source[i]);
        let target = Array2::from_shape_fn((nx, 8), |(i, _)| target[i]);
        let mut res = Array2::zeros((nx, 8));
        res.fill(0.0);
        Upwind4::diffeta(source.view(), res.view_mut());
        approx::assert_abs_diff_eq!(&res.to_owned(), &target.to_owned(), epsilon = 1e-2);
    }
}

impl UpwindOperator for Upwind4 {
    fn diss1d(prev: ArrayView1<Float>, fut: ArrayViewMut1<Float>) {
        super::diff_op_1d(
            ndarray::arr2(Self::DISS_BLOCK).view(),
            ndarray::arr1(Self::DISS_DIAG).view(),
            true,
            false,
            prev,
            fut,
        )
    }
    fn dissxi(prev: ArrayView2<Float>, mut fut: ArrayViewMut2<Float>) {
        assert_eq!(prev.shape(), fut.shape());
        assert!(prev.shape()[1] >= 2 * Self::BLOCK.len());

        match (prev.strides(), fut.strides()) {
            ([_, 1], [_, 1]) => {
                diss_simd_row(prev, fut);
            }
            ([1, _], [1, _]) if prev.len_of(Axis(0)) % SimdT::lanes() == 0 => {
                diss_simd_col(prev, fut);
            }
            ([_, _], [_, _]) => {
                // Fallback, work row by row
                for (r0, r1) in prev.outer_iter().zip(fut.outer_iter_mut()) {
                    Self::diss1d(r0, r1);
                }
            }
            _ => unreachable!("Should only be two elements in the strides vectors"),
        }
    }
}

#[test]
fn upwind4_test2() {
    use super::testing::*;
    use super::*;
    let nx = 32;
    let ny = 16;

    check_operator_on::<Upwind4, _, _, _>(
        (ny, nx),
        |x, y| x + 2.0 * y,
        |_, _| 1.0,
        |_, _| 2.0,
        1e-4,
    );
    check_operator_on::<Upwind4, _, _, _>(
        (ny, nx),
        |x, y| x * x + 2.0 * x * y + 3.0 * y * y,
        |x, y| 2.0 * x + 2.0 * y,
        |x, y| 2.0 * x + 6.0 * y,
        1e-3,
    );
    check_operator_on::<Upwind4, _, _, _>(
        (ny, nx),
        |x, y| x.powi(3) + 2.0 * x.powi(2) * y + 3.0 * x * y.powi(2) + 4.0 * y.powi(3),
        |x, y| 3.0 * x.powi(2) + 4.0 * x * y + 3.0 * y.powi(2),
        |x, y| 2.0 * x.powi(2) + 6.0 * x * y + 12.0 * y.powi(2),
        1e-1,
    );
    check_operator_on::<Upwind4, _, _, _>(
        (32, 32),
        |x, y| x.powi(3) + 2.0 * x.powi(2) * y + 3.0 * x * y.powi(2) + 4.0 * y.powi(3),
        |x, y| 3.0 * x.powi(2) + 4.0 * x * y + 3.0 * y.powi(2),
        |x, y| 2.0 * x.powi(2) + 6.0 * x * y + 12.0 * y.powi(2),
        1e-1,
    );
}