module m_cuda_kernels_thom use cudafor use m_common, only: dp implicit none contains attributes(global) subroutine der_univ_thom( & du, u, coeffs_s, coeffs_e, coeffs, n, thom_f, thom_s, thom_w & ) implicit none real(dp), device, intent(out), dimension(:, :, :) :: du real(dp), device, intent(in), dimension(:, :, :) :: u real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e real(dp), device, intent(in), dimension(:) :: coeffs integer, value, intent(in) :: n real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w integer :: i, j, b real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du i = threadIdx%x b = blockIdx%x ! store bulk coeffs in the registers c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) c_j = coeffs(5) c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) du(i, 1, b) = coeffs_s(5, 1)*u(i, 1, b) & + coeffs_s(6, 1)*u(i, 2, b) & + coeffs_s(7, 1)*u(i, 3, b) & + coeffs_s(8, 1)*u(i, 4, b) & + coeffs_s(9, 1)*u(i, 5, b) du(i, 1, b) = du(i, 1, b) du(i, 2, b) = coeffs_s(4, 2)*u(i, 1, b) & + coeffs_s(5, 2)*u(i, 2, b) & + coeffs_s(6, 2)*u(i, 3, b) & + coeffs_s(7, 2)*u(i, 4, b) & + coeffs_s(8, 2)*u(i, 5, b) & + coeffs_s(9, 2)*u(i, 6, b) du(i, 2, b) = du(i, 2, b) - du(i, 1, b)*thom_s(2) du(i, 3, b) = coeffs_s(3, 3)*u(i, 1, b) & + coeffs_s(4, 3)*u(i, 2, b) & + coeffs_s(5, 3)*u(i, 3, b) & + coeffs_s(6, 3)*u(i, 4, b) & + coeffs_s(7, 3)*u(i, 5, b) & + coeffs_s(8, 3)*u(i, 6, b) & + coeffs_s(9, 3)*u(i, 7, b) du(i, 3, b) = du(i, 3, b) - du(i, 2, b)*thom_s(3) du(i, 4, b) = coeffs_s(2, 4)*u(i, 1, b) & + coeffs_s(3, 4)*u(i, 2, b) & + coeffs_s(4, 4)*u(i, 3, b) & + coeffs_s(5, 4)*u(i, 4, b) & + coeffs_s(6, 4)*u(i, 5, b) & + coeffs_s(7, 4)*u(i, 6, b) & + coeffs_s(8, 4)*u(i, 7, b) & + coeffs_s(9, 4)*u(i, 8, b) du(i, 4, b) = du(i, 4, b) - du(i, 3, b)*thom_s(4) do j = 5, n - 4 temp_du = c_m4*u(i, j - 4, b) + c_m3*u(i, j - 3, b) & + c_m2*u(i, j - 2, b) + c_m1*u(i, j - 1, b) & + c_j*u(i, j, b) & + c_p1*u(i, j + 1, b) + c_p2*u(i, j + 2, b) & + c_p3*u(i, j + 3, b) + c_p4*u(i, j + 4, b) du(i, j, b) = temp_du - du(i, j - 1, b)*thom_s(j) end do j = n - 3 du(i, j, b) = coeffs_e(1, 1)*u(i, j - 4, b) & + coeffs_e(2, 1)*u(i, j - 3, b) & + coeffs_e(3, 1)*u(i, j - 2, b) & + coeffs_e(4, 1)*u(i, j - 1, b) & + coeffs_e(5, 1)*u(i, j, b) & + coeffs_e(6, 1)*u(i, j + 1, b) & + coeffs_e(7, 1)*u(i, j + 2, b) & + coeffs_e(8, 1)*u(i, j + 3, b) du(i, j, b) = du(i, j, b) - du(i, j - 1, b)*thom_s(j) j = n - 2 du(i, j, b) = coeffs_e(1, 2)*u(i, j - 4, b) & + coeffs_e(2, 2)*u(i, j - 3, b) & + coeffs_e(3, 2)*u(i, j - 2, b) & + coeffs_e(4, 2)*u(i, j - 1, b) & + coeffs_e(5, 2)*u(i, j, b) & + coeffs_e(6, 2)*u(i, j + 1, b) & + coeffs_e(7, 2)*u(i, j + 2, b) du(i, j, b) = du(i, j, b) - du(i, j - 1, b)*thom_s(j) j = n - 1 du(i, j, b) = coeffs_e(1, 3)*u(i, j - 4, b) & + coeffs_e(2, 3)*u(i, j - 3, b) & + coeffs_e(3, 3)*u(i, j - 2, b) & + coeffs_e(4, 3)*u(i, j - 1, b) & + coeffs_e(5, 3)*u(i, j, b) & + coeffs_e(6, 3)*u(i, j + 1, b) du(i, j, b) = du(i, j, b) - du(i, j - 1, b)*thom_s(j) j = n du(i, j, b) = coeffs_e(1, 4)*u(i, j - 4, b) & + coeffs_e(2, 4)*u(i, j - 3, b) & + coeffs_e(3, 4)*u(i, j - 2, b) & + coeffs_e(4, 4)*u(i, j - 1, b) & + coeffs_e(5, 4)*u(i, j, b) du(i, j, b) = du(i, j, b) - du(i, j - 1, b)*thom_s(j) ! Backward pass of the Thomas algorithm du(i, n, b) = du(i, n, b)*thom_w(n) do j = n - 1, 1, -1 du(i, j, b) = (du(i, j, b) - thom_f(j)*du(i, j + 1, b))*thom_w(j) end do end subroutine der_univ_thom attributes(global) subroutine der_univ_thom_per( & du, u, coeffs, n, alpha, thom_f, thom_s, thom_w, thom_p & ) implicit none real(dp), device, intent(out), dimension(:, :, :) :: du real(dp), device, intent(in), dimension(:, :, :) :: u real(dp), device, intent(in), dimension(:) :: coeffs integer, value, intent(in) :: n real(dp), value, intent(in) :: alpha real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, & thom_p integer :: i, j, b integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 real(dp) :: temp_du, ss i = threadIdx%x b = blockIdx%x ! store bulk coeffs in the registers c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) c_j = coeffs(5) c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) do j = 1, n jm4 = modulo(j - 5, n) + 1 jm3 = modulo(j - 4, n) + 1 jm2 = modulo(j - 3, n) + 1 jm1 = modulo(j - 2, n) + 1 jp1 = modulo(j - n, n) + 1 jp2 = modulo(j - n + 1, n) + 1 jp3 = modulo(j - n + 2, n) + 1 jp4 = modulo(j - n + 3, n) + 1 temp_du = c_m4*u(i, jm4, b) + c_m3*u(i, jm3, b) & + c_m2*u(i, jm2, b) + c_m1*u(i, jm1, b) & + c_j*u(i, j, b) & + c_p1*u(i, jp1, b) + c_p2*u(i, jp2, b) & + c_p3*u(i, jp3, b) + c_p4*u(i, jp4, b) du(i, j, b) = temp_du - du(i, jm1, b)*thom_s(j) end do ! Backward pass of the Thomas algorithm du(i, n, b) = du(i, n, b)*thom_w(n) do j = n - 1, 1, -1 du(i, j, b) = (du(i, j, b) - thom_f(j)*du(i, j + 1, b))*thom_w(j) end do ! Periodic final pass ss = (du(i, 1, b) - alpha*du(i, n, b)) & /(1._dp + thom_p(1) - alpha*thom_p(n)) do j = 1, n du(i, j, b) = du(i, j, b) - ss*thom_p(j) end do end subroutine der_univ_thom_per end module m_cuda_kernels_thom