reorder.f90 Source File


Source Code

module m_cuda_kernels_reorder
  use cudafor

  use m_common, only: dp
  use m_cuda_common, only: SZ

contains

  attributes(global) subroutine reorder_c2x(u_x, u_c, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_c
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k)

    call syncthreads()

    ! copy into output array from shared
    u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i)

  end subroutine reorder_c2x

  attributes(global) subroutine reorder_x2c(u_c, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_c
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz)

    call syncthreads()

    ! copy into output array from shared
    u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i)

  end subroutine reorder_x2c

  attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_y
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)

    call syncthreads()

    ! copy into output array from shared
    u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)

  end subroutine reorder_x2y

  attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_z
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    ! Data access pattern for reordering between x and z is quite nice
    ! thus we don't need to use shared memory for this operation.
    do j = 1, nz
      u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
    end do

  end subroutine reorder_x2z

  attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

    call syncthreads()

    ! copy into output array from shared
    u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)

  end subroutine reorder_y2x

  attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_z
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nx, nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

    call syncthreads()

    ! copy into output array from shared
    u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)

  end subroutine reorder_y2z

  attributes(global) subroutine reorder_z2x(u_x, u_z, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    do j = 1, nz
      u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx)
    end do

  end subroutine reorder_z2x

  attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_y
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nx, nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)

    call syncthreads()

    ! copy into output array from shared
    u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)

  end subroutine reorder_z2y

  attributes(global) subroutine sum_yintox(u_x, u_y, nz)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_k) + nz*(b_i - 1))

    call syncthreads()

    ! copy into output array from shared
    u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + (b_k)) = &
      u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + (b_k)) + tile(j, i)

  end subroutine sum_yintox

  attributes(global) subroutine sum_zintox(u_x, u_z, nz)
    implicit none

    ! Arguments
    real(dp), device, intent(inout), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    do j = 1, nz
      u_x(i, b_i, j + (b_j - 1)*nz) = u_x(i, b_i, j + (b_j - 1)*nz) &
                                      + u_z(i, j, b_i + (b_j - 1)*nx)
    end do

  end subroutine sum_zintox

  attributes(global) subroutine axpby(n, alpha, x, beta, y)
    implicit none

    integer, value, intent(in) :: n
    real(dp), value, intent(in) :: alpha, beta
    real(dp), device, intent(in), dimension(:, :, :) :: x
    real(dp), device, intent(inout), dimension(:, :, :) :: y

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      y(i, j, b) = alpha*x(i, j, b) + beta*y(i, j, b)
    end do

  end subroutine axpby

  attributes(global) subroutine scalar_product(s, x, y, n)
    implicit none

    real(dp), device, intent(inout) :: s
    real(dp), device, intent(in), dimension(:, :, :) :: x, y
    integer, value, intent(in) :: n

    real(dp) :: s_pncl !! pencil sum
    integer :: i, j, b, ierr

    i = threadIdx%x
    b = blockIdx%x

    s_pncl = 0._dp
    do j = 1, n
      s_pncl = s_pncl + x(i, j, b)*y(i, j, b)
    end do
    ierr = atomicadd(s, s_pncl)

  end subroutine scalar_product

  attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e
    real(dp), device, intent(in), dimension(:, :, :) :: u
    integer, value, intent(in) :: n, n_halo

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n_halo
      u_send_s(i, j, b) = u(i, j, b)
      u_send_e(i, j, b) = u(i, n - n_halo + j, b)
    end do

  end subroutine buffer_copy

end module m_cuda_kernels_reorder