reorder.f90 Source File


Source Code

module m_cuda_kernels_reorder
  use cudafor

  use m_common, only: dp
  use m_cuda_common, only: SZ

contains

  attributes(global) subroutine reorder_c2x(u_x, u_c, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_c
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k)

    call syncthreads()

    ! copy into output array from shared
    u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i)

  end subroutine reorder_c2x

  attributes(global) subroutine reorder_x2c(u_c, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_c
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz)

    call syncthreads()

    ! copy into output array from shared
    u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i)

  end subroutine reorder_x2c

  attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_y
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)

    call syncthreads()

    ! copy into output array from shared
    u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)

  end subroutine reorder_x2y

  attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_z
    real(dp), device, intent(in), dimension(:, :, :) :: u_x
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    ! Data access pattern for reordering between x and z is quite nice
    ! thus we don't need to use shared memory for this operation.
    do j = 1, nz
      u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
    end do

  end subroutine reorder_x2z

  attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

    call syncthreads()

    ! copy into output array from shared
    u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)

  end subroutine reorder_y2x

  attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_z
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nx, nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

    call syncthreads()

    ! copy into output array from shared
    u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)

  end subroutine reorder_y2z

  attributes(global) subroutine reorder_z2x(u_x, u_z, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    do j = 1, nz
      u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx)
    end do

  end subroutine reorder_z2x

  attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
    implicit none

    real(dp), device, intent(out), dimension(:, :, :) :: u_y
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nx, nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)

    call syncthreads()

    ! copy into output array from shared
    u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)

  end subroutine reorder_z2y

  attributes(global) subroutine sum_yintox(u_x, u_y, nz)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_y
    integer, value, intent(in) :: nz

    real(dp), shared :: tile(SZ, SZ)
    integer :: i, j, b_i, b_j, b_k

    i = threadIdx%x; j = threadIdx%y
    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

    ! copy into shared
    tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_k) + nz*(b_i - 1))

    call syncthreads()

    ! copy into output array from shared
    u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + (b_k)) = &
      u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + (b_k)) + tile(j, i)

  end subroutine sum_yintox

  attributes(global) subroutine sum_zintox(u_x, u_z, nz)
    implicit none

    ! Arguments
    real(dp), device, intent(inout), dimension(:, :, :) :: u_x
    real(dp), device, intent(in), dimension(:, :, :) :: u_z
    integer, value, intent(in) :: nz

    integer :: i, j, b_i, b_j, nx

    i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
    nx = gridDim%x

    do j = 1, nz
      u_x(i, b_i, j + (b_j - 1)*nz) = u_x(i, b_i, j + (b_j - 1)*nz) &
                                      + u_z(i, j, b_i + (b_j - 1)*nx)
    end do

  end subroutine sum_zintox

end module m_cuda_kernels_reorder