fieldops.f90 Source File


This file depends on

sourcefile~~fieldops.f90~~EfferentGraph sourcefile~fieldops.f90 fieldops.f90 sourcefile~common.f90 common.f90 sourcefile~fieldops.f90->sourcefile~common.f90 sourcefile~common.f90~3 common.f90 sourcefile~fieldops.f90->sourcefile~common.f90~3

Files dependent on this one

sourcefile~~fieldops.f90~~AfferentGraph sourcefile~fieldops.f90 fieldops.f90 sourcefile~backend.f90 backend.f90 sourcefile~backend.f90->sourcefile~fieldops.f90 sourcefile~xcompact.f90 xcompact.f90 sourcefile~xcompact.f90->sourcefile~backend.f90

Source Code

module m_cuda_kernels_fieldops
  use cudafor

  use m_common, only: dp
  use m_cuda_common, only: SZ

contains

  attributes(global) subroutine copy(n, dst, src)
    implicit none

    integer, value, intent(in) :: n
    real(dp), device, intent(out), dimension(:, :, :) :: dst
    real(dp), device, intent(in), dimension(:, :, :) :: src

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      dst(i, j, b) = src(i, j, b)
    end do

  end subroutine copy

  attributes(global) subroutine axpby(n, alpha, x, beta, y)
    implicit none

    integer, value, intent(in) :: n
    real(dp), value, intent(in) :: alpha, beta
    real(dp), device, intent(in), dimension(:, :, :) :: x
    real(dp), device, intent(inout), dimension(:, :, :) :: y

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      y(i, j, b) = alpha*x(i, j, b) + beta*y(i, j, b)
    end do

  end subroutine axpby

  attributes(global) subroutine pwmul(y, x, n)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: y
    real(dp), device, intent(in), dimension(:, :, :) :: x
    integer, value, intent(in) :: n

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      y(i, j, b) = y(i, j, b)*x(i, j, b)
    end do

  end subroutine pwmul

  attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e
    real(dp), device, intent(in), dimension(:, :, :) :: u
    integer, value, intent(in) :: n, n_halo

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n_halo
      u_send_s(i, j, b) = u(i, j, b)
      u_send_e(i, j, b) = u(i, n - n_halo + j, b)
    end do

  end subroutine buffer_copy

  attributes(global) subroutine field_scale(f, alpha, n)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: f
    real(dp), value, intent(in) :: alpha
    integer, value, intent(in) :: n

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      f(i, j, b) = alpha*f(i, j, b)
    end do

  end subroutine field_scale

  attributes(global) subroutine field_shift(f, const, n)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: f
    real(dp), value, intent(in) :: const
    integer, value, intent(in) :: n

    integer :: i, j, b

    i = threadIdx%x
    b = blockIdx%x

    do j = 1, n
      f(i, j, b) = f(i, j, b) + const
    end do

  end subroutine field_shift

  attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j)
    implicit none

    real(dp), device, intent(inout) :: s
    real(dp), device, intent(in), dimension(:, :, :) :: x, y
    integer, value, intent(in) :: n, n_i_pad, n_j

    real(dp) :: s_pncl !! pencil sum
    integer :: i, j, b, b_i, b_j, ierr

    i = threadIdx%x
    b_i = blockIdx%x
    b_j = blockIdx%y

    b = b_i + (b_j - 1)*n_i_pad
    s_pncl = 0._dp
    if (i + (b_j - 1)*blockDim%x <= n_j) then
      do j = 1, n
        s_pncl = s_pncl + x(i, j, b)*y(i, j, b)
      end do
    end if
    ierr = atomicadd(s, s_pncl)

  end subroutine scalar_product

  attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j)
    implicit none

    real(dp), device, intent(inout) :: max_f, sum_f
    real(dp), device, intent(in), dimension(:, :, :) :: f
    integer, value, intent(in) :: n, n_i_pad, n_j

    real(dp) :: max_pncl, sum_pncl, val
    integer :: i, j, b, b_i, b_j, ierr

    i = threadIdx%x
    b_i = blockIdx%x
    b_j = blockIdx%y

    b = b_i + (b_j - 1)*n_i_pad
    max_pncl = 0._dp
    sum_pncl = 0._dp
    if (i + (b_j - 1)*blockDim%x <= n_j) then
      do j = 1, n
        val = abs(f(i, j, b))
        sum_pncl = sum_pncl + val
        max_pncl = max(max_pncl, val)
      end do
    end if
    ierr = atomicadd(sum_f, sum_pncl)
    ierr = atomicmax(max_f, max_pncl)

  end subroutine field_max_sum

  attributes(global) subroutine field_set_y_face( &
    f, c_start, c_end, flow_rate_diff, nx, ny, nz)
  !! Set domain Y_FACE boundary values.
  !! c_start: Dirichlet value applied at the bottom face (j = 1)
  !! c_end:   Dirichlet value applied at the top face (j = ny)
    implicit none

    real(dp), device, intent(inout), dimension(:, :, :) :: f
    real(dp), value, intent(in) :: c_start, c_end, flow_rate_diff
    integer, value, intent(in) :: nx, ny, nz

    integer :: i, j, b, n_mod, b_end

    j = threadIdx%x + (blockIdx%x - 1)*blockDim%x ! from 1 to nx
    b = blockIdx%y ! from 1 to nz

    n_mod = mod(ny - 1, SZ) + 1
    b_end = b + (ny - 1)/SZ*nz

    if (j <= nx) then
      f(1, j, b) = c_start
      f(n_mod, j, b_end) = c_end
    end if

  end subroutine field_set_y_face

  attributes(global) subroutine field_set_x_face( &
    f, c_start, c_end, bc_start, bc_end, flow_rate_diff, nx, ny, nz)
  !! Set domain X_FACE boundary values.
  !! c_start: Dirichlet value applied at the left face (i = 1)
  !! c_end:   convective velocity Uc = uxmax * gdt / dx,
  !!          used as multiplier in the outflow scheme du/dt + Uc*du/dx = 0
    implicit none
    real(dp), device, intent(inout), dimension(:, :, :) :: f
    real(dp), value, intent(in) :: c_start, c_end, flow_rate_diff
    integer, value, intent(in) :: bc_start, bc_end
    integer, value, intent(in) :: nx, ny, nz
    integer :: i, b, n_mod, n_y_blocks, y_block, i_max

    i = threadIdx%x + (blockIdx%x - 1)*blockDim%x  ! 1..SZ
    b = blockIdx%y                                 ! 1..n_y_blocks*nz

    n_mod = mod(ny - 1, SZ) + 1
    n_y_blocks = (ny - 1)/SZ + 1

    ! Determine if this b is the last y-block (padding present)
    ! With b = (y_block - 1)*nz + z, y_block = (b - 1)/nz + 1
    y_block = (b - 1)/nz + 1
    if (y_block == n_y_blocks) then
      i_max = n_mod    ! last y-block: only i in [1, n_mod] are real
    else
      i_max = SZ       ! interior y-blocks: all i in [1, SZ] are real
    end if

    if (i <= i_max) then
      ! --- Left face (j = 1) ---
      select case (bc_start)
      case (1) ! BC_NEUMANN
        !this can be empty for now future TODO
      case (2) ! BC_DIRICHLET
        f(i, 1, b) = c_start
      end select

      ! --- Right face (j = nx) ---
      select case (bc_end)
      case (1) !BC_NEUMANN
        !this can be empty for now future TODO
      case (2) ! BC_DIRICHLET
        f(i, nx, b) = f(i, nx, b) &
                      - c_end*(f(i, nx, b) - f(i, nx - 1, b)) &
                      + flow_rate_diff
      end select
    end if
  end subroutine field_set_x_face

  attributes(global) subroutine volume_integral(s, f, n, n_i_pad, n_j)
    implicit none

    real(dp), device, intent(inout) :: s
    real(dp), device, intent(in), dimension(:, :, :) :: f
    integer, value, intent(in) :: n, n_i_pad, n_j

    real(dp) :: s_pncl !! pencil sum
    integer :: i, j, b, b_i, b_j, ierr

    i = threadIdx%x
    b_i = blockIdx%x
    b_j = blockIdx%y

    b = b_i + (b_j - 1)*n_i_pad
    s_pncl = 0._dp
    if (i + (b_j - 1)*blockDim%x <= n_j) then
      do j = 1, n
        s_pncl = s_pncl + f(i, j, b)
      end do
    end if
    ierr = atomicadd(s, s_pncl)

  end subroutine volume_integral

end module m_cuda_kernels_fieldops