Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/General/memory.f90
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ subroutine alloc_tot()
gphiv = huge(gphiv(1,1))
allocate(gphif(jpj,jpi))
gphif = huge(gphif(1,1))
allocate(e1t(jpj,jpi))
allocate(e1t(jpj,jpi))
e1t = huge(e1t(1,1))
allocate(e1u(jpj,jpi))
e1u = huge(e1u(1,1))
Expand Down Expand Up @@ -790,10 +790,13 @@ subroutine alloc_tot()
allocate(DAY_LENGTH(jpj,jpi))
DAY_LENGTH = huge(DAY_LENGTH(1,1))
forcing_phys_initialized = .false.

!$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt)

#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif

END subroutine alloc_tot


Expand All @@ -802,6 +805,8 @@ subroutine clean_memory()

! myalloc (memory.f90)

!$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt)

#ifdef key_mpp

!$acc exit data delete(te_send, tw_send, tn_send, ts_send) finalize
Expand Down
6 changes: 6 additions & 0 deletions src/General/step.f90
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ SUBROUTINE trcstp
! with surface boundary condition
! with IMPLICIT vertical diffusion

! XXX: to be removed
use DIA_mem, only: diaflx
use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt

IMPLICIT NONE
integer jn,jk,ji,jj
trcstpparttime = MPI_WTIME() ! cronometer-start
Expand Down Expand Up @@ -296,7 +300,9 @@ SUBROUTINE trcstp

IF (lbfm) CALL trcsms

!$acc update device(e1t,diaflx,e3t_back,e2t,trb,tmask,e3t,tra,avt,e3w) if (lzdf)
IF (lzdf) CALL trczdf ! tracers: vertical diffusion
!$acc update host(diaflx,tra) if (lzdf)

IF (lsnu) CALL snutel

Expand Down
2 changes: 2 additions & 0 deletions src/IO/DIA_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ SUBROUTINE alloc_DIA_local_flx()
INDflxDUMP = huge(INDflxDUMP(1))
allocate(diaflx (7, Fsize, jptra ))
diaflx = 0
!$acc enter data create(diaflx)
END SUBROUTINE alloc_DIA_local_flx


Expand Down Expand Up @@ -95,6 +96,7 @@ subroutine clean_memory_dia()

if (allocated(diaflx)) then
deallocate(diaflx)
!$acc exit data delete(diaflx)
endif

if (allocated(INDflxDUMPZERO)) then
Expand Down
29 changes: 28 additions & 1 deletion src/PHYS/ZDF_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ subroutine myalloc_ZDF()
jarr_zdf = huge(jarr_zdf(1,1))
allocate(jarr_zdf_flx(jpi*jpj,jpk))
jarr_zdf_flx = huge(jarr_zdf_flx(1,1))
!$acc enter data create(jarr_zdf,jarr_zdf_flx)
!$acc update device(jarr_zdf,jarr_zdf_flx)
#ifndef _OPENACC
allocate(zwd(jpk, ntids))
zwd = huge(zwd(1,1))
allocate(zws(jpk, ntids))
Expand All @@ -60,14 +63,36 @@ subroutine myalloc_ZDF()
zwz = huge(zwz(1,1))
allocate(zwt(jpk, ntids))
zwt = huge(zwt(1,1))
#endif

#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif


END subroutine myalloc_ZDF


#ifdef _OPENACC
subroutine myalloc_ZDF_gpu()
allocate(zwd(jpk, dimen_jvzdf))
zwd = huge(zwd(1,1))
allocate(zws(jpk, dimen_jvzdf))
zws = huge(zws(1,1))
allocate(zwi(jpk, dimen_jvzdf))
zwi = huge(zwi(1,1))
allocate(zwx(jpk, dimen_jvzdf))
zwx = huge(zwx(1,1))
allocate(zwy(jpk, dimen_jvzdf))
zwy = huge(zwy(1,1))
allocate(zwz(jpk, dimen_jvzdf))
zwz = huge(zwz(1,1))
allocate(zwt(jpk, dimen_jvzdf))
zwt = huge(zwt(1,1))

!$acc enter data create(zwd,zwi,zwx,zws,zwz,zwy,zwt)
!$acc update device(zwd,zwi,zwx,zws,zwz,zwy,zwt)
END subroutine myalloc_ZDF_gpu
#endif
Comment on lines +75 to +95

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We create a new subroutine here that is called once in trczdf after dimen_jvzdf value is known

We could probably do the same for the CPU version to avoid duplicates, also the memory counter might needs to be adapted



subroutine clean_memory_zdf()
Expand All @@ -82,6 +107,8 @@ subroutine clean_memory_zdf()
deallocate(zwz)
deallocate(zwt)

!$acc exit data delete(jarr_zdf,jarr_zdf_flx,zwd,zwi,zwx,zws,zwz,zwy,zwt)

end subroutine clean_memory_zdf


Expand Down
8 changes: 1 addition & 7 deletions src/PHYS/trcadv.f90
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,8 @@ SUBROUTINE trcadv
!$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu)

!$acc enter data create( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
Comment on lines 177 to 179

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's not a good idea to declare these arrays here:

  • they are allocated and deallocated later, which is a waste of time
  • GPU allocation should be moved close to CPU allocate as the port progress


!$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
Expand Down Expand Up @@ -353,10 +351,8 @@ SUBROUTINE trcadv

!!trn could be allocate earlier
!$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu)
!$acc enter data create(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu)
!$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu)
!$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu)
!$acc enter data create( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu)

!$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
Expand Down Expand Up @@ -946,9 +942,7 @@ SUBROUTINE trcadv
!$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu)
!$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu)

!$acc exit data delete( tra) finalize if(use_gpu)
!$acc exit data delete( trn, advmask ) finalize if(use_gpu)
!$acc exit data delete( flx_ridxt, diaflx ) finalize if(use_gpu)
!$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu)

!!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it
Expand All @@ -963,7 +957,7 @@ SUBROUTINE trcadv
deallocate(zbuf )

!$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu)
!$acc exit data delete( e1t, e2t, e3t, e1u, e2u, e3u, e1v, e2v, e3v, e3w, un, vn, wn ) finalize if(use_gpu)
!$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu)

trcadvparttime = MPI_WTIME() - trcadvparttime
trcadvtottime = trcadvtottime + trcadvparttime
Expand Down
46 changes: 30 additions & 16 deletions src/PHYS/trczdf.f90
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ SUBROUTINE trczdf


LOGICAL :: l1,l2,l3
INTEGER :: jk,jj,ji, jn, jv, jf
INTEGER :: jk,jj,ji, jn, jv, jf, ntx
! omp variables


Expand Down Expand Up @@ -113,7 +113,11 @@ SUBROUTINE trczdf
END DO
END DO
END DO

!$acc enter data create(delta_tra,int_tra)
!$acc update device(jarr_zdf,jarr_zdf_flx)
#ifdef _OPENACC
call myalloc_ZDF_gpu()
#endif
ENDIF


Expand All @@ -129,12 +133,20 @@ SUBROUTINE trczdf
ztavg = 0.e0
!! vertical slab

! NOTE: kernel is too big, should be split
!$acc parallel loop gang vector default(present) async vector_length(32)
Comment on lines +136 to +137

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we might want to think about clever ways to generate this kernel as it seems quite big, best performance on A100 was obtained with a vector length of 32 which isn't very high

DO jv = 1, dimen_jvzdf

ji = jarr_zdf(2,jv)
jj = jarr_zdf(1,jv)
Aij = e1t(jj,ji) * e2t(jj,ji)

#ifdef _OPENACC
ntx=jv

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for GPU version we parallelize on dimen_jvzdf

#else
ntx=1
#endif

!! I. Vertical trends associated with lateral mixing
!! -------------------------------------------------
!! (excluding the vertical flux proportional to dk[t] )
Expand All @@ -155,22 +167,22 @@ SUBROUTINE trczdf
!! ... Euler time stepping when starting from rest
DO jk = 1, jpkm1
z2dtt = zdt * rdt
zwi(jk, 1) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) )
zws(jk, 1) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) )
zwd(jk, 1) = 1. - zwi(jk, 1) - zws(jk, 1)
zwi(jk, ntx) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) )
zws(jk, ntx) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) )
zwd(jk, ntx) = 1. - zwi(jk, ntx) - zws(jk, ntx)
END DO

!! Surface boundary conditions
zwi(1,1) = 0.e0
zwd(1,1) = 1. - zws(1,1)
zwi(1,ntx) = 0.e0
zwd(1,ntx) = 1. - zws(1,ntx)

!! II.1. Vertical diffusion on tr
!! ------------------------------
!! Second member construction
!! ... Euler time stepping when starting from rest
DO jk = 1, jpkm1
z2dtt = zdt * rdt
zwy(jk,1) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn)
zwy(jk,ntx) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn)
END DO

!! Matrix inversion from the first level
Expand Down Expand Up @@ -208,22 +220,22 @@ SUBROUTINE trczdf
ikstp1=ikst+1
ikenm2=jpk-2

zwt(ikst,1)=zwd(ikst,1)
zwt(ikst,ntx)=zwd(ikst,ntx)

DO jk=ikstp1,jpkm1
zwt(jk,1)=zwd(jk,1)-zwi(jk,1)*zws(jk-1,1)/zwt(jk-1,1)
zwt(jk,ntx)=zwd(jk,ntx)-zwi(jk,ntx)*zws(jk-1,ntx)/zwt(jk-1,ntx)
END DO

zwz(ikst,1)=zwy(ikst,1)
zwz(ikst,ntx)=zwy(ikst,ntx)

DO jk=ikstp1,jpkm1
zwz(jk,1)=zwy(jk,1)-zwi(jk, 1)/zwt(jk-1, 1)*zwz(jk-1, 1)
zwz(jk,ntx)=zwy(jk,ntx)-zwi(jk, ntx)/zwt(jk-1, ntx)*zwz(jk-1, ntx)
END DO

zwx(jpkm1, 1)=zwz(jpkm1, 1)/zwt(jpkm1, 1)
zwx(jpkm1, ntx)=zwz(jpkm1, ntx)/zwt(jpkm1, ntx)

DO jk=ikenm2,ikst,-1
zwx(jk, 1)=( zwz(jk, 1)-zws(jk, 1)*zwx(jk+1, 1) )/zwt(jk, 1)
zwx(jk, ntx)=( zwz(jk, ntx)-zws(jk, ntx)*zwx(jk+1, ntx) )/zwt(jk, ntx)
END DO

! calculate flux due to vertical diffusion (on top face of the grid cell jk)
Expand All @@ -232,7 +244,7 @@ SUBROUTINE trczdf
DO jk=1,jpkm1

z2dtt = zdt * rdt
delta_tra(jk) = ( zwx(jk,1) - zwy(jk,1) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid)
delta_tra(jk) = ( zwx(jk,ntx) - zwy(jk,ntx) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid)

IF (jk .EQ. 1) THEN
int_tra(1) = 0
Expand All @@ -258,16 +270,18 @@ SUBROUTINE trczdf
!! (c a u t i o n: tracer not its trend, Leap-frog scheme done
!! it will not be done in trcnxt)
DO jk = 1, jpkm1
tra(jk,jj,ji,jn) = zwx(jk,1) * tmask(jk,jj,ji)
tra(jk,jj,ji,jn) = zwx(jk,ntx) * tmask(jk,jj,ji)
END DO

END DO ! jv
!$acc end parallel loop

! end if

END DO TRACER_LOOP
!!!$omp end parallel do

!$acc wait


trczdfparttime = MPI_WTIME() - trczdfparttime !cronometer-stop
Expand Down