diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 2af71f75..975c1480 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -516,7 +516,7 @@ subroutine alloc_tot() gphiv = huge(gphiv(1,1)) allocate(gphif(jpj,jpi)) gphif = huge(gphif(1,1)) - allocate(e1t(jpj,jpi)) + allocate(e1t(jpj,jpi)) e1t = huge(e1t(1,1)) allocate(e1u(jpj,jpi)) e1u = huge(e1u(1,1)) @@ -790,10 +790,13 @@ subroutine alloc_tot() allocate(DAY_LENGTH(jpj,jpi)) DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. + + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem #endif - + END subroutine alloc_tot @@ -802,6 +805,8 @@ subroutine clean_memory() ! myalloc (memory.f90) + !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef key_mpp !$acc exit data delete(te_send, tw_send, tn_send, ts_send) finalize diff --git a/src/General/step.f90 b/src/General/step.f90 index 80c8ef31..e776aa14 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -265,6 +265,10 @@ SUBROUTINE trcstp ! with surface boundary condition ! with IMPLICIT vertical diffusion + ! XXX: to be removed + use DIA_mem, only: diaflx + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt + IMPLICIT NONE integer jn,jk,ji,jj trcstpparttime = MPI_WTIME() ! cronometer-start @@ -296,7 +300,9 @@ SUBROUTINE trcstp IF (lbfm) CALL trcsms + !$acc update device(e1t,diaflx,e3t_back,e2t,trb,tmask,e3t,tra,avt,e3w) if (lzdf) IF (lzdf) CALL trczdf ! tracers: vertical diffusion + !$acc update host(diaflx,tra) if (lzdf) IF (lsnu) CALL snutel diff --git a/src/IO/DIA_mem.f90 b/src/IO/DIA_mem.f90 index e385f32e..223d6c27 100644 --- a/src/IO/DIA_mem.f90 +++ b/src/IO/DIA_mem.f90 @@ -50,6 +50,7 @@ SUBROUTINE alloc_DIA_local_flx() INDflxDUMP = huge(INDflxDUMP(1)) allocate(diaflx (7, Fsize, jptra )) diaflx = 0 + !$acc enter data create(diaflx) END SUBROUTINE alloc_DIA_local_flx @@ -95,6 +96,7 @@ subroutine clean_memory_dia() if (allocated(diaflx)) then deallocate(diaflx) + !$acc exit data delete(diaflx) endif if (allocated(INDflxDUMPZERO)) then diff --git a/src/PHYS/ZDF_mem.f90 b/src/PHYS/ZDF_mem.f90 index cb3fe00b..265e7004 100644 --- a/src/PHYS/ZDF_mem.f90 +++ b/src/PHYS/ZDF_mem.f90 @@ -46,6 +46,9 @@ subroutine myalloc_ZDF() jarr_zdf = huge(jarr_zdf(1,1)) allocate(jarr_zdf_flx(jpi*jpj,jpk)) jarr_zdf_flx = huge(jarr_zdf_flx(1,1)) + !$acc enter data create(jarr_zdf,jarr_zdf_flx) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifndef _OPENACC allocate(zwd(jpk, ntids)) zwd = huge(zwd(1,1)) allocate(zws(jpk, ntids)) @@ -60,6 +63,7 @@ subroutine myalloc_ZDF() zwz = huge(zwz(1,1)) allocate(zwt(jpk, ntids)) zwt = huge(zwt(1,1)) +#endif #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -67,7 +71,28 @@ subroutine myalloc_ZDF() END subroutine myalloc_ZDF - + +#ifdef _OPENACC + subroutine myalloc_ZDF_gpu() + allocate(zwd(jpk, dimen_jvzdf)) + zwd = huge(zwd(1,1)) + allocate(zws(jpk, dimen_jvzdf)) + zws = huge(zws(1,1)) + allocate(zwi(jpk, dimen_jvzdf)) + zwi = huge(zwi(1,1)) + allocate(zwx(jpk, dimen_jvzdf)) + zwx = huge(zwx(1,1)) + allocate(zwy(jpk, dimen_jvzdf)) + zwy = huge(zwy(1,1)) + allocate(zwz(jpk, dimen_jvzdf)) + zwz = huge(zwz(1,1)) + allocate(zwt(jpk, dimen_jvzdf)) + zwt = huge(zwt(1,1)) + + !$acc enter data create(zwd,zwi,zwx,zws,zwz,zwy,zwt) + !$acc update device(zwd,zwi,zwx,zws,zwz,zwy,zwt) + END subroutine myalloc_ZDF_gpu +#endif subroutine clean_memory_zdf() @@ -82,6 +107,8 @@ subroutine clean_memory_zdf() deallocate(zwz) deallocate(zwt) + !$acc exit data delete(jarr_zdf,jarr_zdf_flx,zwd,zwi,zwx,zws,zwz,zwy,zwt) + end subroutine clean_memory_zdf diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 129613be..65d0a75b 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -174,10 +174,8 @@ SUBROUTINE trcadv !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -353,10 +351,8 @@ SUBROUTINE trcadv !!trn could be allocate earlier !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc enter data create(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - !$acc enter data create( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -946,9 +942,7 @@ SUBROUTINE trcadv !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc exit data delete( tra) finalize if(use_gpu) !$acc exit data delete( trn, advmask ) finalize if(use_gpu) - !$acc exit data delete( flx_ridxt, diaflx ) finalize if(use_gpu) !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it @@ -963,7 +957,7 @@ SUBROUTINE trcadv deallocate(zbuf ) !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) - !$acc exit data delete( e1t, e2t, e3t, e1u, e2u, e3u, e1v, e2v, e3v, e3w, un, vn, wn ) finalize if(use_gpu) + !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 0a02d892..a1b02ee3 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -69,7 +69,7 @@ SUBROUTINE trczdf LOGICAL :: l1,l2,l3 - INTEGER :: jk,jj,ji, jn, jv, jf + INTEGER :: jk,jj,ji, jn, jv, jf, ntx ! omp variables @@ -113,7 +113,11 @@ SUBROUTINE trczdf END DO END DO END DO - + !$acc enter data create(delta_tra,int_tra) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifdef _OPENACC + call myalloc_ZDF_gpu() +#endif ENDIF @@ -129,12 +133,20 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab + ! NOTE: kernel is too big, should be split + !$acc parallel loop gang vector default(present) async vector_length(32) DO jv = 1, dimen_jvzdf ji = jarr_zdf(2,jv) jj = jarr_zdf(1,jv) Aij = e1t(jj,ji) * e2t(jj,ji) +#ifdef _OPENACC + ntx=jv +#else + ntx=1 +#endif + !! I. Vertical trends associated with lateral mixing !! ------------------------------------------------- !! (excluding the vertical flux proportional to dk[t] ) @@ -155,14 +167,14 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwi(jk, 1) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) - zws(jk, 1) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) - zwd(jk, 1) = 1. - zwi(jk, 1) - zws(jk, 1) + zwi(jk, ntx) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) + zws(jk, ntx) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) + zwd(jk, ntx) = 1. - zwi(jk, ntx) - zws(jk, ntx) END DO !! Surface boundary conditions - zwi(1,1) = 0.e0 - zwd(1,1) = 1. - zws(1,1) + zwi(1,ntx) = 0.e0 + zwd(1,ntx) = 1. - zws(1,ntx) !! II.1. Vertical diffusion on tr !! ------------------------------ @@ -170,7 +182,7 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwy(jk,1) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) + zwy(jk,ntx) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) END DO !! Matrix inversion from the first level @@ -208,22 +220,22 @@ SUBROUTINE trczdf ikstp1=ikst+1 ikenm2=jpk-2 - zwt(ikst,1)=zwd(ikst,1) + zwt(ikst,ntx)=zwd(ikst,ntx) DO jk=ikstp1,jpkm1 - zwt(jk,1)=zwd(jk,1)-zwi(jk,1)*zws(jk-1,1)/zwt(jk-1,1) + zwt(jk,ntx)=zwd(jk,ntx)-zwi(jk,ntx)*zws(jk-1,ntx)/zwt(jk-1,ntx) END DO - zwz(ikst,1)=zwy(ikst,1) + zwz(ikst,ntx)=zwy(ikst,ntx) DO jk=ikstp1,jpkm1 - zwz(jk,1)=zwy(jk,1)-zwi(jk, 1)/zwt(jk-1, 1)*zwz(jk-1, 1) + zwz(jk,ntx)=zwy(jk,ntx)-zwi(jk, ntx)/zwt(jk-1, ntx)*zwz(jk-1, ntx) END DO - zwx(jpkm1, 1)=zwz(jpkm1, 1)/zwt(jpkm1, 1) + zwx(jpkm1, ntx)=zwz(jpkm1, ntx)/zwt(jpkm1, ntx) DO jk=ikenm2,ikst,-1 - zwx(jk, 1)=( zwz(jk, 1)-zws(jk, 1)*zwx(jk+1, 1) )/zwt(jk, 1) + zwx(jk, ntx)=( zwz(jk, ntx)-zws(jk, ntx)*zwx(jk+1, ntx) )/zwt(jk, ntx) END DO ! calculate flux due to vertical diffusion (on top face of the grid cell jk) @@ -232,7 +244,7 @@ SUBROUTINE trczdf DO jk=1,jpkm1 z2dtt = zdt * rdt - delta_tra(jk) = ( zwx(jk,1) - zwy(jk,1) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) + delta_tra(jk) = ( zwx(jk,ntx) - zwy(jk,ntx) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) IF (jk .EQ. 1) THEN int_tra(1) = 0 @@ -258,16 +270,18 @@ SUBROUTINE trczdf !! (c a u t i o n: tracer not its trend, Leap-frog scheme done !! it will not be done in trcnxt) DO jk = 1, jpkm1 - tra(jk,jj,ji,jn) = zwx(jk,1) * tmask(jk,jj,ji) + tra(jk,jj,ji,jn) = zwx(jk,ntx) * tmask(jk,jj,ji) END DO END DO ! jv + !$acc end parallel loop ! end if END DO TRACER_LOOP !!!$omp end parallel do + !$acc wait trczdfparttime = MPI_WTIME() - trczdfparttime !cronometer-stop