-
Notifications
You must be signed in to change notification settings - Fork 3
port trczdf to GPU #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev_gpu
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -174,10 +174,8 @@ SUBROUTINE trcadv | |
| !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
|
|
||
| !$acc enter data create( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
|
Comment on lines
177
to
179
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's not a good idea to declare these arrays here:
|
||
|
|
||
| !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
|
|
@@ -353,10 +351,8 @@ SUBROUTINE trcadv | |
|
|
||
| !!trn could be allocate earlier | ||
| !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) | ||
| !$acc enter data create(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) | ||
| !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) | ||
| !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) | ||
| !$acc enter data create( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) | ||
|
|
||
| !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
|
|
@@ -946,9 +942,7 @@ SUBROUTINE trcadv | |
| !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
| !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) | ||
|
|
||
| !$acc exit data delete( tra) finalize if(use_gpu) | ||
| !$acc exit data delete( trn, advmask ) finalize if(use_gpu) | ||
| !$acc exit data delete( flx_ridxt, diaflx ) finalize if(use_gpu) | ||
| !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) | ||
|
|
||
| !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it | ||
|
|
@@ -963,7 +957,7 @@ SUBROUTINE trcadv | |
| deallocate(zbuf ) | ||
|
|
||
| !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) | ||
| !$acc exit data delete( e1t, e2t, e3t, e1u, e2u, e3u, e1v, e2v, e3v, e3w, un, vn, wn ) finalize if(use_gpu) | ||
| !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) | ||
|
|
||
| trcadvparttime = MPI_WTIME() - trcadvparttime | ||
| trcadvtottime = trcadvtottime + trcadvparttime | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,7 +69,7 @@ SUBROUTINE trczdf | |
|
|
||
|
|
||
| LOGICAL :: l1,l2,l3 | ||
| INTEGER :: jk,jj,ji, jn, jv, jf | ||
| INTEGER :: jk,jj,ji, jn, jv, jf, ntx | ||
| ! omp variables | ||
|
|
||
|
|
||
|
|
@@ -113,7 +113,11 @@ SUBROUTINE trczdf | |
| END DO | ||
| END DO | ||
| END DO | ||
|
|
||
| !$acc enter data create(delta_tra,int_tra) | ||
| !$acc update device(jarr_zdf,jarr_zdf_flx) | ||
| #ifdef _OPENACC | ||
| call myalloc_ZDF_gpu() | ||
| #endif | ||
| ENDIF | ||
|
|
||
|
|
||
|
|
@@ -129,12 +133,20 @@ SUBROUTINE trczdf | |
| ztavg = 0.e0 | ||
| !! vertical slab | ||
|
|
||
| ! NOTE: kernel is too big, should be split | ||
| !$acc parallel loop gang vector default(present) async vector_length(32) | ||
|
Comment on lines
+136
to
+137
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we might want to think about clever ways to generate this kernel as it seems quite big, best performance on A100 was obtained with a vector length of 32 which isn't very high |
||
| DO jv = 1, dimen_jvzdf | ||
|
|
||
| ji = jarr_zdf(2,jv) | ||
| jj = jarr_zdf(1,jv) | ||
| Aij = e1t(jj,ji) * e2t(jj,ji) | ||
|
|
||
| #ifdef _OPENACC | ||
| ntx=jv | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for GPU version we parallelize on dimen_jvzdf |
||
| #else | ||
| ntx=1 | ||
| #endif | ||
|
|
||
| !! I. Vertical trends associated with lateral mixing | ||
| !! ------------------------------------------------- | ||
| !! (excluding the vertical flux proportional to dk[t] ) | ||
|
|
@@ -155,22 +167,22 @@ SUBROUTINE trczdf | |
| !! ... Euler time stepping when starting from rest | ||
| DO jk = 1, jpkm1 | ||
| z2dtt = zdt * rdt | ||
| zwi(jk, 1) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) | ||
| zws(jk, 1) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) | ||
| zwd(jk, 1) = 1. - zwi(jk, 1) - zws(jk, 1) | ||
| zwi(jk, ntx) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) | ||
| zws(jk, ntx) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) | ||
| zwd(jk, ntx) = 1. - zwi(jk, ntx) - zws(jk, ntx) | ||
| END DO | ||
|
|
||
| !! Surface boundary conditions | ||
| zwi(1,1) = 0.e0 | ||
| zwd(1,1) = 1. - zws(1,1) | ||
| zwi(1,ntx) = 0.e0 | ||
| zwd(1,ntx) = 1. - zws(1,ntx) | ||
|
|
||
| !! II.1. Vertical diffusion on tr | ||
| !! ------------------------------ | ||
| !! Second member construction | ||
| !! ... Euler time stepping when starting from rest | ||
| DO jk = 1, jpkm1 | ||
| z2dtt = zdt * rdt | ||
| zwy(jk,1) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) | ||
| zwy(jk,ntx) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) | ||
| END DO | ||
|
|
||
| !! Matrix inversion from the first level | ||
|
|
@@ -208,22 +220,22 @@ SUBROUTINE trczdf | |
| ikstp1=ikst+1 | ||
| ikenm2=jpk-2 | ||
|
|
||
| zwt(ikst,1)=zwd(ikst,1) | ||
| zwt(ikst,ntx)=zwd(ikst,ntx) | ||
|
|
||
| DO jk=ikstp1,jpkm1 | ||
| zwt(jk,1)=zwd(jk,1)-zwi(jk,1)*zws(jk-1,1)/zwt(jk-1,1) | ||
| zwt(jk,ntx)=zwd(jk,ntx)-zwi(jk,ntx)*zws(jk-1,ntx)/zwt(jk-1,ntx) | ||
| END DO | ||
|
|
||
| zwz(ikst,1)=zwy(ikst,1) | ||
| zwz(ikst,ntx)=zwy(ikst,ntx) | ||
|
|
||
| DO jk=ikstp1,jpkm1 | ||
| zwz(jk,1)=zwy(jk,1)-zwi(jk, 1)/zwt(jk-1, 1)*zwz(jk-1, 1) | ||
| zwz(jk,ntx)=zwy(jk,ntx)-zwi(jk, ntx)/zwt(jk-1, ntx)*zwz(jk-1, ntx) | ||
| END DO | ||
|
|
||
| zwx(jpkm1, 1)=zwz(jpkm1, 1)/zwt(jpkm1, 1) | ||
| zwx(jpkm1, ntx)=zwz(jpkm1, ntx)/zwt(jpkm1, ntx) | ||
|
|
||
| DO jk=ikenm2,ikst,-1 | ||
| zwx(jk, 1)=( zwz(jk, 1)-zws(jk, 1)*zwx(jk+1, 1) )/zwt(jk, 1) | ||
| zwx(jk, ntx)=( zwz(jk, ntx)-zws(jk, ntx)*zwx(jk+1, ntx) )/zwt(jk, ntx) | ||
| END DO | ||
|
|
||
| ! calculate flux due to vertical diffusion (on top face of the grid cell jk) | ||
|
|
@@ -232,7 +244,7 @@ SUBROUTINE trczdf | |
| DO jk=1,jpkm1 | ||
|
|
||
| z2dtt = zdt * rdt | ||
| delta_tra(jk) = ( zwx(jk,1) - zwy(jk,1) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) | ||
| delta_tra(jk) = ( zwx(jk,ntx) - zwy(jk,ntx) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) | ||
|
|
||
| IF (jk .EQ. 1) THEN | ||
| int_tra(1) = 0 | ||
|
|
@@ -258,16 +270,18 @@ SUBROUTINE trczdf | |
| !! (c a u t i o n: tracer not its trend, Leap-frog scheme done | ||
| !! it will not be done in trcnxt) | ||
| DO jk = 1, jpkm1 | ||
| tra(jk,jj,ji,jn) = zwx(jk,1) * tmask(jk,jj,ji) | ||
| tra(jk,jj,ji,jn) = zwx(jk,ntx) * tmask(jk,jj,ji) | ||
| END DO | ||
|
|
||
| END DO ! jv | ||
| !$acc end parallel loop | ||
|
|
||
| ! end if | ||
|
|
||
| END DO TRACER_LOOP | ||
| !!!$omp end parallel do | ||
|
|
||
| !$acc wait | ||
|
|
||
|
|
||
| trczdfparttime = MPI_WTIME() - trczdfparttime !cronometer-stop | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We create a new subroutine here that is called once in trczdf after dimen_jvzdf value is known
We could probably do the same for the CPU version to avoid duplicates, also the memory counter might needs to be adapted