PTX Backend#18
Conversation
| <%inherit file='base'/> | ||
|
|
||
| <% | ||
| pftype = 'f32' if dtype == 'float' else 'f64' |
There was a problem hiding this comment.
Is it worth factoring any of this into base?
| yield from self._dense_kernel_generators(dtype, dsize, base_args) | ||
|
|
||
| def _sparse_kernel_generators(self, dtype, dsize, base_args): | ||
| if not self.is_sparse_suitable(self.A): |
There was a problem hiding this comment.
Maybe move these checks up to the _kernel_generators function.
| meta = { | ||
| 'block': (blkx, 1, 1), | ||
| 'grid': (-(-self.n // n_per_cta), 1, 1), | ||
| 'desc': f'{tpl}/nn{nn}-w{w}{"-bs" if bs else ""}', |
There was a problem hiding this comment.
Recent Python allows for single quotes here.
| yield (tpl, args, meta) | ||
|
|
||
| # Warp-specialised dense DMMA | ||
| if cc >= (10, 0): |
There was a problem hiding this comment.
Does this gate consumer cards with less shared memory?
| @@ -0,0 +1,276 @@ | |||
| # -*- coding: utf-8 -*- | |||
|
|
|||
| import struct | |||
| return | ||
|
|
||
| # Some kernels can optionally steal blocks | ||
| bs_default = cc >= (10, 0) |
There was a problem hiding this comment.
Combine with the check below?
| ws_configs = [(1, 4), (2, 4), (4, 4)] | ||
| for nn, w in ws_configs: | ||
| n_per_cta = 8 * nn * w | ||
| if n_per_cta > self.n: |
| if ws_layout['dynm_total_bytes'] > 200 * 1024: | ||
| continue | ||
|
|
||
| args = (base_args |
There was a problem hiding this comment.
I think we can reorder this to get it down to two lines?
| i = m_tile * 8 + lane // 4 | ||
| j = k_iter * 4 + lane % 4 | ||
| v = float(a[i, j]) if (i < m and j < k) else 0.0 | ||
| u = struct.unpack('<Q', struct.pack('<d', v))[0] |
There was a problem hiding this comment.
Can you unpick this for me?
|
|
||
| # A in fragment layout: lane l -> A[m_tile*8 + l/4][k_iter*4 + l%4] | ||
| a_u64 = [] | ||
| for m_tile in range(m_tiles): |
There was a problem hiding this comment.
Can 3 arg range work here?
|
I know this is an utter pain, but for FP32/FP64 can you confirm correctness for all relevant PyFR matrices at a suite of N values, for all instances where a kernel is expected to work (on A100/H100/B100)? |
| @@ -0,0 +1,4 @@ | |||
| .version 8.7 | |||
| .target sm_${cc[0]}${cc[1]}${"a" if cc[0] >= 9 else ""} | |||
| .param .u64 _c) | ||
| { | ||
| % endif | ||
| .reg .u32 n, id, tid_x, tid_y; |
There was a problem hiding this comment.
Ensure we throw higher up if n is too big.
| ## Async fill of chunk 0 | ||
| % for idx, kx in enumerate(bchunks[0]): | ||
| % if idx % msplit == cid: | ||
| % if n is None: |
There was a problem hiding this comment.
See if we can come up with some consistent indentation for Mako. Am open to ideas.
| <% | ||
| buf_cur = bb % 2 | ||
| buf_next = (bb + 1) % 2 | ||
| is_last = (bb == len(bchunks) - 1) |
There was a problem hiding this comment.
There is a Mako var for this.
| % if afix[row_j] == -1: | ||
| % if beta == 0: | ||
| { | ||
| .reg .${pftype} _tmp; |
There was a problem hiding this comment.
Can this be factored up as appears in both branches?
| fma.rn.${pftype} _ctmp, _ctmp, ${float(beta)}, dotp; | ||
| st.global.${pftype} [_cptr], _ctmp; | ||
| % else: | ||
| ld.global.${pftype} _ctmp, [c_base + ${ldc*j*dwidth_i}]; |
There was a problem hiding this comment.
Is there scope for lifting these ld's up, or does the assembler handle this?
| ld.weak.global.cg.${pftype} csub${j}, [_cptr]; | ||
| } | ||
| % else: | ||
| ld.weak.global.cg.${pftype} csub${j}, [c_base + ${ldc*j*dwidth_i}]; |
There was a problem hiding this comment.
Are we consistent in our use of loads throughout the PTX? Not sure if it makes a huge difference for performance, but code consistency would be good.
This adds a PTX backend to GiMMiK. The key features are:
Optimisations have focused on FP64; FP32 is future work.