determine where to insert prefetch/prefetchnta instructions
Posted: Wed Feb 26, 2025 11:27 pm
I'm writing my own GEMM implementation on my AMD platform.
How can I determine where to insert prefetch/prefetchnta instructions to make it as fast as AOCL?
000000000040183f <cblas_dgemm.L7>:
40183f: 4d 89 c8 mov %r9,%r8
401842: c5 fc 77 vzeroall
401845: b9 1e 00 00 00 mov $0x1e,%ecx
000000000040184a <cblas_dgemm.L9>:
40184a: c4 c1 7d 28 22 vmovapd (%r10),%ymm4
40184f: c4 c1 7d 28 6a 20 vmovapd 0x20(%r10),%ymm5
401855: c4 42 dd b8 00 vfmadd231pd (%r8),%ymm4,%ymm8
40185a: c4 42 d5 b8 40 20 vfmadd231pd 0x20(%r8),%ymm5,%ymm8
401860: c4 42 dd b8 0c 00 vfmadd231pd (%r8,%rax,1),%ymm4,%ymm9
401866: c4 42 d5 b8 4c 00 20 vfmadd231pd 0x20(%r8,%rax,1),%ymm5,%ymm9
40186d: c4 42 dd b8 14 40 vfmadd231pd (%r8,%rax,2),%ymm4,%ymm10
401873: c4 42 d5 b8 54 40 20 vfmadd231pd 0x20(%r8,%rax,2),%ymm5,%ymm10
40187a: c4 42 dd b8 1c 18 vfmadd231pd (%r8,%rbx,1),%ymm4,%ymm11
401880: c4 42 d5 b8 5c 18 20 vfmadd231pd 0x20(%r8,%rbx,1),%ymm5,%ymm11
401887: 49 83 c2 40 add $0x40,%r10
40188b: 49 83 c0 40 add $0x40,%r8
40188f: 48 83 e9 01 sub $0x1,%rcx
401893: 75 b5 jne 40184a <cblas_dgemm.L9>
How can I determine where to insert prefetch/prefetchnta instructions to make it as fast as AOCL?
000000000040183f <cblas_dgemm.L7>:
40183f: 4d 89 c8 mov %r9,%r8
401842: c5 fc 77 vzeroall
401845: b9 1e 00 00 00 mov $0x1e,%ecx
000000000040184a <cblas_dgemm.L9>:
40184a: c4 c1 7d 28 22 vmovapd (%r10),%ymm4
40184f: c4 c1 7d 28 6a 20 vmovapd 0x20(%r10),%ymm5
401855: c4 42 dd b8 00 vfmadd231pd (%r8),%ymm4,%ymm8
40185a: c4 42 d5 b8 40 20 vfmadd231pd 0x20(%r8),%ymm5,%ymm8
401860: c4 42 dd b8 0c 00 vfmadd231pd (%r8,%rax,1),%ymm4,%ymm9
401866: c4 42 d5 b8 4c 00 20 vfmadd231pd 0x20(%r8,%rax,1),%ymm5,%ymm9
40186d: c4 42 dd b8 14 40 vfmadd231pd (%r8,%rax,2),%ymm4,%ymm10
401873: c4 42 d5 b8 54 40 20 vfmadd231pd 0x20(%r8,%rax,2),%ymm5,%ymm10
40187a: c4 42 dd b8 1c 18 vfmadd231pd (%r8,%rbx,1),%ymm4,%ymm11
401880: c4 42 d5 b8 5c 18 20 vfmadd231pd 0x20(%r8,%rbx,1),%ymm5,%ymm11
401887: 49 83 c2 40 add $0x40,%r10
40188b: 49 83 c0 40 add $0x40,%r8
40188f: 48 83 e9 01 sub $0x1,%rcx
401893: 75 b5 jne 40184a <cblas_dgemm.L9>