Extremely fast implementation of matrix multiplication.
- ; YASM/NASM-compatible matrix implementation for amd64, using SSE2
- segment .text
- global MMath_GFX_CoreInit
- ;-----------------------------------------------------------------------
- ; MMath_GFX_CoreInit
- ; Clears the CPU feature-flag bytes, then fills them in from CPUID
- ; leaf 0 (vendor string) and leaf 1 (feature bits in ecx).
- ; In:    nothing
- ; Out:   no defined return value
- ; Clobb: rax, rcx, rdx, flags (rbx is saved and restored)
- ;-----------------------------------------------------------------------
- MMath_GFX_CoreInit:
-     push rbx                            ; cpuid overwrites ebx
-     xor eax, eax                        ; zero for the stores below, and CPUID leaf 0
-     ; Clear each flag byte individually. A single 8-byte store at
-     ; mmath_gfx_flags would run past the six bytes that are defined there.
-     mov BYTE [rel mmath_gfx_have_sse4_1], al
-     mov BYTE [rel mmath_gfx_have_sse4_2], al
-     mov BYTE [rel mmath_gfx_have_avx], al
-     mov BYTE [rel mmath_gfx_have_f16c], al
-     mov BYTE [rel mmath_gfx_is_amd], al
-     mov BYTE [rel mmath_gfx_use_sse4_1], al
-     cpuid
-     ; Test for AMD: the vendor string "AuthenticAMD" is returned in ebx:edx:ecx.
-     cmp ebx, 0x68747541                 ; "Auth"
-     jne .not_amd
-     cmp edx, 0x69746e65                 ; "enti"
-     jne .not_amd
-     cmp ecx, 0x444d4163                 ; "cAMD"
-     sete BYTE [rel mmath_gfx_is_amd]
- .not_amd:
-     mov eax, 1                          ; CPUID leaf 1: feature bits
-     cpuid
-     bt ecx, 19                          ; SSE4.1
-     setc BYTE [rel mmath_gfx_have_sse4_1]
-     bt ecx, 20                          ; SSE4.2
-     setc BYTE [rel mmath_gfx_have_sse4_2]
-     ; NOTE(review): only the CPU feature bit is checked here; OS support for
-     ; the AVX register state (OSXSAVE / XGETBV) is not verified.
-     bt ecx, 28                          ; AVX
-     jnc .no_avx
-     mov BYTE [rel mmath_gfx_have_avx], 1
-     bt ecx, 29                          ; F16C, only recorded when AVX is present
-     setc BYTE [rel mmath_gfx_have_f16c]
- .no_avx:
-     ; As written, use_sse4_1 is set only when the vendor is AMD *and* SSE4.1
-     ; is present; on any other vendor it stays 0.
-     ; NOTE(review): confirm that AMD-only is the intended policy.
-     test BYTE [rel mmath_gfx_is_amd], 1
-     jz .no_use_sse4_1
-     test BYTE [rel mmath_gfx_have_sse4_1], 1
-     setnz BYTE [rel mmath_gfx_use_sse4_1]
- .no_use_sse4_1:
-     pop rbx
-     ret
- global MMath_GFX_CoreInitSoft
- ;-----------------------------------------------------------------------
- ; MMath_GFX_CoreInitSoft
- ; Clears every CPU feature flag without probing CPUID, so
- ; mmath_gfx_use_sse4_1 is left at 0.
- ; In:    nothing
- ; Out:   rax = 0
- ; Clobb: rax, flags
- ;-----------------------------------------------------------------------
- MMath_GFX_CoreInitSoft:
-     xor eax, eax
-     ; Clear each flag byte individually. A single 8-byte store at
-     ; mmath_gfx_flags would run past the six bytes that are defined there.
-     mov BYTE [rel mmath_gfx_have_sse4_1], al
-     mov BYTE [rel mmath_gfx_have_sse4_2], al
-     mov BYTE [rel mmath_gfx_have_avx], al
-     mov BYTE [rel mmath_gfx_have_f16c], al
-     mov BYTE [rel mmath_gfx_is_amd], al
-     mov BYTE [rel mmath_gfx_use_sse4_1], al
-     ret
- ;-----------------------------------------------------------------------
- ; void MMath_GFX_MatrixTransform(
- ;     const float *in,
- ;     const float *matrix,
- ;     float *out)
- ; Win64 entry point: rcx = in, rdx = matrix, r8 = out.
- ; Computes out[i] = dot(in[0..3], matrix[4*i .. 4*i+3]) for i = 0..3.
- ; out[0] is accumulated on the x87 stack; out[1..3] use SSE.
- ; Clobb: rcx, r9, xmm0-xmm2, x87 stack, MMX state (emms), flags.
- ;        rdi is used internally but restored before returning.
- ; TODO: Vector call?
- ;-----------------------------------------------------------------------
- global MMath_GFX_MatrixTransform_Win64
- MMath_GFX_MatrixTransform_Win64:
-     emms ; I don't trust older compilers :/
-     ; The shared bodies read `in` through rdi, which is callee-saved on
-     ; Win64. Park the caller's rdi in r9 (volatile, and untouched by either
-     ; body) and put it back before returning. The bodies are reached with
-     ; `call` so control comes back here; they use no stack of their own, so
-     ; the call-site stack alignment does not matter to them.
-     mov r9, rdi
-     mov rdi, rcx                        ; rdi = in
-     fld DWORD [rdi]                     ; st0 = in[0]
-     fmul DWORD [rdx]                    ; st0 = in[0] * matrix[0]
-     movups xmm0, [rdi]                  ; xmm0 = in[0..3]
-     mov rcx, r8                         ; rcx = out
-     test BYTE [rel mmath_gfx_use_sse4_1], 1
-     jz .use_sse2
-     call mmath_gfx_matrix_transform_sse4_1
-     mov rdi, r9
-     ret
- .use_sse2:
-     call mmath_gfx_matrix_transform_sse2
-     mov rdi, r9
-     ret
- ;-----------------------------------------------------------------------
- ; Shared SSE4.1 body (also the jump target of the System V entry point).
- ; In:    rdi = in, rdx = matrix, rcx = out,
- ;        xmm0 = in[0..3], st0 = in[0] * matrix[0]
- ; Out:   out[0..3] written; x87 stack popped back to its entry depth - 1
- ; Clobb: xmm1, x87 stack, flags untouched
- ;
- ; dpps is emitted as raw bytes because older YASM/NASM cannot assemble it.
- ; Each matrix row is loaded with movups and dpps is used register-to-
- ; register, so `matrix` does not have to be 16-byte aligned (a memory
- ; operand on dpps would fault on an unaligned address).
- ; dpps imm8 0xFF = multiply all four lanes, broadcast the sum to all lanes.
- ;
- ; The x87 sequence for out[0], interleaved with the SSE work as an
- ; experiment, is:
- ;   fld in[k] / fmul matrix[k] / faddp   for k = 1..3, then fstp out[0]
- ; TODO: We should be able to load some floats into st0 with movdq2q
- ;-----------------------------------------------------------------------
- mmath_gfx_matrix_transform_sse4_1:
-     fld DWORD [rdi + 4]                 ; st0 = in[1]
-     movups xmm1, [rdx + 16]             ; xmm1 = matrix[4..7]
-     fmul DWORD [rdx + 4]                ; st0 = in[1] * matrix[1]
-     ; dpps xmm1, xmm0, 0xFF
-     db 0x66,0x0F,0x3A,0x40,0xC8,0xFF
-     faddp                               ; st0 = running sum for out[0]
-     movss [rcx + 4], xmm1               ; out[1]
-     fld DWORD [rdi + 8]                 ; st0 = in[2]
-     movups xmm1, [rdx + 32]             ; xmm1 = matrix[8..11]
-     fmul DWORD [rdx + 8]                ; st0 = in[2] * matrix[2]
-     ; dpps xmm1, xmm0, 0xFF
-     db 0x66,0x0F,0x3A,0x40,0xC8,0xFF
-     faddp
-     movss [rcx + 8], xmm1               ; out[2]
-     ; Two x87 ops back to back here: the original author measured this
-     ; ordering as the best-pipelining one ahead of the last dpps (~3%).
-     ; NOTE(review): re-run the perf tests if this ordering is touched.
-     fld DWORD [rdi + 12]                ; st0 = in[3]
-     fmul DWORD [rdx + 12]               ; st0 = in[3] * matrix[3]
-     movups xmm1, [rdx + 48]             ; xmm1 = matrix[12..15]
-     ; dpps xmm1, xmm0, 0xFF
-     db 0x66,0x0F,0x3A,0x40,0xC8,0xFF
-     faddp
-     movss [rcx + 12], xmm1              ; out[3]
-     fstp DWORD [rcx]                    ; out[0], pops the x87 accumulator
-     ret
- ;-----------------------------------------------------------------------
- ; void MMath_GFX_MatrixTransform(const float *in,
- ;                                const float *matrix,
- ;                                float *out)
- ; System V AMD64 entry point: rdi = in, rsi = matrix, rdx = out.
- ; Computes out[i] = dot(in[0..3], matrix[4*i .. 4*i+3]) for i = 0..3.
- ; out[0] is accumulated on the x87 stack; out[1..3] use SSE.
- ; Clobb: rcx, rdx, xmm0-xmm2, x87 stack, flags
- ;-----------------------------------------------------------------------
- global MMath_GFX_MatrixTransform
- MMath_GFX_MatrixTransform:
-     fld DWORD [rdi]                     ; st0 = in[0]
-     movups xmm0, [rdi]                  ; xmm0 = in[0..3]
-     mov rcx, rdx                        ; rcx = out
-     fmul DWORD [rsi]                    ; st0 = in[0] * matrix[0]
-     mov rdx, rsi                        ; rdx = matrix
-     test BYTE [rel mmath_gfx_use_sse4_1], 1
-     jnz mmath_gfx_matrix_transform_sse4_1
-     ; FALLTHROUGH
- ;-----------------------------------------------------------------------
- ; Shared SSE2 body (also reached from the Win64 entry point).
- ; In:    rdi = in, rdx = matrix, rcx = out,
- ;        xmm0 = in[0..3], st0 = in[0] * matrix[0]
- ; Out:   out[0..3] written
- ; Clobb: xmm0, xmm1, xmm2, x87 stack
- ;
- ; Every matrix row is loaded with movups, so `matrix` does not have to be
- ; 16-byte aligned.
- ;
- ; The x87 sequence for out[0], interleaved with the SSE2 work as an
- ; experiment, is:
- ;   fld in[k] / fmul matrix[k] / faddp   for k = 1..3, then fstp out[0]
- ;
- ; Each SSE2 dot product follows this pattern (shown for out[0], which is
- ; actually computed on the x87 side):
- ;   movups  xmm1, [rdx]
- ;   mulps   xmm1, xmm0
- ;   movhlps xmm2, xmm1
- ;   addps   xmm1, xmm2
- ;   pshufd  xmm2, xmm1, 0x1 ; _MM_SHUFFLE(0, 0, 0, 1)
- ;   addps   xmm1, xmm2
- ;   movss   [rcx], xmm1
- ; Only lane 0 of the final sum is meaningful; the upper lanes are ignored.
- ;-----------------------------------------------------------------------
- mmath_gfx_matrix_transform_sse2:
-     ; Dot product of vec and matrix[4:7]
-     movups xmm1, [rdx + 16]
-     mulps xmm1, xmm0
-     fld DWORD [rdi + 4]                 ; st0 = in[1]
-     movhlps xmm2, xmm1                  ; xmm2[0..1] = products 2 and 3
-     fmul DWORD [rdx + 4]                ; st0 = in[1] * matrix[1]
-     addps xmm1, xmm2
-     faddp                               ; st0 = running sum for out[0]
-     pshufd xmm2, xmm1, 0x1 ; _MM_SHUFFLE(0, 0, 0, 1)
-     fld DWORD [rdi + 8]                 ; st0 = in[2]
-     addps xmm1, xmm2
-     fmul DWORD [rdx + 8]                ; st0 = in[2] * matrix[2]
-     movss [rcx + 4], xmm1               ; out[1]
-     faddp
-     ; Dot product of vec and matrix[8:11]
-     movups xmm1, [rdx + 32]
-     fld DWORD [rdi + 12]                ; st0 = in[3]
-     mulps xmm1, xmm0
-     fmul DWORD [rdx + 12]               ; st0 = in[3] * matrix[3]
-     movhlps xmm2, xmm1
-     faddp
-     addps xmm1, xmm2
-     fstp DWORD [rcx]                    ; out[0], pops the x87 accumulator
-     pshufd xmm2, xmm1, 0x1 ; _MM_SHUFFLE(0, 0, 0, 1)
-     addps xmm1, xmm2
-     movss [rcx + 8], xmm1               ; out[2]
-     ; Dot product of vec and matrix[12:15]
-     ; xmm1 is free again after the store above, so load the last row through
-     ; it with movups (mulps with a memory operand would fault on an unaligned
-     ; matrix), then destructively update xmm0 since `in` is no longer needed.
-     movups xmm1, [rdx + 48]
-     mulps xmm0, xmm1
-     movhlps xmm2, xmm0
-     addps xmm0, xmm2
-     pshufd xmm2, xmm0, 0x1 ; _MM_SHUFFLE(0, 0, 0, 1)
-     addps xmm0, xmm2
-     movss [rcx + 12], xmm0              ; out[3]
-     ret
- segment .data
- ; NOTE(review): mmath_matrix_transform is not referenced by any code visible
- ; in this file; kept as-is in case something external uses it.
- mmath_matrix_transform: dd 0,0
- global mmath_gfx_flags
- global _mmath_gfx_flags
- ; CPU feature flags, one byte each (0 or 1). mmath_gfx_flags and
- ; _mmath_gfx_flags are two exported names for the start of the block, so
- ; both alias the address of mmath_gfx_have_sse4_1.
- mmath_gfx_flags: ; FALLTHROUGH
- _mmath_gfx_flags: ; FALLTHROUGH
- mmath_gfx_have_sse4_1: db 0
- mmath_gfx_have_sse4_2: db 0
- mmath_gfx_have_avx: db 0
- mmath_gfx_have_f16c: db 0
- mmath_gfx_is_amd: db 0
- mmath_gfx_use_sse4_1: db 0
- ; Pad the flag block to 8 bytes so that a qword access at mmath_gfx_flags
- ; stays inside this object instead of touching whatever is linked after it.
- db 0, 0