
Optimized Matrix Multiplication
121
22007E/0—November 1999
AMD Athlon™ Processor x86 Code Optimization
;----------------------------------------------------------------------
; $$xform -- vertex transform loop (MMX / 3DNow!), one vertex per pass.
;
; Each iteration reads a 4-float vertex v = (x, y, z, w), multiplies it
; by a 4x4 float matrix m and stores the 4-float result:
;     res->x = v->x*m[0][0] + v->y*m[1][0] + v->z*m[2][0] + v->w*m[3][0]
;     res->y = v->x*m[0][1] + v->y*m[1][1] + v->z*m[2][1] + v->w*m[3][1]
;     res->z = v->x*m[0][2] + v->y*m[1][2] + v->z*m[2][2] + v->w*m[3][2]
;     res->w = v->x*m[0][3] + v->y*m[1][3] + v->z*m[2][3] + v->w*m[3][3]
;
; Register roles as used by the loop (the setup code is outside this
; fragment -- NOTE(review): confirm against the function prologue):
;     EAX = base address for the matrix loads (offsets M00..M32)
;     EDX = current source vertex (v), advanced by 16 bytes per pass
;     EBX = result pointer (res); bumped first, then written at -16/-8
;     ECX = remaining vertex count (numverts), counted down to zero
;     MM0-MM7 = scratch; all eight are overwritten
;
; M00, M02, M10, M12, M20, M22, M30, M32 are symbolic displacements
; defined elsewhere; each names a pair of adjacent matrix elements.
;
; NOTE(review): the loop is bottom-tested with DEC/JNZ, so ECX must be
; non-zero on entry; a zero count would wrap. No guard is visible here.
;
; Register comments use "high dword | low dword" notation.
; Instruction order is hand-scheduled; do not reorder casually.
;----------------------------------------------------------------------
$$xform:
    ADD       EBX, 16                   ;res++
    MOVQ      MM0, QWORD PTR [EDX]      ;v->y | v->x
    MOVQ      MM1, QWORD PTR [EDX+8]    ;v->w | v->z
    ADD       EDX, 16                   ;v++
    MOVQ      MM2, MM0                  ;v->y | v->x
    MOVQ      MM3, QWORD PTR [EAX+M00]  ;m[0][1] | m[0][0]
    PUNPCKLDQ MM0, MM0                  ;v->x | v->x
    MOVQ      MM4, QWORD PTR [EAX+M10]  ;m[1][1] | m[1][0]
    PFMUL     MM3, MM0                  ;v->x*m[0][1] | v->x*m[0][0]
    PUNPCKHDQ MM2, MM2                  ;v->y | v->y
    PFMUL     MM4, MM2                  ;v->y*m[1][1] | v->y*m[1][0]
    MOVQ      MM5, QWORD PTR [EAX+M02]  ;m[0][3] | m[0][2]
    MOVQ      MM7, QWORD PTR [EAX+M12]  ;m[1][3] | m[1][2]
    MOVQ      MM6, MM1                  ;v->w | v->z
    PFMUL     MM5, MM0                  ;v->x*m[0][3] | v->x*m[0][2]
    MOVQ      MM0, QWORD PTR [EAX+M20]  ;m[2][1] | m[2][0]
    PUNPCKLDQ MM1, MM1                  ;v->z | v->z
    PFMUL     MM7, MM2                  ;v->y*m[1][3] | v->y*m[1][2]
    MOVQ      MM2, QWORD PTR [EAX+M22]  ;m[2][3] | m[2][2]
    PFMUL     MM0, MM1                  ;v->z*m[2][1] | v->z*m[2][0]
    PFADD     MM3, MM4                  ;v->x*m[0][1]+v->y*m[1][1] |
                                        ; v->x*m[0][0]+v->y*m[1][0]
    MOVQ      MM4, QWORD PTR [EAX+M30]  ;m[3][1] | m[3][0]
    PFMUL     MM2, MM1                  ;v->z*m[2][3] | v->z*m[2][2]
    PFADD     MM5, MM7                  ;v->x*m[0][3]+v->y*m[1][3] |
                                        ; v->x*m[0][2]+v->y*m[1][2]
    MOVQ      MM1, QWORD PTR [EAX+M32]  ;m[3][3] | m[3][2]
    PUNPCKHDQ MM6, MM6                  ;v->w | v->w
    PFADD     MM3, MM0                  ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1] |
                                        ; v->x*m[0][0]+v->y*m[1][0]+v->z*m[2][0]
    PFMUL     MM4, MM6                  ;v->w*m[3][1] | v->w*m[3][0]
    PFMUL     MM1, MM6                  ;v->w*m[3][3] | v->w*m[3][2]
    PFADD     MM5, MM2                  ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3] |
                                        ; v->x*m[0][2]+v->y*m[1][2]+v->z*m[2][2]
    PFADD     MM3, MM4                  ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1]+
                                        ; v->w*m[3][1] | v->x*m[0][0]+v->y*m[1][0]+
                                        ; v->z*m[2][0]+v->w*m[3][0]
    MOVQ      [EBX-16], MM3             ;store res->y | res->x
    PFADD     MM5, MM1                  ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3]+
                                        ; v->w*m[3][3] | v->x*m[0][2]+v->y*m[1][2]+
                                        ; v->z*m[2][2]+v->w*m[3][2]
    MOVQ      [EBX-8], MM5              ;store res->w | res->z
    DEC       ECX                       ;numverts--
    JNZ       $$xform                   ;until numverts == 0
                                        ; (label spelled exactly as defined)
    FEMMS                               ;clear MMX state
}
}
Summary of Contents for Athlon Processor x86
Page 1: ...AMD Athlon™ Processor x86 Code Optimization Guide...
Page 12: ...xii List of Figures AMD Athlon™ Processor x86 Code Optimization 22007E/0—November 1999...
Page 16: ...xvi Revision History AMD Athlon™ Processor x86 Code Optimization 22007E/0—November 1999...
Page 202: ...186 Page Attribute Table (PAT) AMD Athlon™ Processor x86 Code Optimization 22007E/0—November 1999...
Page 252: ...236 VectorPath Instructions AMD Athlon™ Processor x86 Code Optimization 22007E/0—November 1999...
Page 256: ...240 Index AMD Athlon™ Processor x86 Code Optimization 22007E/0—November 1999...