Software Pipelining
6-47
Optimizing Assembly Code via Linear Assembly
Example 6–28. Assembly Code for Fixed-Point Dot Product (Software Pipelined
With No Extraneous Loads) (Continued)
;-----------------------------------------------------------------------
; LOOP -- software-pipelined kernel of the fixed-point dot product.
;
; The whole loop body is ONE execute packet: a leading `||` marks an
; instruction that issues in parallel with the instruction above it.
;
; Register roles (as used by the instructions below):
;   A4 / B4 : pointers into the two input arrays, post-incremented by LDW
;   A2 / B2 : words loaded from memory (each holds two 16-bit elements)
;   A6 / B6 : products from MPY (ai * bi) and MPYH (ai+1 * bi+1)
;   A7 / B7 : running partial sums sum0 and sum1
;   A1      : loop counter; also the predicate on the SUB and the branch
;
; NOTE(review): the leading asterisks in the comments appear to mark how
; many iterations ahead of the ADDs each instruction is working (the
; usual software-pipelining notation) -- confirm against the guide text.
;-----------------------------------------------------------------------
LOOP:
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1
||[A1]  SUB     .S1     A1,1,A1         ;****** decrement loop counter
||[A1]  B       .S2     LOOP            ;***** branch to loop
||      LDW     .D1     *A4++,A2        ;******* ld ai & ai+1 fm memory
||      LDW     .D2     *B4++,B2        ;******* ld bi & bi+1 fm memory
        ; Branch occurs here
;-----------------------------------------------------------------------
; Pipeline epilog.
;
; No loads are issued here (the "no extraneous loads" variant named in
; the example caption): the packets below only multiply and accumulate.
; Packets 1-5 pair two ADDs with an MPY/MPYH; packets 6-7 contain only
; the two ADDs.
;
; The "ADDs" / "MPYs" figures are the margin annotations of the original
; listing, kept here as running counts of epilog packets that contain
; ADDs and MPYs respectively.
;-----------------------------------------------------------------------

; --- epilog packet 1 ---                        margin: ADDs 1, MPYs 1
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1

; --- epilog packet 2 ---                        margin: ADDs 2, MPYs 2
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1

; --- epilog packet 3 ---                        margin: ADDs 3, MPYs 3
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1

; --- epilog packet 4 ---                        margin: ADDs 4, MPYs 4
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1

; --- epilog packet 5 (last packet with multiplies) --- margin: ADDs 5, MPYs 5
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
||      MPY     .M1X    A2,B2,A6        ;** ai * bi
||      MPYH    .M2X    A2,B2,B6        ;** ai+1 * bi+1

; --- epilog packet 6 (adds only) ---            margin: ADDs 6
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)

; --- epilog packet 7 (adds only) ---            margin: ADDs 7
        ADD     .L1     A6,A7,A7        ; sum0 += (ai * bi)
||      ADD     .L2     B6,B7,B7        ; sum1 += (ai+1 * bi+1)
; Combine the two partial sums into the final dot product.
; The .L1X form reads B7 from the B register file through the cross path;
; the result is written to A4, which the loop used as an input pointer
; (LDW *A4++), so that pointer value is overwritten here.
        ADD     .L1X    A7,B7,A4        ; sum = sum0 + sum1