; Software Pipelining, page 6-54
; Example 6–31. Assembly Code for Floating-Point Dot Product (Software Pipelined
; With Removal of Prolog and Epilog) (Continued)
; Branches issued ahead of the LOOP kernel.  Each of the two execute packets
; below pairs a conditional branch to LOOP with a decrement of the loop counter
; held in A1; the [A1] predicate lets an instruction execute only while A1 is
; nonzero.  The asterisk runs in the comments are carried over from the
; original listing.
        [A1]    B       .S2     LOOP            ;*** branch to loop
     || [A1]    SUB     .S1     A1,1,A1         ;**** decrement loop counter

        [A1]    B       .S2     LOOP            ;**** branch to loop
     || [A1]    SUB     .S1     A1,1,A1         ;***** decrement loop counter
; Loop kernel: a single execute packet (eight instructions in parallel, one per
; functional unit) that is re-entered through the branch to LOOP while A1 is
; nonzero.
;
;   A4 / B4    pointers into the a[] and b[] arrays, post-incremented by one
;              doubleword per pass (two single-precision elements each)
;   A7:A6      a(i+1):a(i) just loaded       B7:B6   b(i+1):b(i) just loaded
;   A5 / B5    products a(i)*b(i) and a(i+1)*b(i+1)
;   A8 / B8    running partial sums sum0 and sum1
;   A1         loop counter, decremented every pass
;
; The load operands are written *A4++ / *B4++: the leading `*` is the
; register-indirect operator that C6000 load syntax requires; without it the
; operand is not a memory reference.
;
; The asterisk runs kept from the original listing appear to count how many
; iterations an instruction runs ahead of the unstarred ADDSPs (loads 9,
; multiplies 4) -- NOTE(review): confirm against the full example.
LOOP:
                LDDW    .D1     *A4++,A7:A6     ;********* load ai & ai+1 from memory
     ||         LDDW    .D2     *B4++,B7:B6     ;********* load bi & bi+1 from memory
     ||         MPYSP   .M1X    A6,B6,A5        ;**** pi  = ai * bi
     ||         MPYSP   .M2X    A7,B7,B5        ;**** pi1 = ai+1 * bi+1
     ||         ADDSP   .L1     A5,A8,A8        ; sum0 += (ai * bi)
     ||         ADDSP   .L2     B5,B8,B8        ; sum1 += (ai+1 * bi+1)
     || [A1]    B       .S2     LOOP            ;***** branch to loop
     || [A1]    SUB     .S1     A1,1,A1         ;****** decrement loop counter
                                                ; Branch occurs here
; Final reduction after the loop falls through.
;
; Per the original annotations, A8 and B8 each present four successive partial
; sums, sum0(0..3) and sum1(0..3), on consecutive cycles as the ADDSPs still in
; flight from the kernel complete.  The first four ADDSPs below pair them up
; one per cycle, alternating between A0 and B0 as destination.  ADDSP has three
; delay slots, so the NOPs space each dependent add until its inputs have
; landed.  The instruction order is timing-critical and must not be rearranged.
; The final sum ends up in A4.
                ADDSP   .L1X    A8,B8,A0        ; sum(0) = sum0(0) + sum1(0)
                ADDSP   .L2X    A8,B8,B0        ; sum(1) = sum0(1) + sum1(1)
                ADDSP   .L1X    A8,B8,A0        ; sum(2) = sum0(2) + sum1(2)
                ADDSP   .L2X    A8,B8,B0        ; sum(3) = sum0(3) + sum1(3)
                NOP                             ; wait for B0
                ADDSP   .L1X    A0,B0,A5        ; sum(01) = sum(0) + sum(1)
                NOP                             ; wait for next B0
                ADDSP   .L2X    A0,B0,B5        ; sum(23) = sum(2) + sum(3)
                NOP     3                       ; let sum(23) reach B5
                ADDSP   .L1X    A5,B5,A4        ; sum = sum(01) + sum(23)
                NOP     3                       ; let the final sum reach A4
                ;