Software Pipelining
6-56
Example 6–33. Assembly Code for Floating-Point Dot Product (Software Pipelined With Smallest Code Size)
; ---------------------------------------------------------------------------
; Prolog: prime the single-packet kernel at LOOP.
;
; Five unconditional branches to LOOP are issued in five consecutive execute
; packets.  The otherwise idle .S1/.L1/.L2 slots beside them load the loop
; counter and clear every register the kernel reads before its own loads and
; multiplies have produced real data, so the early passes accumulate zeros.
;
; With the five branch delay slots of the C6000 B instruction, LOOP is entered
; once by fall-through and once for each of these five branches; every taken
; [A1] branch inside LOOP adds one more pass (6 + 53 passes in total).
;
; The '*' run at the start of a branch comment is the stage marker carried
; over from the original listing.
; ---------------------------------------------------------------------------
        MVK     .S1     53,A1           ; A1 = count of conditional branches in LOOP
||      B       .S2     LOOP            ; 1st branch to LOOP

        ZERO    .L1     A7              ; clear MPYSP .M2X source (side A)
||      ZERO    .L2     B7              ; clear MPYSP .M2X source (side B)
||      B       .S2     LOOP            ;* 2nd branch to LOOP

        ZERO    .L1     A8              ; clear sum0 accumulator
||      ZERO    .L2     B8              ; clear sum1 accumulator
||      B       .S2     LOOP            ;** 3rd branch to LOOP

        ZERO    .L1     A5              ; clear ADDSP .L1 input (product pi)
||      ZERO    .L2     B5              ; clear ADDSP .L2 input (product pi1)
||      B       .S2     LOOP            ;*** 4th branch to LOOP

        ZERO    .L1     A6              ; clear MPYSP .M1X source (side A)
||      ZERO    .L2     B6              ; clear MPYSP .M1X source (side B)
||      B       .S2     LOOP            ;**** 5th branch to LOOP
; ---------------------------------------------------------------------------
; LOOP: software-pipelined kernel, one execute packet per pass.
;
; Every pass issues, in parallel:
;   - two doubleword loads through the pointers in A4 and B4
;     ('*A4++' / '*B4++' = indirect access with post-increment),
;   - two single-precision multiplies of previously loaded words,
;   - two single-precision adds of previously formed products into the
;     running sums A8 (sum0) and B8 (sum1),
;   - the conditional branch back to LOOP and the counter decrement, both
;     predicated on A1 being non-zero.
;
; The '*' run at the start of a comment is the pipeline-stage marker from the
; original listing: the more stars, the further ahead of the un-starred ADDSP
; stage that instruction is working.
;
; NOTE(review): the LDDWs are unconditional, so they keep issuing on the last
; passes even though that data never reaches an ADDSP.  The loop therefore
; reads beyond the last element it accumulates on both arrays -- confirm the
; buffers tolerate that over-read.
; ---------------------------------------------------------------------------
LOOP:
        LDDW    .D1     *A4++,A7:A6     ;********* load ai & ai+1 from memory
||      LDDW    .D2     *B4++,B7:B6     ;********* load bi & bi+1 from memory
||      MPYSP   .M1X    A6,B6,A5        ;**** pi  = ai * bi
||      MPYSP   .M2X    A7,B7,B5        ;**** pi1 = ai+1 * bi+1
||      ADDSP   .L1     A5,A8,A8        ; sum0 += (ai * bi)
||      ADDSP   .L2     B5,B8,B8        ; sum1 += (ai+1 * bi+1)
||[A1]  B       .S2     LOOP            ;***** branch to loop while A1 != 0
||[A1]  SUB     .S1     A1,1,A1         ;****** decrement loop counter
                                        ; Branch occurs here
; ---------------------------------------------------------------------------
; Epilog: fold the partial sums left in A8 (sum0) and B8 (sum1) into a single
; result in A4.
;
; The kernel issued an ADDSP into A8 and into B8 on every pass, so the results
; of its last adds are still arriving one per cycle after the loop exits.
; Four back-to-back cross-path ADDSPs pair each arriving A8 value with the
; matching B8 value, alternating between A0 and B0 as destination.  Two more
; ADDSPs combine those pairs and a last one produces the total.  The NOPs
; only pad for ADDSP delay slots; instruction order here is timing-critical
; and must not be changed.
; ---------------------------------------------------------------------------
        ADDSP   .L1X    A8,B8,A0        ; sum(0) = sum0(0) + sum1(0)
        ADDSP   .L2X    A8,B8,B0        ; sum(1) = sum0(1) + sum1(1)
        ADDSP   .L1X    A8,B8,A0        ; sum(2) = sum0(2) + sum1(2)
        ADDSP   .L2X    A8,B8,B0        ; sum(3) = sum0(3) + sum1(3)
        NOP     1                       ; wait for B0 = sum(1)
        ADDSP   .L1X    A0,B0,A5        ; sum(01) = sum(0) + sum(1)
        NOP     1                       ; wait for next B0 = sum(3)
        ADDSP   .L2X    A0,B0,B5        ; sum(23) = sum(2) + sum(3)
        NOP     3                       ; wait for B5 = sum(23)
        ADDSP   .L1X    A5,B5,A4        ; sum = sum(01) + sum(23)
        NOP     3                       ; delay slots of the final ADDSP; total is then in A4