Software Pipelining the Outer Loop
6-133
Optimizing Assembly Code via Linear Assembly
Example 6–71. Final Assembly Code for FIR Filter With Redundant Load Elimination and
No Memory Hits With Outer Loop Software-Pipelined
; ----------------------------------------------------------------------
; Outer-loop setup: counters, reset constants and working pointers.
;
; Register roles established here (taken from the per-line annotations):
;   A2  - outer loop counter, starts at 50
;   A3  - constant 74, used to reset the x pointer in the outer loop
;   B10 - constant 72, used to reset the h pointer in the outer loop
;   B11 - A6 + 2, pointer to y[1] (old B11 is pushed first)
;   B1  - A4 + 4, pointer to x[j+2]
;   A8  - B4 + 2, pointer to h[1]
;   B2  - inner loop counter, starts at 8
;   B8  - x0 = x[j]
; NOTE(review): A4 / B4 / A6 are presumably the incoming x, h and y
; pointers and B15 the stack pointer -- confirm against the caller.
;
; Lines joined by || form one execute packet and issue together.
; ----------------------------------------------------------------------
        MVK     .S1     50,A2           ; set up outer loop counter

        STW     .D2     B11,*B15--      ; push B11; the ADD below overwrites it
||      MVK     .S1     74,A3           ; used to reset x ptr in outer loop
||      MVK     .S2     72,B10          ; used to reset h ptr in outer loop
||      ADD     .L2X    A6,2,B11        ; set up pointer to y[1]

        LDH     .D1     *A4++,B8        ; x0 = x[j]
||      ADD     .L2X    A4,4,B1         ; set up pointer to x[j+2]
||      ADD     .L1X    B4,2,A8         ; set up pointer to h[1]
||      MVK     .S2     8,B2            ; set up inner loop counter
|| [A2] SUB     .S1     A2,1,A2         ; decrement outer loop counter
; ----------------------------------------------------------------------
; Priming stage: clear both accumulators and issue the first x / h
; halfword loads before OUTLOOP is entered.
;
; Every load post-increments its pointer by [2] elements.
;   A9 / B9 - sum0 / sum1, cleared here
;   A0, B0  - x1, x2
;   A1, B6  - h0, h1
;   A5, B5  - x3, next x0
; ----------------------------------------------------------------------
        ZERO    .L1     A9              ; sum0 = 0
||      ZERO    .L2     B9              ; sum1 = 0
||      LDH     .D1     *A4++[2],A0     ; x1 = x[j+i+1]
||      LDH     .D2     *B1++[2],B0     ; x2 = x[j+i+2]

        LDH     .D2     *B4++[2],A1     ; h0 = h[i]
||      LDH     .D1     *A8++[2],B6     ; h1 = h[i+1]

        LDH     .D2     *B1++[2],B5     ; x0 = x[j+i+4]
||      LDH     .D1     *A4++[2],A5     ; x3 = x[j+i+3]
; ----------------------------------------------------------------------
; OUTLOOP: outer-loop entry and the start of the inner-loop prolog.
;
; Keeps issuing h / x halfword loads, starts the first multiplies
; (x0*h0, x1*h1, x2*h1, x1*h0) and issues the first conditional branch
; to LOOP (label not visible in this part of the listing).
;
; B2 is the inner loop counter. Each copy of its predicated decrement
; steps by exactly 1, so every replicated copy of the instruction counts
; one inner iteration.
;
; The ";*" marks are kept from the original listing; they appear to tag
; instructions that belong to the following loop iteration.
;
; NOTE(review): several registers (B8, B6, A0, A1) are destinations of
; loads still in flight while being read as multiply operands. This
; appears to rely on load delay slots, so do not reorder the packets.
; ----------------------------------------------------------------------
OUTLOOP:
        LDH     .D2     *B4++[2],A7     ; h2 = h[i+2]
||      LDH     .D1     *A8++[2],B8     ; h3 = h[i+3]
|| [B2] SUB     .S2     B2,1,B2         ; decrement loop counter

        LDH     .D2     *B1++[2],B0     ;* x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ;* x1 = x[j+i+1]

        LDH     .D1     *A8++[2],B6     ;* h1 = h[i+1]
||      LDH     .D2     *B4++[2],A1     ;* h0 = h[i]

        MPY     .M1X    B8,A1,A0        ; x0 * h0
||      MPY     .M2X    A0,B6,B6        ; x1 * h1
||      LDH     .D1     *A4++[2],A5     ;* x3 = x[j+i+3]
||      LDH     .D2     *B1++[2],B5     ;* x0 = x[j+i+4]

   [B2] B       .S1     LOOP            ; branch to loop
||      MPY     .M2     B0,B6,B7        ; x2 * h1
||      MPY     .M1     A0,A1,A1        ; x1 * h0
||      LDH     .D2     *B4++[2],A7     ;* h2 = h[i+2]
||      LDH     .D1     *A8++[2],B8     ;* h3 = h[i+3]
|| [B2] SUB     .S2     B2,1,B2         ;* decrement loop counter