Packed-Data Processing on the ’C64x
8-33
’C64x Programming Considerations
Example 8–12. Vectorized form of the Vector Complex Multiply Kernel
/*
 * vec_cx_mpy - element-wise multiply of two vectors of packed 16-bit
 * complex numbers, processing two complex values (four shorts) per
 * loop iteration with double-word loads and stores.
 *
 *   a, b  - input arrays of interleaved 16-bit complex components.
 *   c     - output array, written with the same 4-shorts-per-iteration
 *           stride as the inputs are read.
 *   len   - number of short elements to process.  The loop advances by
 *           four, so a len that is not a multiple of 4 makes the last
 *           iteration touch elements beyond len.
 *   shift - not used by this version; every result is scaled by a fixed
 *           right shift of 16.
 *
 * The arrays are accessed through double pointers, so they presumably
 * must be double-word (8-byte) aligned -- TODO confirm against callers.
 *
 * Relies on the TI C6000 compiler intrinsics _hi, _lo, _ext, _pack2
 * and _itod.
 */
void vec_cx_mpy(const short *restrict a, const short *restrict b,
                short *restrict c, int len, int shift)
{
    int i;
    unsigned a3_a2, a1_a0;   /* Packed 16-bit values      */
    unsigned b3_b2, b1_b0;   /* Packed 16-bit values      */
    short a3, a2, a1, a0;    /* Separate 16-bit elements  */
    short b3, b2, b1, b0;    /* Separate 16-bit elements  */
    short c3, c2, c1, c0;    /* Separate 16-bit results   */
    unsigned c3_c2, c1_c0;   /* Packed 16-bit values      */

    (void) shift;            /* Scaling is fixed at >> 16 below. */

    for (i = 0; i < len; i += 4)
    {
        /* Load two complex numbers from the a[] array.               */
        /* The complex values loaded are represented as 'a3 + a2 * j' */
        /* and 'a1 + a0 * j'.  That is, the real components are a3    */
        /* and a1, and the imaginary components are a2 and a0.        */
        a3_a2 = _hi(*(const double *) &a[i]);
        a1_a0 = _lo(*(const double *) &a[i]);

        /* Load two complex numbers from the b[] array. */
        b3_b2 = _hi(*(const double *) &b[i]);
        b1_b0 = _lo(*(const double *) &b[i]);

        /* Separate the 16-bit coefficients so that the complex      */
        /* multiply may be performed.  The upper halfword comes from */
        /* an arithmetic right shift; the lower halfword is          */
        /* sign-extended with _ext.  This portion needs further      */
        /* optimization.                                             */
        a3 = ((signed) a3_a2) >> 16;
        a2 = _ext(a3_a2, 16, 16);
        a1 = ((signed) a1_a0) >> 16;
        a0 = _ext(a1_a0, 16, 16);

        /* The b elements must be unpacked from the words loaded from */
        /* b[], not from the a[] words.                               */
        b3 = ((signed) b3_b2) >> 16;
        b2 = _ext(b3_b2, 16, 16);
        b1 = ((signed) b1_b0) >> 16;
        b0 = _ext(b1_b0, 16, 16);

        /* Perform the complex multiplies using 16x16 multiplies.     */
        /* c3/c1 hold the cross terms and c2/c0 hold the difference   */
        /* of the like terms.                                         */
        /* NOTE(review): with the input layout described above this   */
        /* places the imaginary part in the upper halfword and the    */
        /* real part in the lower halfword of each output word, the   */
        /* opposite of the inputs -- confirm the intended ordering.   */
        c3 = (b3 * a2 + b2 * a3) >> 16;
        c2 = (b3 * a3 - b2 * a2) >> 16;
        c1 = (b1 * a0 + b0 * a1) >> 16;
        c0 = (b1 * a1 - b0 * a0) >> 16;

        /* Pack the 16-bit results into 32-bit words. */
        c3_c2 = _pack2(c3, c2);
        c1_c0 = _pack2(c1, c0);

        /* Store the results as a single double-word. */
        *(double *) &c[i] = _itod(c3_c2, c1_c0);
    }
}