Packed Data Processing & 64-bit DSPs

by Shehrzad Qureshi



Listing One



/*
 * memclear -- zero a buffer with one `long` store per loop iteration.
 *
 *   ptr   : start of the buffer. _nassert() tells the compiler it may assume
 *           the address is a multiple of 8.
 *   count : size of the buffer; it is shifted right by 3 to obtain the number
 *           of stores, so a trailing (count % 8) remainder is not cleared.
 *
 * NOTE(review): dividing count by 8 implies each `long` store is expected to
 * cover 8 bytes on the target -- confirm for the toolchain in use.
 * NOTE(review): MUST_ITERATE(32) is a promise to the compiler about the
 * minimum trip count (see the TI compiler guide); callers are presumably
 * expected to pass count >= 256 -- confirm.
 */
void memclear( void * ptr, int count )
{
  long *dst = ptr;
  int stores = count >> 3;   /* number of `long` stores to issue */

  _nassert((int)ptr%8==0);

  #pragma MUST_ITERATE (32);
  while (stores > 0) {
    *dst++ = 0;
    stores--;
  }
}





Listing Two



/*
 * memset -- fill a buffer one byte at a time.
 *
 *   ptr   : start of the buffer.
 *   x     : fill value; converted to char on each store.
 *   count : number of bytes to write. Zero or negative writes nothing.
 *
 * Unlike the standard library routine of the same name, this version
 * returns nothing and takes an int count.
 */
void memset( void *ptr, int x, int count ) {
    char *bytes = ptr;
    int i;

    for (i = 0; i < count; i++) {
        bytes[i] = x;
    }
}





Listing Three



/* Request an 8-byte (double-word) boundary for the 256-byte array declared
 * right after the pragma; the pragma names the symbol it applies to. */
#pragma DATA_ALIGN(double_word_aligned_array, 8)

unsigned char double_word_aligned_array[256];



/* Same request with a 4-byte (word) boundary for a second 256-byte array. */
#pragma DATA_ALIGN(word_aligned_array, 4)

unsigned char word_aligned_array[256];





Listing Four



/* "left" center-of-mass numerator */

for (ii=0; ii<T; ii++)

  sumofprod1 += ii*hist[ii];

/* "right" center-of-mass numerator */

for (ii=T+1; ii<MP; ii++) /* MP=255 for 8-bit images */

  sumofprod2 += ii*hist[ii];





Listing Five



/* Same two numerators, written as dot products of the histogram with a
 * lookup table: pixval = {0, 1, 2, ..., 255} for an 8-bit image, so
 * pixval[ii] stands in for the loop index ii.
 */

/* left: bins 0 .. T-1 */
for (ii=0; ii<T; ii++) {
  sumofprod1 = sumofprod1 + pixval[ii]*hist[ii];
}

/* right: bins T+1 .. MP-1 */
for (ii=T+1; ii<MP; ii++) {
  sumofprod2 = sumofprod2 + pixval[ii]*hist[ii];
}





Listing Six



#pragma MUST_ITERATE(,,4)

_nassert((int)pixval%8 == 0)

_nassert((int)hist%8 == 0)

for (ii=0; ii<T; ii++) /* left */

  sumofprod1 += pixval[ii]*hist[ii];

#pragma MUST_ITERATE(,,4)

_nassert((int)pixval%8 == 0)

_nassert((int)hist%8 == 0)

for (ii=T+1; ii<MP; ii++) /* right */

  sumofprod2 += pixval[ii]*hist[ii];





Listing Seven



/*
 * dotproduct -- sum of hist[k]*pixval[k] for k = lo .. hi-1.
 *
 * The main loop is unrolled by four: one non-aligned double-word read per
 * array fetches four 16-bit elements, which are then multiplied pairwise
 * with the 16x16 unsigned multiply intrinsics and accumulated in four
 * independent partial sums.
 *
 *   lo : index of the first histogram bin to include.
 *   hi : one past the last bin to include.
 *
 * Returns the accumulated sum (0 when hi <= lo).
 *
 * NOTE(review): hist is declared outside this listing; the packed reads
 * assume its elements are 16 bits wide, like pixval -- confirm.
 * NOTE(review): pixval has 256 entries, so hi must not exceed 256.
 */
unsigned long dotproduct(int lo, int hi)
{
  /* 0, 1, 2, ..., 255 */
  static const unsigned short pixval[] = {
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
  };
  unsigned long sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum;
  const int N = hi-lo;      /* number of bins to process */
  int ii=0, jj=lo;          /* ii counts processed bins, jj is the array index */
  double h1_h2_h3_h4, b1_b2_b3_b4;
  unsigned int h1_h2, h3_h4, b1_b2, b3_b4;

  /* unrolled dot-product loop with non-aligned double word reads; it only
   * runs while a complete group of four bins is left, so it never reads or
   * sums bins at or beyond hi
   */
  for (; ii+4<=N; ii+=4, jj+=4)
  {
    h1_h2_h3_h4 = _memd8_const(&hist[jj]);
    h1_h2 = _lo(h1_h2_h3_h4);
    h3_h4 = _hi(h1_h2_h3_h4);

    b1_b2_b3_b4 = _memd8_const(&pixval[jj]);
    b1_b2 = _lo(b1_b2_b3_b4);
    b3_b4 = _hi(b1_b2_b3_b4);

    sum1 += _mpyu(h1_h2, b1_b2);  /* (h1)(b1) */
    sum2 += _mpyhu(h1_h2, b1_b2); /* (h2)(b2) */
    sum3 += _mpyu(h3_h4, b3_b4);  /* (h3)(b3) */
    sum4 += _mpyhu(h3_h4, b3_b4); /* (h4)(b4) */
  }
  sum = sum1 + sum2 + sum3 + sum4;

  /* loop epilogue: the 0-3 bins left over when N is not a multiple of 4.
   * If the bin count were guaranteed to be a multiple of 4, this would
   * not be required.
   */
  for (; ii<N; ii++, jj++)
    sum += hist[jj]*pixval[jj];

  return sum;
}



Listing Eight



/*
 * dotproduct -- sum of smoothed_hist[k]*pixval[k] for k = lo .. hi-1,
 * using the packed two-way dot-product intrinsic.
 *
 * The main loop is unrolled by four: one non-aligned double-word read per
 * array fetches four 16-bit elements, and each _dotp2 call multiplies and
 * adds one pair of packed 16-bit values (see Figure 4), so only two partial
 * sums are needed.
 *
 *   lo : index of the first histogram bin to include.
 *   hi : one past the last bin to include.
 *
 * Returns the accumulated sum (0 when hi <= lo).
 *
 * NOTE(review): smoothed_hist is declared outside this listing; the packed
 * reads assume its elements are 16 bits wide, like pixval -- confirm.
 * NOTE(review): per the TI intrinsic documentation _dotp2 treats the 16-bit
 * halves as signed, so histogram counts presumably must stay below 32768
 * -- confirm.
 * NOTE(review): pixval has 256 entries, so hi must not exceed 256.
 */
unsigned long dotproduct(int lo, int hi)
{
  /* 0, 1, 2, ..., 255 */
  static const unsigned short pixval[] = {
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
  };
  unsigned long sum1 = 0, sum2 = 0, sum;
  const int N = hi-lo;      /* number of bins to process */
  int ii=0, jj=lo;          /* ii counts processed bins, jj is the array index */
  double h1_h2_h3_h4, b1_b2_b3_b4;
  unsigned int h1_h2, h3_h4, b1_b2, b3_b4;

  /* unrolled dot-product loop with non-aligned double word reads; it only
   * runs while a complete group of four bins is left, so it never reads or
   * sums bins at or beyond hi
   */
  for (; ii+4<=N; ii+=4, jj+=4)
  {
    h1_h2_h3_h4 = _memd8_const(&smoothed_hist[jj]);
    h1_h2 = _lo(h1_h2_h3_h4);
    h3_h4 = _hi(h1_h2_h3_h4);

    b1_b2_b3_b4 = _memd8_const(&pixval[jj]);
    b1_b2 = _lo(b1_b2_b3_b4);
    b3_b4 = _hi(b1_b2_b3_b4);

    sum1 += _dotp2(h1_h2, b1_b2); /* (h1)(b1) + (h2)(b2), see Figure 4 */
    sum2 += _dotp2(h3_h4, b3_b4); /* (h3)(b3) + (h4)(b4) */
  }
  sum = sum1 + sum2;

  /* loop epilogue: the 0-3 bins left over when N is not a multiple of 4.
   * If the bin count were guaranteed to be a multiple of 4, this would
   * not be required.
   */
  for (; ii<N; ii++, jj++)
    sum += smoothed_hist[jj]*pixval[jj];

  return sum;
}







3



