Accelerating Compute Intensive Functions Using C
by Joe Hanson



Listing One

/*
 * fir - direct-form FIR filter over 16-bit samples.
 *
 *   X  points at the sample that lines up with output 0.  Output n is
 *      built from X[n], X[n-1], ..., X[n-T+1], so the caller must provide
 *      T-1 valid history samples in front of X.
 *   H  T coefficients; H[0] multiplies the newest sample.
 *   Y  receives N outputs; each is the int accumulator shifted right by 14
 *      and then narrowed to short.
 *   N  number of outputs to produce.
 *   T  number of taps.  With T <= 0 every output is 0 and neither X nor H
 *      is read.
 */
void fir(short *X, short *H, short *Y, int N, int T)
{
     int n, t, acc;

     /* Filter Input */
     for (n = 0; n < N; n++) {
          /* Start from zero so an empty filter never touches X or H. */
          acc = 0;
          for (t = 0; t < T; t++) {
               /* Indexing (rather than walking a pointer downwards) never
                  forms an address below the oldest history sample. */
               acc += X[n - t] * H[t];
          }
          Y[n] = acc >> 14;
     }
}


Listing Two

#include <stretch.h>
/* Running sum kept between firFunc invocations: FIR_MUL overwrites it,
   FIR_MAC adds to it. */
static se_sint<32> acc;

/*
 * Performs 8 parallel MAC
 *
 * One definition implements both instructions named in the parameter list:
 *   FIR_MUL - start a new sum      (acc  = sum)
 *   FIR_MAC - continue the sum     (acc += sum)
 *
 *   X, H  wide-register operands.  The loop takes eight 16-bit fields from
 *         each; H is taken from field 0 upwards while X is taken from the
 *         top field downwards, so the two are paired in opposite order.
 *         (The X(hi, lo) / H(hi, lo) form is presumably Stretch C bit-range
 *         selection -- confirm against the Stretch C reference.)
 *   Y     receives the accumulator shifted right by 14.
 */
SE_FUNC void
firFunc(SE_INST FIR_MUL, SE_INST FIR_MAC, WR X, WR H, WR *Y)
{
      se_sint<16> x, h;
      se_sint<32> sum ;
      int i ;
      sum = 0;
      /* i steps over the 128 bits in 16-bit strides: eight products. */
      for(i = 0; i < 128; i += 16) {
          h = H(i + 15, i);
          x = X(127-i, 112-i);
          sum += x * h ;
      }
      /* FIR_MUL restarts the accumulator; otherwise accumulate onto it. */
      acc  = FIR_MUL ? sum : se_sint<32>(sum + acc) ;
      *Y = acc >> 14 ;
}


Listing Three

#include "fir8.h" 

/* Direction flags handed to the stream-init intrinsics. */
#define   ST_DECR   1
#define   ST_INCR   0

/*
 * fir - FIR filter built on the FIR_MUL / FIR_MAC extension instructions.
 *
 * Every output restarts the coefficient stream at H, advances X by one
 * sample and restarts the (decrementing) sample stream there, then folds
 * T/8 sixteen-byte groups into one result: FIR_MUL for the first group,
 * FIR_MAC for each remaining one.  One result is put to the output stream
 * per iteration, and the output stream is flushed once all N are written.
 */
void fir(short *X, short *H, short *Y, short N, short T)
{
    WR samples, coeffs, result;
    int groups = T / 8;      /* 16-byte groups per output */
    int produced;
    int left;

    WRPUTINIT(ST_INCR, Y);

    for (produced = 0; produced < N; ++produced) {
        /* Rewind both input streams for this output. */
        WRGET0INIT(ST_INCR, H);
        ++X;
        WRGET1INIT(ST_DECR, X);

        /* First group starts a fresh sum. */
        WRGET0I(&coeffs, 16);
        WRGET1I(&samples, 16);
        FIR_MUL(samples, coeffs, &result);

        /* Remaining groups accumulate onto it. */
        for (left = groups - 1; left > 0; --left) {
            WRGET0I(&coeffs, 16);
            WRGET1I(&samples, 16);
            FIR_MAC(samples, coeffs, &result);
        }

        WRPUTI(result, 2);
    }

    WRPUTFLUSH0();
    WRPUTFLUSH1();
}


Listing Four

/* Include the Stretch Instruction Specific Header */
#include "fir8.h"
#define   ST_DECR   1     /* Decrement Indicator   */
#define   ST_INCR   0                              
/* Increment Indicator */
/* define macro for the FIR ISEF instruction invocations */
#define             FIR(H, X, h, x, t8, y)             \
{                                                      \
     int t8m1 = (t8)-1;                                \
     WRGET0INIT(ST_INCR, (H)) ;                        \
     (X)++ ;                                           \
     WRGET1INIT(ST_DECR, (X)) ;                        \
     WRGET0I( &(h), 8 * sizeof(short) );               \
     WRGET1I( &(x), 8 * sizeof(short) );               \
     FIR_MUL( (x), (h), &(y) );                        \
                                                       \
     for (t = 1; t < (t8m1); t++)                      \
     {                                                 \
          WRGET0I( &(h), 16 );                         \
          WRGET1I( &(x), 16 );                         \
          FIR_MAC( (x), (h), &(y) );                   \
     }                                                 \
     WRGET0I( &(h), 16 );                              \
     WRGET1I( &(x), 16 );                              \
     FIR_MAC( (x), (h), &(y) );                        \
}
/*
 * - FIR using 8 multipliers in ISEF
 * - Loop optimized
 *
 * Outputs are produced in pairs, alternating between y1 and y2, so that the
 * put of one result is issued after the computation of the next one has
 * started.  One output is computed before the loop and one after it; the
 * loop itself handles (N/2)-1 pairs.
 *
 *   X, H  sample and coefficient arrays consumed by the FIR macro
 *         (X is advanced by one sample per output).
 *   Y     output array, written through the put stream.
 *   N     number of outputs; expected to be even and at least 2, because
 *         the prologue/epilogue always emit two results.
 *   T     number of taps; T/8 groups are processed per output.
 */
void fir(short *X, short *H, short *Y, short N, short T)
{
     int n, t, t8 ;                     /* t is the counter used inside FIR */
     int pairs ;
     WR x, h, y1, y2;
     t8 = T/8 ;
     pairs = (N>>1)-1 ;
     WRPUTINIT(ST_INCR, Y) ;            /* init output stream */
     FIR (H, X, h, x, t8, y1) ;         /* first output => y1 */
     /* loop ((N/2)-1) times; a pre-tested loop so that N == 2 runs it zero
        times instead of once (which would write two outputs too many) */
     for (n = 0; n < pairs; n++)
     {
          FIR (H, X, h, x, t8, y2) ;    /* next output => y2 */
          WRPUTI(y1, 2) ;               /* put (y1) result */
          FIR (H, X, h, x, t8, y1) ;    /* next output => y1 */
          WRPUTI(y2, 2) ;               /* put (y2) result */
     }
     FIR (H, X, h, x, t8, y2) ;         /* last output => y2 */
     WRPUTI(y1, 2) ;                    /* put (y1) result */
     WRPUTI(y2, 2) ;                    /* put (y2) result */
     WRPUTFLUSH0() ;                    /* flush output stream */
     WRPUTFLUSH1() ;                    /* flush output stream */
}

Listing Five

/* Include the Stretch Instruction Specific Header */
#include "fir8.h"

#define             ST_DECR             1         /* Decrement Indicator */
#define             ST_INCR             0         /* Increment Indicator */
#define             FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X)   \
{                                                      \
   WRGET0I( &(h1), 8 * sizeof(short) );                \
   WRGET1I( &(x1), 16 );                               \
   X++ ;                                               \
   WRGET0I( &(h2), 16 );                               \
   WRGET1I( &(x2), 16 );                               \
   FIR_MUL( (x1), (h1), &(y1) );                       \
                                                       \
    WRGET0I( &(h3), 16 );                              \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x2), (h2), &(y1) );                      \
    WRGET0I( &(h4), 16 );                              \
    WRGET1I( &(x2), 16 );                              \
    FIR_MAC( (x1), (h3), &(y1) );                      \
    WRGET0I( &(h5), 16 );                              \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x2), (h4), &(y1) );                      \
    WRGET0I( &(h6), 16 );                              \
    WRGET1I( &(x2), 16 );                              \
    FIR_MAC( (x1), (h5), &(y1) );                      \
    WRGET0I( &(h7), 16 );                              \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x2), (h6), &(y1) );                      \
    WRGET0I( &(h8), 16 );                              \
    WRGET1I( &(x2), 16 );                              \
    FIR_MAC( (x1), (h7), &(y1) );                      \
    WRGET1INIT(ST_DECR, X);                            \
    FIR_MAC( (x2), (h8), &(y1) );                      \
}
#define  FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X)  \
{                                                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MUL( (x1), (h1), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h2), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h3), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h4), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    X++ ;                                              \
    FIR_MAC( (x1), (h5), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    WRGET1I( &(x2), 16 );                              \
    FIR_MAC( (x1), (h6), &(y2) );                      \
    WRGET1I( &(x1), 16 );                              \
    WRGET1INIT0(ST_DECR, X);                           \
    FIR_MAC( (x2), (h7), &(y2) );                      \
    WRGET1INIT1();                                     \
    WRPUTI(y1, 2);                                     \
    FIR_MAC( (x1), (h8), &(y2) );                      \
}
/*
 * FIR2(h1..h8, x1, x2, y1, y2, X) - mirror image of FIR1: compute the next
 * output into y1 and write the previous one (y2) to the output stream.
 *
 * The coefficients are taken from h1..h8 as they stand (no stream-0 fetch);
 * only get-stream 1 is read, eight 16-byte groups in total.  x2 carries the
 * group paired with h7 so that the following fetch into x1 can be issued
 * before that FIR_MAC.
 *
 * Side effects: X is incremented once; get-stream 1 is re-initialised at
 * the new X through the WRGET1INIT0 / WRGET1INIT1 pair; y2 is put to the
 * output stream just before the final FIR_MAC.
 *
 * Note: every continuation backslash must be the last character on its
 * line -- trailing blanks after it break the macro on strict preprocessors.
 */
#define   FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X)  \
{                                                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MUL( (x1), (h1), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h2), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h3), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    FIR_MAC( (x1), (h4), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    X++ ;                                              \
    FIR_MAC( (x1), (h5), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    WRGET1I( &(x2), 16 );                              \
    FIR_MAC( (x1), (h6), &(y1) );                      \
    WRGET1I( &(x1), 16 );                              \
    WRGET1INIT0(ST_DECR, X) ;                          \
    FIR_MAC( (x2), (h7), &(y1) );                      \
    WRGET1INIT1();                                     \
    WRPUTI(y2, 2);                                     \
    FIR_MAC( (x1), (h8), &(y1) );                      \
}
/*
 * - FIR using 8 multipliers in ISEF
 * - Loop optimized / Hand unrolled
 *
 * The coefficient stream is read only once, by the FIR macro, into h1..h8;
 * FIR1 and FIR2 then reuse those registers for every further output.
 * Outputs alternate between y1 and y2 so that each result is written while
 * the next one is being computed.
 *
 *   X, H  sample and coefficient arrays (X is advanced by one sample per
 *         output inside the macros).
 *   Y     output array, written through the put stream.
 *   N     number of outputs; expected to be even and at least 2, because
 *         the code before and after the loop always emits two results.
 *   T     not consulted: each macro issues exactly eight MUL/MAC steps on
 *         16-byte groups, so the filter length is fixed by the unrolling
 *         (apparently 8 x 8 = 64 taps).
 */
void fir(short *X, short *H, short *Y, short N, short T)
{
     int n ;
     WR  h1, h2, h3, h4, h5, h6, h7, h8 ;
     WR  x1, x2;
     WR  y1;
     WR  y2;
     // (these alternative "register" declarations make no difference:)
     //  register WR y1 SE_REG("wra1") ;
     //  register WR y2 SE_REG("wra2") ;
     (void)T ;                           /* tap count is fixed, see above */
     WRPUTINIT(ST_INCR, Y);              /* init output stream */
     WRGET0INIT(ST_INCR, H);             /* init coefficient stream */
     X++ ;
     WRGET1INIT(ST_DECR, X);             /* init input stream */
     /* compute Y[0] in y1 */
     FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X) ;
     /* loop ((N/2)-1) times */
     for (n = 0; n < ((N>>1)-1); n++)
     {
/* FIR1 writes previous output (y1) and computes current output (y2) */
          FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ;
/* FIR2 writes previous output (y2) and computes current output (y1) */
          FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ;
     }
     /* compute Y[N-1] in y2 and write Y[N-2] from y1 */
     FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ;
     WRPUTI(y2, 2) ;                     /* write Y[N-1] */
     WRPUTFLUSH0() ;                     /* flush output stream */
     WRPUTFLUSH1() ;                     /* flush output stream */
}




1


