Code Efficiency & Compiler-Directed Feedback
by Jackie Brenner and Markus Levy

Listing One
/*
 * E_autocor_cn - straightforward (reference) autocorrelation.
 *
 * For every lag i in [0, nr) the products x[j] * x[j - i] are accumulated
 * for j = i .. nx-1, and the accumulated value shifted right by
 * (16 + scale) is stored in r[i].
 *
 *   x     - input samples, read at indices 0 .. nx-1
 *   r     - output correlations, written at indices 0 .. nr-1
 *   nx    - number of input samples
 *   nr    - number of lags (outputs) to produce
 *   scale - additional right shift applied on top of the fixed 16
 *
 * e_s16 and n_int are declared outside this listing (presumably a signed
 * 16-bit type and the native int -- confirm against the project headers).
 * x and r are restrict-qualified, so the caller must not pass overlapping
 * buffers.
 */
void E_autocor_cn
(const e_s16 *restrict x,               /* Input data samples         */
             e_s16         *restrict r, /* Output correlations        */
             e_s16 nx,                  /* Number of input samples    */
             e_s16 nr,                  /* Number of outputs          */
             e_s16 scale                /* Scale factor (right shift) */
)
{
      n_int i, j;
      long sum;

      for (i = 0; i < nr; i++)
      {
          sum = 0;
          /* Pair each sample with the one i positions before it. */
          for (j = i; j < nx; j++)
              sum += x[j] * x[j - i];
          /* The result is narrowed to e_s16 on assignment. */
          r[i] = sum >> (16 + scale);
      }
}

Listing Two
   ; One execute packet: a line beginning with "||" issues in parallel with
   ; the line before it, so all six instructions below run in the same cycle.
   ; "[reg]" executes the instruction only when reg is non-zero, "[!reg]"
   ; only when reg is zero.
   ; NOTE(review): this looks like the single-cycle kernel of the inner
   ; multiply-accumulate loop of Listing One -- confirm against the article.
   [A0]    SUB    .S1    A0,1,A0            ; count A0 down while it is non-zero
|| [!A0]   ADD    .L1    A7,A5:A4,A5:A4     ; once A0 == 0: add product A7 into the long accumulator A5:A4
||         MPY    .M1X   B5,A3,A7           ; 16x16 multiply of the low halves of B5 (via cross path) and A3
|| [B0]    BDEC   .S2    L3,B0              ; branch to L3 and decrement loop counter B0
||         LDH    .D1T1  *A6++,A3           ; load a halfword through A6 into A3, then advance A6
||         LDH    .D2T2  *B4++,B5           ; load a halfword through B4 into B5, then advance B4

Listing Three
;* SOFTWARE PIPELINE INFORMATION
;*    Known Minimum Trip Count           : 1
;*    Known Maximum Trip Count           : 32767 
;*    Known Max Trip Count Factor        : 1
;*    Loop Carried Dependency Bound (^)  : 0
;*    Unpartitioned Resource Bound       : 1
;*    Partitioned Resource Bound (*)     : 1
;*    Resource Partition: 
;*                              A-side     B-side
;*    .L Units                   1*          0
;*    .S Units                   0           1*
;*    .D Units                   1*          1*
;*    .M Units                   1*          0
;*    .X cross paths             1*          0
;*    .T address paths           1*          1*
;*    Long read paths            1*          0
;*    Long write paths           0           0   
;*    Logical ops (.LS)          0           0   (.L or .S unit)
;*    Addition ops (.LSD)        0           0   (.L or .S or .D unit)
;*    Bound (.L .S .D)           1*          1* 
;*    Bound (.L .S .D .LS .LSD)  1*          1* 
;*
;*    Searching for software pipeline schedule at ...
;*       ii = 1 Schedule found with 8 iterations in parallel done


Listing Four
{
   /*
    * Function body only: x, r, nx, nr and scale come from the enclosing
    * function header, which is not part of this listing (presumably the
    * same parameter list as E_autocor_cn in Listing One).
    *
    * Computes four lags (i, i+1, i+2, i+3) per outer iteration. The inner
    * loop starts at j = i + 4, the first index at which all four products
    * x[j] * x[j-i-k], k = 0..3, exist; the terms for j = i+k .. i+3 that it
    * skips are added by the straight-line statements after the loop.
    *
    * As written, r[i+0] .. r[i+3] are always stored and x[i+3] is always
    * read, so nr is expected to be a multiple of 4 and nx large enough to
    * keep i + 3 in range.
    *
    * NOTE(review): the straight-line sums add up to four int products
    * before widening to long, unlike Listing One, which widens after each
    * product -- confirm the input range makes this safe.
    */
   n_int i, j;
   long sum_0, sum_1, sum_2, sum_3;

   for (i = 0; i < nr; i += 4)
   {
      sum_0 = sum_1 = sum_2 = sum_3 = 0;

      /* Main part: every iteration contributes to all four lags. */
      for (j = i + 4; j < nx; j++)
      {
         sum_0 += x[j] * x[j-i-0];
         sum_1 += x[j] * x[j-i-1];
         sum_2 += x[j] * x[j-i-2];
         sum_3 += x[j] * x[j-i-3];
      }

      /* Leading terms that the inner loop's start index left out. */
      sum_0 += x[i+0]*x[0] + x[i+1]*x[1] + x[i+2]*x[2] +  x[i+3]*x[3];
      sum_1 += x[i+1]*x[0] + x[i+2]*x[1] + x[i+3]*x[2];
      sum_2 += x[i+2]*x[0] + x[i+3]*x[1];
      sum_3 += x[i+3]*x[0];

      r[i+0] = sum_0 >> (16 + scale);
      r[i+1] = sum_1 >> (16 + scale);
      r[i+2] = sum_2 >> (16 + scale);
      r[i+3] = sum_3 >> (16 + scale);
   }
}

Listing Five
;* SOFTWARE PIPELINE INFORMATION
;*    Known Minimum Trip Count           : 1
;*    Known Maximum Trip Count           : 32763 
;*    Known Max Trip Count Factor        : 1
;*    Loop Carried Dependency Bound (^)  : 1
;*    Unpartitioned Resource Bound       : 2
;*    Partitioned Resource Bound (*)     : 2
;*    Resource Partition: 
;*                              A-side     B-side
;*    .L Units                   2*          2*
;*    .S Units                   1           0
;*    .D Units                   1           1
;*    .M Units                   2*          2*
;*    .X cross paths             2*          2*
;*    .T address paths           1           2*
;*    Long read paths            2*          2*
;*    Long write paths           0           0   
;*    Logical ops (.LS)          0           0   (.L or .S unit)
;*    Addition ops (.LSD)        0           0   (.L or .S or .D unit)
;*    Bound (.L .S .D)           2*          1 
;*    Bound (.L .S .D .LS .LSD)  2*          1 
;*
;*    Searching for software pipeline schedule at ...
;*       ii = 2 Did not find schedule 
;*       ii = 3 Schedule found with 4 iterations in parallel

Listing Six
       ; Loop fragment. A line beginning with "||" issues in parallel with the
       ; line before it; "[reg]" / "[!reg]" run the instruction only when reg
       ; is non-zero / zero.
       ;
       ; LDNDW fetches four halfwords into A5:A4 and LDH fetches one halfword
       ; into B16. The four multiplies form the products of the low half of
       ; B16 with the low and high halves of A4 and A5, and the four ADDs fold
       ; those products into the long accumulators A9:A8, B7:B6, B5:B4 and
       ; A7:A6 (presumably sum_0 .. sum_3 of Listing Four -- confirm).
       ; NOTE(review): packet grouping is reproduced as printed.
       MPYHL     .M2X       A5,B16,B9
||      MPY      .M1X       B16,A4,A16
||[A0]    BDEC   .S1        L3,A0
||     LDNDW     .D1T1      *A3++(2),A5:A4
  [!A1]   ADD    .L1        A4,A9:A8,A9:A8
||[!A1]   ADD    .L2        B9,B7:B6,B7:B6
  [A1]    SUB    .D1        A1,1,A1
||[!A1]   ADD    .L2        B9,B5:B4,B5:B4
||[!A1]   ADD    .L1        A16,A7:A6,A7:A6
        MPY      .M1X       B16,A5,A4          ; multiplier is the loaded sample in B16, not accumulator half B6
       MPYHL     .M2X       A4,B16,B9
||      LDH      .D2T2      *B8++,B16

Listing Seven
;*  SOFTWARE PIPELINE INFORMATION
;*     Loop Unroll Multiple               : 4x
;*     Known Minimum Trip Count           : 1
;*     Known Maximum Trip Count           : 8190
;*     Known Max Trip Count Factor        : 1
;*     Loop Carried Dependency Bound (^)  : 3
;*     Unpartitioned Resource Bound       : 7
;*     Partitioned Resource Bound (*)     : 7
;*     Resource Partition: 
;*                               A-side     B-side
;*     .L Units                   7*          7*
;*     .S Units                   0           1
;*     .D Units                   3           1
;*     .M Units                   5           7*
;*     .X cross paths             5           4
;*     .T address paths           4           4
;*     Long read paths            7*          7*
;*     Long write paths           0           0   
;*     Logical ops (.LS)          2           0   (.L or .S unit)
;*     Addition ops (.LSD)        4           0   (.L or .S or .D unit)
;*     Bound (.L .S .D)           5           4 
;*     Bound (.L .S .D .LS .LSD)  6           3 
;*
;*     Searching for software pipeline schedule at ...
;*       ii = 7 Did not find schedule 
;*       ii = 8 Schedule found with 3 iterations in parallel

Listing Eight
       ; Loop fragment. A line beginning with "||" issues in parallel with the
       ; line before it; "[reg]" / "[!reg]" run the instruction only when reg
       ; is non-zero / zero. BDEC branches back to L5 and counts B0 down; the
       ; [!A0] ADDs fold products into long (register-pair) accumulators only
       ; once A0 has been counted down to zero by the SUB in the last packet.
       ;
       ; Only tokens that cannot be assembled were corrected (marked below).
       ; NOTE(review): all other operands are reproduced as printed and have
       ; not been checked against the compiler's actual output.
       MPY       .M2X       B29,A3,B27
||     MV        .D1X       B29,A24
||     MPY       .M1        A26,A25,A27
  [!A0]   ADD    .L1        A26,A7:A6,A7:A6
||[!A0]   ADD    .L2        B27,B7:B6,B7:B6
||       DOTP2   .M1        A24,A25,A27
||       PACKLH2 .S1        A3,A3,A26
  [!A0]   ADD    .L2        B27,B17:B16,B17:B16
||      MPYHL    .M1        A25,A24,A27
||[B0]   BDEC    .S2        L5,B0
||[!A0]   ADD    .L1X       B30,A17:A16,A17:A16
||      MPYHL    .M2        B29,B25,B27
  [!A0]   ADD    .L2        B31,B9:B8,B9:B8
||[!A0]   ADD    .L1        A27,A9:A8,A9:A8
||      MPYHL    .M2        B28,B24,B24
||        ADD    .S1        2,A30,A29
||     LDNDW     .D1T2      *A30,B25:B24       ; pair is B25:B24 (there is no register B35)
  [!A0]   ADD    .L1        A27,A23:A22,A23:A22
||[!A0]   ADD    .L2        B27,B19:B18,B19:B18
||        MV     .D1X       B28,A26
||        MPY2   .M2X       B28,B24,B27
||     LDNDW     .D2T2      *B1++(8),B29:B28
  [!A0]   ADD    .L1X       B26,A19:A18,A19:A18
||        ADD    .L2        B24,B21:B20,B21:B20
||      MPYH     .M2        B29,B25,B30
||      DOTP2    .M1        A26,A24,A20        ; mnemonic is DOTP2, as in the packet above
||       ADD     .S1        8,A30,A30
||     LDNDW     .D1T1      *-A30(6),A25:A24   ; pair is A25:A24 (a pair is odd:even, adjacent)
  [A0]   SUB     .D1        A0,1,A0
||[!A0]  ADD     .L1X       B30,A21:A20,A21:A20
||       ADD     .L2        B27,B23:B22,B23:B22
||     MPYHL     .M1        A24,A26,A26
||      MPY2     .M2X       B28,A28,B31:B30

Listing Nine
   ; Loop fragment of six execute packets. A line beginning with "||" issues
   ; in parallel with the line before it; "[reg]" / "[!reg]" run the
   ; instruction only when reg is non-zero / zero. BDEC branches back to L3
   ; and counts B0 down; the [!A0] ADDs fold products into long
   ; (register-pair) accumulators only once A0 has been counted down to zero
   ; by the SUB in the last packet.
   ;
   ; Data is fetched with aligned loads here: LDDW (double word), LDW (word)
   ; and LDH (halfword).
   [B0]   BDEC   .S2        L3,B0
||       DOTP2   .M1        A20,A21,A20
||[!A0]   ADD    .L2X       A25,B17:B16,B17:B16
||[!A0]   ADD    .L1        A26,A5:A4,A5:A4
||        MV     .D1X       B26,A25
  [!A0]   ADD    .L2        B22,B7:B6,B7:B6
||[!A0]   ADD    .L1        A26,A23:A22,A23:A22
||       DOTP2   .M2X       B27,A20,B23
||       DOTP2   .M1        A25,A20,A25
||       PACKLH2 .S1        A20,A20,A26
||       LDDW    .D2T2      *B28++,B27:B26
  [!A0]   ADD    .L1        A24,A9:A8,A9:A8
||[!A0]   ADD    .L2        B23,B5:B4,B5:B4
||      MPYHL    .M2X       B27,A21,B22
||      DOTP2    .M1        A25,A28,A26
||       PACKLH2 .S1        A28,A28,A27
||       LDDW    .D1T1      *++A3,A21:A20
  [!A0]   ADD    .L1X       B22,A7:A6,A7:A6
||[!A0]   ADD    .L2        B25,B21:B20,B21:B20
||      MPYHL    .M1        A25,A20,A24
||       MPY     .M2X       B26,A24,B22
||       LDW     .D1T1      *-A3(4),A28        ; negative-offset load, same form as the LDH below
  [!A0]   ADD    .L1        A20,A19:A18,A19:A18
||        ADD    .L2        B22,B9:B8,B9:B8
||         MV    .S1X       B27,A20
||       MPY2    .M2X       B27,A26,B23:B22
||        LDH    .D1T1      *-A3(6),A24
  [A0]    SUB    .D1        A0,1,A0
||[!A0]   ADD    .L1X       B24,A17:A16,A17:A16
||        ADD    .L2        B23,B19:B18,B19:B18
||        MPYHL  .M1        A28,A20,A26
||        MPY2   .M2X       B26,A27,B25:B24

Listing Ten
;*  SOFTWARE PIPELINE INFORMATION
;*     Loop Unroll Multiple               : 4x
;*     Known Minimum Trip Count           : 1
;*     Known Maximum Trip Count           : 8190
;*     Known Max Trip Count Factor        : 1
;*     Loop Carried Dependency Bound (^)  : 3
;*     Unpartitioned Resource Bound       : 6
;*     Partitioned Resource Bound (*)     : 6
;*     Resource Partition: 
;*                               A-side     B-side
;*     .L Units                   6*          6*
;*     .S Units                   0           1
;*     .D Units                   3           1
;*     .M Units                   5           5
;*     .X cross paths             4           6*
;*     .T address paths           3           1
;*     Long read paths            6*          6*
;*     Long write paths           0           0   
;*     Logical ops (.LS)          2           0   (.L or .S unit)
;*     Addition ops (.LSD)        4           0   (.L or .S or .D unit)
;*     Bound (.L .S .D)           4           4 
;*     Bound (.L .S .D .LS .LSD)  5           3 
;*
;*     Searching for software pipeline schedule at ...
;*       ii = 6 Schedule found with 3 iterations in parallel



1


