Automatically Exploiting Implicit Parallelism
by Aart J.C. Bik, Milind Girkar, Paul M. Grey, and Xinmin Tian


Example 1:
(a)
S1:   a = 100;   
S2:   b = c - 10;

(b)
S3:   a = 100;
S4:   b = a - 10;

(c)
    for (i = 1; i < 100; i++) {
S5:   a[i] = a[i-1] * 20;
    }


Example 2:

(a)
double a[100], b[100], c[100];
 ...                             
  for (i = 0; i < 100; i++) {   
S6:   a[i] = b[i] * c[i];        
  }                             

(b)
    xor    ecx, ecx       ; reset loop counter
L:  movapd xmm0, _b[ecx]  ; load     2 DP elements from b into xmm0 register
    mulpd  xmm0, _c[ecx]  ; multiply 2 DP elements from c into xmm0 register
    movapd _a[ecx], xmm0  ; store    2 DP elements from xmm0 register into a 
    add    ecx, 16        ;
    cmp    ecx, 800       ;
    jl     L              ; looping logic (iterates 50 times)


(c)
      for (i = 0; i < 8; i++) {   
S7:      a[i+2] = a[i] * b[i];        
      }               

(d)
      float x[85], y[85];
      ...
      for (i = 1; i < 85; i++) {   
S8:      x[i] = x[i-1] + y[i];        
      }                           

(e)
      for (i = 1; i < 85; i++) {   
S9:      x[i-1] = x[i] + y[i];        
      }                             


Example 3:

(a)
     xor     eax, eax         ; reset loop counter
 L:  movups  xmm0, _x[eax+4]  ; load 4 SP elements from x into xmm0 register
     movups  xmm1, _y[eax+4]  ; load 4 SP elements from y into xmm0 register
     addps   xmm0, xmm1       ; add   4 SP elements 
     movaps  _x[eax], xmm1    ; store 4 SP elements from xmm0 register into x
     add     eax, 16
     cmp     eax, 336
     jl      L                ; looping logic (iterates 22 times)


(b)
     for (i = 1; i < N; i++) {              for (i = 1; i < N; i++) {   
S10:    a[i] = b[i-1];                S11:    b[i] = c[i] * 2.0;        
S11:    b[i] = c[i] * 2.0;     <->    S10:    a[i] = b[i-1];    
     }                                     }


(c) 
     short u[N], v[N]; int w;
     ...
     for (w = 0, i = 0; i < N; i++) {
S12:     w = w + u[i] * v[i];
    }


(d)
     xor       eax, eax       ; reset loop counter
     pxor      xmm0, xmm0     ; initialize accumulator xmm0 to | 0  0  0  0 |

L:   movdqa    xmm1, _x[eax]  ; load 8 shorts from x into xmm1 and
     pmaddwd   xmm1, _y[eax]  ;  and multiply/add with 8 shorts from y
     paddd     xmm0, xmm1     ;   and accumulate 4 resulting integers into
     add       eax, 16        ;    the 4 partial sums in accumulator
     cmp       eax, 2*N       ;
     jl        L              ; looping logic (iterates N/8 times)


Example 4:

(a)
      double x[N], y[n], a[N][N];
      ...          
      for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
  S:      x[i] = x[i] + a[i][j] * y[j];
        }
      }          

(b)
_fork_call(_matvec_par_loop0, n);

(c)
void _matvec_par_loop0(int tid, int n) {
  int _lower  = 0;
  int _upper  = n - 1;  
  int _stride = 1;
  _static_init(tid, &_lower, &_upper, &_stride);
  for (par_i = _lower; par_i <= _upper; par_i += _stride) {
     par_xx = 0.0;
     for (par_j = 0; par_j < n; par_j++) {
       par_xx = par_xx + a[par_i][par_j] * y[par_j];
     }
     x[par_i] = par_xx;
  }
  static_fini(tid);  
}






3

