Optimizing IA-64 Performance
by Sverre Jarp	


Listing One
/*
 * Linear search for `key` in the first N elements of `array`.
 *
 * Returns the index of the first matching element, or -1 when no element
 * matches (this includes N <= 0, where the loop body never executes).
 */
int find(int key, int N, int* array)
{
   int i;
   for (i=0; i<N; ++i)
   {
      if (key == array[i]) return i;  // Found
   }
   return -1; // Not found
}

Listing Two
 // Skeleton of the assembly procedure. The file must be run through the C
 // preprocessor: every occurrence of Name below expands to `find`.
 #define Name find
 // Emit the following code into the text section.
 .text
 // Export the symbol and mark it as a function for the linker.
 .global Name
 .type Name,@function 
 // .proc/.endp bracket the procedure body.
 .proc   Name
Name:
//   Initialization code comes here
//   Loop comes here
//   Exit code comes here
 .endp

Listing Three
// Initialization and exit code for the counted-loop version of find.
// Register arguments: in0 = key, in1 = N, in2 = array pointer.
// Result: ret0 = index of the match, or -1.
// Scratch registers used to hold the saved ar.pfs and ar.lc values.
#define s_pfssave  r9
#define s_lcsave    r10
// Frame: 3 inputs, no locals, no outputs, no rotating registers.
// The previous ar.pfs value is returned in s_pfssave.
            alloc   s_pfssave=ar.pfs,3,0,0,0
// ar.lc is overwritten below, so keep the caller's value for the exit path.
            mov    s_lcsave=ar.lc

// p6 = (N <= 0); r0 always reads as zero. Skip the loop for an empty array.
// NOTE(review): this is a 64-bit compare on an argument declared `int`;
// confirm the upper 32 bits of in1 are well defined on entry.
            cmp.le.unc p6,p0=in1,r0
(p6)        br.cond.dpnt.few  notfound
;;
            add     in1=-1,in1   // loop count - 1
;;
// ret0 starts at -1 because the loop body increments it before each compare.
            mov     ret0=-1      // initial index count
            mov     ar.lc=in1    // initial loop count
;;
// The loop body (Listing Four) is inserted here; it branches to `found` on a
// match and falls through to `notfound` once the loop count is exhausted.
notfound:  mov     ret0=-1   ;;  // Not found
// Common exit: put back the caller's ar.lc and return.
found:     mov    ar.lc=s_lcsave
             br.ret.sptk.many rp
 .endp

Listing Four
// Counted-loop body: examines one 32-bit array element per iteration.
// On entry: in0 = key, in2 = address of the next element,
//           ret0 = index of the previous element (-1 before the first),
//           ar.lc = number of iterations still to run after this one.
// Scratch register receiving the element just loaded.
#define s_temp      r31
cntloop:
// Load 4 bytes and post-increment the pointer to the next element.
            ld4        s_temp=[in2],4
            add       ret0=1,ret0       // tracking of index
;;
// Compare only the low 32 bits: the elements and the key are C ints.
           cmp4.eq.unc    p6,p0=s_temp,in0
// Match: leave the loop with ret0 holding the index of this element.
// (The completer chain is dot-separated; the original listing had a comma
// after "cond", which is not a valid mnemonic.)
(p6)       br.cond.dptk.few  found
// Otherwise decrement ar.lc and loop again while it was non-zero.
           br.cloop.dptk.few     cntloop
;;

Listing Five
 // Pipeline parameters for the modulo-scheduled loop. The compare consumes a
 // loaded value LL stages after the load was issued.
 #define LL 2  // L1 Load latency
 #define CL 0  // Compare latency
 // Rotating general registers: array[0] is written by the load and the same
 // value is read as array[LL] LL rotations later.
 .rotr   array[LL+1] 
 // Rotating predicates: pc[] are the stage predicates for the load/compare
 // pipeline, qc[] carries the compare result to the exit branch.
 .rotp   pc[LL+1],qc[CL+1]

 // Value loaded into ar.ec by the initialization code. The macro is not
 // parenthesized, so use it only where it stands alone as an operand.
 #define Epilogue   LL+CL+1

Listing Six
// Initialization and exit code for the modulo-scheduled version of find.
// Register arguments: in0 = key, in1 = N, in2 = array pointer.
// Result: ret0 = index of the match, or -1.
//
// Scratch registers used by this version. They are not defined anywhere else
// in these listings, so they are given defaults here; the guards keep any
// definition supplied earlier. r9, r10 and r31 are already taken by
// s_pfssave, s_lcsave and s_temp.
#ifndef s_prsave
#define s_prsave    r11
#endif
#ifndef s_key
#define s_key       r14
#endif
#ifndef s_parray
#define s_parray    r15
#endif
Name:
// Frame: 3 inputs, 5 locals, no outputs, 8 rotating registers.
           alloc   s_pfssave=ar.pfs,3,5,0,8
 .rotr   array[LL+1]
// The predicates and ar.lc are overwritten below; keep the caller's values.
            mov    s_prsave=pr
            mov    s_lcsave=ar.lc
// p6 = (N <= 0); r0 always reads as zero. Skip the loop for an empty array.
            cmp.le.unc p6,p0=in1,r0
(p6)        br.cond.dpnt.few  notfound
// The rotating region starts at r32, i.e. on top of in0-in2, so the key and
// the array pointer are copied to static registers before the loop rotates.
            mov    s_key=in0      // move the key
            mov    s_parray=in2  // move the array ptr
// Branch-prediction hint for the loop-closing branch.
            brp.loop.imp modloop,modloop+16
// Epilogue count: lets the pipeline drain after ar.lc reaches zero.
            mov    ar.ec=Epilogue
            add     in1=-1,in1 // loop count - 1
// Only the first stage predicate is set on entry; later stages switch on as
// the predicates rotate.
            mov    pr.rot=1<<16 // initialise pr16
;;
// ret0 starts at -1 because the loop increments it before each compare.
            mov     ret0=-1       // initialise the index count
            mov     ar.lc=in1     // initialise the loop count
;;
// The loop (Listing Seven, Nine or Ten) is inserted here; it branches to
// `found` on a match and falls through to `notfound` otherwise.
notfound:   mov    ret0=-1   ;;   //Not found
// Common exit: restore the saved application registers and predicates.
found:      mov    ar.lc=s_lcsave
            mov    ar.pfs=s_pfssave
            mov    pr=s_prsave,-1
            br.ret.sptk.many    rp

Listing Seven
// Modulo-scheduled loop body, all in one instruction group. Each pass issues
// the load for one element and the compare for the element loaded LL passes
// earlier.
modloop:
// Stage 0: load 4 bytes and post-increment the pointer to the next element.
(pc[0])      ld4     array[0]=[s_parray],4
// Stage LL: the value loaded LL rotations ago is now visible as array[LL].
(pc[LL])     add     ret0=1,ret0   // easy tracking of index
// Compare only the low 32 bits; .unc clears qc[CL] when pc[LL] is off.
(pc[LL])     cmp4.eq.unc qc[CL],p0=array[LL],s_key
// Match: leave the loop with ret0 holding the index of the matching element.
(qc[CL])     br.cond.dpnt.few  found
// Loop-closing branch: rotates the registers and predicates, counting down
// ar.lc and then ar.ec.
              br.ctop.dptk.few    modloop   
;;

Listing Eight
 // Same pipeline parameters as Listing Five, with a longer load latency: the
 // compare now consumes a value LL = 6 stages after the load was issued.
 #define LL 6  // L2 Load latency
 #define CL 0  // Compare latency
 // Rotating general registers that carry a loaded value from array[0] to
 // array[LL].
 .rotr   array[LL+1] 
 // Rotating predicates: pc[] are the stage predicates, qc[] carries the
 // compare result to the exit branch.
 .rotp   pc[LL+1],qc[CL+1]

 // Value loaded into ar.ec by the initialization code. The macro is not
 // parenthesized, so use it only where it stands alone as an operand.
 #define Epilogue   LL+CL+1

Listing Nine
// Modulo-scheduled loop body with explicit bundling: two bundles that form a
// single instruction group (the only stop is at the end of the second one).
modloop:
// Bundle 1 (.mii): the load in the M slot, the add and compare in I slots.
{ .mii

// Stage 0: load 4 bytes and post-increment the pointer to the next element.
(pc[0])       ld4       array[0]=[s_parray],4
(pc[LL])      add       ret0=1,ret0          // easy tracking
// Stage LL: compare the low 32 bits of the value loaded LL rotations ago.
(pc[LL])      cmp4.eq.unc  qc[CL],p0=array[LL],s_key
}
// Bundle 2 (.mbb): both branches; the M slot has no work and holds a nop.
{ .mbb
              nop.m 0
// Match: leave the loop with ret0 holding the index of the matching element.
(qc[CL])      br.cond.dpnt.few  found
// Loop-closing branch: rotates the registers and predicates.
              br.ctop.dptk.few modloop   
;; }

Listing Ten
// Modulo-scheduled loop body that examines two 32-bit elements per pass by
// loading 8 bytes and comparing both halves in parallel.
// NOTE(review): PCL is not defined in the listings visible here, and the
// .rotr/.rotp declarations shown earlier size array[] and pc[] as LL+1,
// which does not cover index LL+PCL -- confirm the matching definitions.
// NOTE(review): presumably s_key holds the key replicated in both 32-bit
// halves -- the setup for that is not shown; verify.
modloop:
{ .mii
// Stage 0: load 8 bytes (two elements) and advance the pointer by 8.
(pc[0])          ld8        array[0]=[s_parray],8
// Stage LL: parallel 4-byte compare; the result replaces the loaded value.
(pc[LL])         pcmp4.eq   array[LL]=array[LL],s_key
// Stage LL+PCL: two elements are consumed per pass, so the index moves by 2.
// NOTE(review): at `found` ret0 identifies the pair only; the code that
// picks which half matched is not part of this listing.
(pc[LL+PCL])     add        ret0=2,ret0
 }
 { .mbb
// A non-zero parallel-compare result means at least one half matched.
(pc[LL+PCL])     cmp.ne.unc qc[CL],p0=array[LL+PCL],r0
(qc[CL])         br.cond.dpnt.few   found
// Loop-closing branch: rotates the registers and predicates.
                 br.ctop.dptk.few    modloop  
;; 
}





1

