Optimizing 3DNow! Real-time Graphics
by Max I. Fomitchev


Listing One
; compute y/w and x/w
; Divides the packed pair (y, x) by w using one reciprocal and one multiply
; instead of two divisions.
; Register comments list the two packed 32-bit values as  high | low.
; `mem` is a placeholder operand: the MOVD reads w, the MOVQ reads the y/x pair.
MOVD        MM0,mem ;   0 | w
PFRCP       MM1,MM0 ; 1/w | 1/w (14-bit approximation)
PUNPCKLDQ   MM0,MM0 ;   w | w   (w duplicated into both halves)
PFRCPIT1    MM0,MM1 ; 1/w | 1/w (intermediate)
MOVQ        MM2,mem ;   y | x   (load does not depend on the refinement in progress)
PFRCPIT2    MM0,MM1 ; 1/w | 1/w (24-bit precision)
PFMUL       MM2,MM0 ; y/w | x/w


Listing Two
; Computes sqrt(w) as w * (1/sqrt(w)): the reciprocal square root is
; approximated, refined to 24-bit precision, then multiplied by w.
MOVD        MM0,mem ;         0 | w
PFRSQRT     MM1,MM0 ; 1/sqrt(w) | 1/sqrt(w)   (15-bit approximation)
MOVQ        MM2,MM1 ; keep the initial approximation for PFRCPIT2 below
PUNPCKLDQ   MM0,MM0 ;         w | w
PFMUL       MM1,MM1 ;       1/w | 1/w         (square of the 15-bit approximation)
PFRSQIT1    MM1,MM0 ; 1/sqrt(w) | 1/sqrt(w)   (intermediate)
PFRCPIT2    MM1,MM2 ; 1/sqrt(w) | 1/sqrt(w)   (24-bit precision)
PFMUL       MM0,MM1 ;   sqrt(w) | sqrt(w)     (w * 1/sqrt(w))

Listing Three
; Pairing example. The first two instructions write different registers
; (MM1 and MM2) and neither reads the other's result. The last two both
; write MM1, and the PFMUL consumes the value the PFADD produces.
PFSUB       MM1,MM3 ; this instruction pair will execute
PFMUL       MM2,mem ; in the same clock cycle
PFADD       MM1,MM2 ; this instruction pair won't execute
PFMUL       MM1,MM3 ; simultaneously due to register dependency

Listing Four
; Latency example: MM1 is produced by the first PFMIN and stored by the
; MOVQ with only one other instruction scheduled in between.
PFMIN       MM1,MM2
PFSUB       MM3,a
MOVQ        mem,MM1 ; 1-cycle stall occurs here because the result
PFMIN       MM2,MM3 ; in MM1 will be ready in the next cycle

Listing Five
; Rescheduled variant of Listing Four: the store of MM1 is moved two
; instructions further away from the PFMIN that produces it.
PFMIN       MM1,MM2
PFSUB       MM3,a
PFMIN       MM2,MM3 ; 1-cycle stall occurs here (presumably waiting on MM3 from the PFSUB just above - confirm)
PFSUB       MM4,b
MOVQ        mem,MM1 ; no stall, the result in MM1 is ready
PFSUB       MM3,a   ; NOTE(review): same operands as the PFSUB above - presumably just filler work; confirm

Listing Six
(a)
for ( j = 0; j < n; j++ )
   beta += a[j]*r[n - j];

(b)
; eax = j, ebx = n-j-1
; Direct packed translation of the scalar loop. Both MOVQs fetch their pair
; in ascending address order, so a[j] ends up multiplied by r[n-j-1] and
; a[j+1] by r[n-j] - these are not the a[j]*r[n-j] terms the scalar loop adds.
MOVQ        MM0,a[EAX*4]    ;          a[j] | a[j+1]
MOVQ        MM1,r[EBX*4]    ;      r[n-j-1] | r[n-j]
PFMUL       MM0,MM1         ; a[j]*r[n-j-1] | a[j+1]*r[n-j]


Listing Seven
(a)
// Calculate r in reverse order (if possible) or swap the array elements
 ...
for ( j = 0; j < n; j++ )       // beta calculation loop
      beta += a[j]*r[c + j];    // c is some constant

(b) 
// Reverse r[0..n] in place by exchanging each r[j] with its mirror r[n - j].
// The loop stops at the midpoint: running j over all of 0..n-1 would exchange
// every interior pair twice (once as (j, n-j), once as (n-j, j)) and put those
// elements back in their original positions.
for ( j = 0; j < (n + 1) / 2; j++ )
{
    float tmp = r[n - j];   // swap(r[j], r[n - j]);
    r[n - j] = r[j];
    r[j] = tmp;
}

(c)
; Same structure as Listing Six (b), except the r pair is fetched with PSWAPD
; so its two halves arrive exchanged and line up with the a pair.
; NOTE(review): EAX/EBX are presumably j and n-j-1 as in Listing Six (b) - confirm.
MOVQ        MM0,a[EAX*4]    ;        a[j] | a[j+1]
PSWAPD      MM1,r[EBX*4]    ;      r[n-j] | r[n-j-1]
PFMUL       MM0,MM1         ; a[j]*r[n-j] | a[j+1]*r[n-j-1]

(d)
; Collapse the two packed products in MM0 into one sum and write it to beta.
PFACC       MM0,MM0         ; a[j]*r[n-j] + a[j+1]*r[n-j-1]
MOVD        beta,MM0        ; beta = that sum


Listing Eight
(a)
; n-th iteration
; Shows what goes wrong when elements are always consumed two at a time:
; the last packed pair also multiplies the element one past index n, and
; that extra product ends up in the sum written to beta.
MOVQ        MM0,a[EAX*4]    ;        a[n] | a[n+1]
MOVQ        MM1,r[EBX*4]    ;      r[c+n] | r[c+n+1]
PFMUL       MM0,MM1         ; a[n]*r[c+n] | a[n+1]*r[c+n+1]
 ...
; after the loop
PFACC       MM0,MM0         ; a[n]*r[c+n] + a[n+1]*r[c+n+1]
MOVD        beta,MM0        ; Oops! Extra a[n+1]*r[c+n+1]

(b)
float a[m], r[m];           // m is some constant
// Initialize arrays a and r
 ...
for ( n = 1; n < m; n++ )   // main loop
{
    beta = 0;
    for ( j = 0; j < n; j++ )
    beta += a[j]*r[c + j];    // c is some constant
    // Do something else
    ...
}

(c)
    ; Compute beta
    ; Sums a[j]*r[c+j] for j = 0..n-1, two products per pass, with a
    ; single-element tail when n is odd.
    ;   ECX = n-1 on entry, decremented by 2 per pass; its sign and zero state
    ;         after the last SUB select between the even and odd exit.
    ;   EAX = byte offset into both arrays (advances 8 bytes = two floats per pass).
    ;   MM2 = two partial sums, combined at SKIPODD.
    ; The loop body always runs once before ECX is tested, so two elements
    ; are consumed even when n = 1; callers must handle n = 1 separately.
    ; NOTE(review): the r operand is r[EAX] with no visible c offset, while the
    ; comments say r[c+j]; presumably `r` stands for the address of r[c] - confirm.
    MOV     ECX,n
    DEC     ECX
    MOV     EAX,0           ; j = 0 (byte offset 0)
    PXOR    MM2,MM2         ; beta = 0
M:
    MOVQ        MM0,a[EAX]  ;        a[j] | a[j+1]
    MOVQ        MM1,r[EAX]  ;      r[c+j] | r[c+j+1]
    PFMUL       MM0,MM1     ; a[j]*r[c+j] | a[j+1]*r[c+j+1]
    PFADD       MM2,MM0     ;       beta0 | beta1
    ADD     EAX,8           ; step to the next pair
    SUB     ECX,2           ; sets the flags tested by both jumps below
    JG      M               ; end of even part
    JNE     SKIPODD         ; counter went negative: n was even, no tail element
    ; odd part processing
    ; (reached only when the counter is exactly 0, i.e. one element remains)
    MOVD        MM0,a[EAX]  ;          a[n-1] | 0
    MOVD        MM1,r[EAX]  ;        r[c+n-1] | 0
    PFMUL       MM0,MM1     ; a[n-1]*r[c+n-1] | 0
    PFADD       MM2,MM0     ;           beta0 | beta1
SKIPODD:
    PFACC       MM2,MM2     ; beta = beta0 + beta1
    MOVD        beta,MM2

(d)
// Unroll for n = 1
// For n = 1 the sum is a single product, so it is computed in plain C
// and the packed loop below only has to handle n >= 2.
beta = a[0]*r[c];
// Do something else for n = 1
  ...
// Process for n = 2, 3, ..., m-1 (the loop condition is n < m)
for ( n = 2; n < m; n++ )       // main loop
{
    // 3DNow! optimized loop for beta calculation
    __asm {
        // (the packed beta loop goes here; the listing leaves this block empty)
    }
    // Do something else
    ...
}

Listing Nine
(a)
// Times a code fragment by reading the time-stamp counter before and after it.
// Each reading is stored as two 32-bit halves: EAX into the low dword and
// EDX into the high dword of the 64-bit variable.
__int64 StartTicks, EndTicks;
_asm {
    // CPUID is issued immediately before each RDTSC - presumably as a
    // serializing instruction so earlier work has completed before the
    // counter is read (TODO confirm).
    // NOTE(review): CPUID overwrites EAX, EBX, ECX and EDX; verify that the
    // measured code does not rely on values held in those registers.
    CPUID
    RDTSC
    MOV     DWORD PTR StartTicks,EAX
    MOV     DWORD PTR StartTicks[4],EDX
    // Code whose performance is measured
   ...
   CPUID
   RDTSC
   MOV     DWORD PTR EndTicks,EAX
   MOV     DWORD PTR EndTicks[4],EDX
   // EndTicks - StartTicks = Running Time in CPU cycles
   // (the difference also includes the second CPUID, which runs before the final RDTSC)
}

(b)
// Repeats the timed region 10 times and keeps the smallest cycle count seen.
// Ticks starts at the largest positive 64-bit value so the first measurement
// always replaces it.
__int64 StartTicks, EndTicks, Ticks = 0x7FFFFFFFFFFFFFFF;
for ( int i = 0; i < 10; i++ )
{
    // Flush the data cache if necessary
    // memset(data, 0, sizeof_data_cache);
    _asm {
         // Start timestamp: low dword from EAX, high dword from EDX
         CPUID
         RDTSC
         MOV     DWORD PTR StartTicks,EAX
         MOV     DWORD PTR StartTicks[4],EDX
         // Code whose performance is measured
        ...
        // End timestamp, read the same way as the start timestamp
        CPUID
        RDTSC
        MOV     DWORD PTR EndTicks,EAX
        MOV     DWORD PTR EndTicks[4],EDX
   }
   // Keep the minimum elapsed count over all runs
   if ( EndTicks - StartTicks < Ticks )
        Ticks = EndTicks - StartTicks;
}






4


