Optimization Techniques  
by Tim Kientzle

Example 1: 

(a) 

/* Visual C++ version: read the processor's time-stamp counter.
 * The body is the single RDTSC instruction and there is no return
 * statement, so the result is whatever the instruction leaves behind.
 * NOTE(review): this presumably relies on RDTSC placing the count in
 * EDX:EAX and the compiler returning __int64 in that same register
 * pair -- confirm for the target compiler. */
__int64 readTSC() {
  __asm { rdtsc }
}

(b) 

/*
 * GNU C version: read the processor's 64-bit time-stamp counter.
 *
 * RDTSC returns the low 32 bits in EAX and the high 32 bits in EDX; the
 * "=a" and "=d" constraints bind those registers directly, so no extra
 * moves or clobber list are needed.
 *
 * The asm is marked volatile because it has outputs but no inputs:
 * without it the compiler may treat two reads as identical and merge
 * them, or move the read out of the region being timed.
 *
 * Assumes 'long long' is at least 64 bits and 'unsigned' is 32.
 */
long long readTSC(void) {
  unsigned lo, hi;
  __asm__ __volatile__ ("rdtsc"
       : "=a" (lo),
         "=d" (hi));
  /* Combine with shifts rather than a union so that the result does not
     depend on the byte order of the two halves in memory. */
  return (long long)(((unsigned long long)hi << 32) | lo);
}


Example 2: 

int (*Foo_Ptr)(int);
PROCESSOR p;
ProcessorInfo(&p);
if(p->hasSSE2) Foo_Ptr=Foo_SSE2
else if(p->hasSSE) Foo_Ptr=Foo_SSE
else if(p->hasMMX) Foo_Ptr=Foo_MMX
else Foo_Ptr=Foo;

Example 3: 

/* Declarations for the Foo family.  Variants that have no dedicated
   implementation are macros that expand to the name of another variant. */

/* Portable C Implementation */
int Foo(int);
/* Use C version on MMX processors */
#define Foo_MMX Foo
/* Hand-tuned SSE implementation */
int Foo_SSE(int);
/* Use SSE version on SSE2 processors */
#define Foo_SSE2 Foo_SSE

Example 4:

(a) 
/* Branching form: bar becomes a when the low bit of foo is set,
   otherwise b. */
if(foo & 1)  bar = a;
else bar = b;

(b) 
/* Conditional-expression form: bar becomes a when the low bit of foo is
   set, otherwise b. */
bar = (foo & 1) ? a : b;

(c) 
/* Arithmetic form: (foo & 1) is 0 or 1, so the product is either 0 or
   (a - b) and bar becomes b or b + (a - b).
   NOTE(review): assumes a - b does not overflow. */
bar = b + (a - b)*(foo & 1);

(d) 
/* -(foo & 1) is either all zeros or all ones */
/* ANDing (a - b) with that mask yields 0 or (a - b), so bar becomes b or
   b + (a - b).
   NOTE(review): the all-ones case assumes two's-complement negation, and
   a - b must not overflow. */
bar = b + ((a - b) & -(foo & 1));


Listing One
/*
 * Description of the processor, filled in by ProcessorInfo().
 * ProcessorCPUID() stores the raw result of each CPUID call in the four
 * register fields.
 */
typedef struct {
    /* Registers: raw EAX..EDX values from the most recent CPUID call */
    int eax;
    int ebx;
    int ecx;
    int edx;
    /* Basic CPUID data */
    int maxCPUID;            /* EAX returned by CPUID(0) */
    int maxExtendedCPUID;    /* EAX returned by CPUID(0x80000000) */
    int features;            /* EDX returned by CPUID(1) */
    int extendedFeaturesAMD; /* EDX returned by CPUID(0x80000001) */
    int family;
    int model;
    int steppingId;
    /* Identification Strings */
    char manufacturer[16];   /* 12 characters from CPUID(0) plus NUL */
    char brand[50];          /* 48 bytes from CPUID(0x80000002..4) plus NUL */
    /* Specific features: non-zero when the feature is present */
    char hasCPUID;
    char hasTSC;
    char hasCMOV;
    char hasMMX;
    char hasSSE;
    char hasSSE2;
    /* Set by ProcessorInfo() from bit 31 of extendedFeaturesAMD.  Kept
       last so the offsets of the existing members do not change. */
    char has3DNow;
} PROCESSOR;
/*
 * Execute CPUID with 'arg' in EAX and store the resulting EAX, EBX, ECX
 * and EDX values in p->eax, p->ebx, p->ecx and p->edx.
 *
 * Does nothing when p->hasCPUID is clear.  The four register fields are
 * zeroed before the instruction is issued, so they remain zero when
 * neither inline-assembly variant below is compiled in.
 */
void
ProcessorCPUID(int arg, PROCESSOR *p) {
  if(!p->hasCPUID) return;
  p->eax = p->ebx = p->ecx = p->edx = 0;
#if defined(_M_IX86) && defined(_MSC_VER)
  /* Visual C++ syntax */
  __asm {
    mov edi,p
    mov eax,arg
    cpuid
    mov [edi]PROCESSOR.eax,eax
    mov [edi]PROCESSOR.ebx,ebx
    mov [edi]PROCESSOR.ecx,ecx
    mov [edi]PROCESSOR.edx,edx
  }
#elif defined(__GNUC__) && (defined(__i386__) || defined(i386))
  /* GNU C/C++ Syntax */
  /* __i386__ is tested as well as i386 because GCC does not define the
     unprefixed macro in strict ISO modes (-ansi, -std=c99, ...), which
     would silently compile this branch out. */
  /* EBX is the GOT pointer in position-independent i386 code and may
     not appear in a clobber list there, so it is swapped with a
     compiler-chosen scratch register around CPUID instead.  'volatile'
     keeps the compiler from merging or dropping calls. */
  __asm__ __volatile__(
          "xchgl %%ebx,%1;cpuid;xchgl %%ebx,%1"
          : "=a" (p->eax),"=r"(p->ebx),
              "=c"(p->ecx),"=d"(p->edx) /* outputs */
          : "0" (arg) /* input: arg starts out in EAX */);
#endif
}
/*
 * Fill in *p with a description of the processor this code runs on.
 *
 * *p is zeroed first.  If the CPUID instruction is not available (or no
 * inline-assembly variant is compiled in) the function returns with
 * everything still zero.  Otherwise it records:
 *   - maxCPUID and the 12-character manufacturer string from CPUID(0);
 *   - the feature word, hasTSC/hasCMOV/hasMMX/hasSSE/hasSSE2, family,
 *     model and steppingId from CPUID(1);
 *   - maxExtendedCPUID, extendedFeaturesAMD, has3DNow and the brand
 *     string from the 0x8000000x leaves, when those are reported.
 * The scratch fields p->eax..p->edx are zero on return.
 *
 * The family field keeps the extended family (EAX bits 27:20) in bits
 * 4-11 above the base family rather than adding the two together.
 */
void
ProcessorInfo(PROCESSOR *p) {
  int t=0; /* Bit 21 ends up set if EFLAGS bit 21 could be toggled */
  memset(p,0,sizeof(*p));
  /* Step 1: Determine if this processor supports CPUID. */
#if defined(_M_IX86) && defined(_MSC_VER)
  /* Processor supports CPUID if you can set and clear bit 21 of EFLAGS */
  /* This sequence sets bit 21 of t if bit 21 can be both set and cleared */
  __asm {
    pushfd        /* Start with current EFLAGS value */
    pop eax
    or eax,0x00200000  /* Set bit 21 */
    push eax
    popfd         /* Store into EFLAGS */
    pushfd        /* Read back from EFLAGS */
    pop ebx       /* Result into EBX */
    xor eax,0x00200000 /* Clear bit 21 */
    push eax
    popfd         /* Store into EFLAGS */
    pushfd        /* Read back from EFLAGS */
    pop ecx       /* Result into ECX */
    xor ebx,ecx   /* See if bit 21 is different in EBX and ECX */
    mov t,ebx     /* Store result into 't' */
  }
#elif defined(__GNUC__) && (defined(__i386__) || defined(i386))
  /* Processor supports CPUID if you can set and clear bit 21 of EFLAGS */
  /* This sequence sets bit 21 of t if bit 21 can be both set and cleared */
  /* __i386__ is tested as well as i386 because GCC does not define the
     unprefixed macro in strict ISO modes.  The compiler picks both
     working registers, so EBX (the PIC register) is never named; both
     operands are registers, so the temporary stack pushes cannot
     disturb an ESP-relative operand. */
  {
    int scratch;
    __asm__ __volatile__(
          "pushfl;popl %0;"                  /* scratch = EFLAGS */
          "orl $0x00200000,%0;pushl %0;popfl;" /* try to set bit 21 */
          "pushfl;popl %1;"                  /* t = EFLAGS read back */
          "xorl $0x00200000,%0;pushl %0;popfl;" /* try to clear bit 21 */
          "pushfl;popl %0;"                  /* scratch = EFLAGS read back */
          "xorl %0,%1"                       /* t = bits that differ */
          : "=&r" (scratch), "=&r" (t)
          : /* No inputs */
          : "cc");
  }
#endif
  if(t & 0x00200000) p->hasCPUID=1;
  /* No CPUID? Then there's not a lot we can determine. */
  if(!p->hasCPUID) return;
  /* Use CPUID(0) to determine manufacturer and extent of CPUID support */
  ProcessorCPUID(0,p);
  p->maxCPUID = p->eax; /* Maximum CPUID argument */
  /* The manufacturer string is stored in EBX, EDX, ECX order.  memcpy
     is used instead of writing through an int pointer so the char array
     is never accessed as a different type. */
  memcpy(p->manufacturer,     &p->ebx, 4);
  memcpy(p->manufacturer + 4, &p->edx, 4);
  memcpy(p->manufacturer + 8, &p->ecx, 4);
  p->manufacturer[12] = 0;
  if(p->maxCPUID < 1) return;
  /* Identify standard features */
  ProcessorCPUID(1,p);
  p->features = p->edx;
  if(p->features & (1<<4)) p->hasTSC = 1;
  if(p->features & (1<<15)) p->hasCMOV = 1;
  if(p->features & (1<<23)) p->hasMMX = 1;
  if(p->features & (1<<25)) p->hasSSE = 1;
  if(p->features & (1<<26)) p->hasSSE2 = 1;
  {
    int baseFamily = (p->eax >> 8) & 15;  /* EAX bits 11:8 */
    p->family = baseFamily;
    if(baseFamily == 0x0F) p->family |= (p->eax >> 16) & 0xFF0;
    p->model = (p->eax >> 4) & 0x0F;      /* EAX bits 7:4 */
    /* The extended model lives in EAX bits 19:16 and is meaningful when
       the base family is 6 or 15; shifting by 12 moves it to bits 7:4
       of the result, directly above the base model. */
    if(baseFamily == 0x06 || baseFamily == 0x0F)
      p->model |= (p->eax >> 12) & 0xF0;
  }
  p->steppingId = p->eax & 0x0F;
  /* De facto standard: AMD 3DNow */
  ProcessorCPUID((int)0x80000000U,p);
  p->maxExtendedCPUID = p->eax;
  /* Extended leaf numbers have bit 31 set, so compare them as unsigned */
  if((unsigned)p->maxExtendedCPUID >= 0x80000001U) {
    ProcessorCPUID((int)0x80000001U,p);
    p->extendedFeaturesAMD = p->edx;
    /* 1U keeps the shift into bit 31 well defined */
    if(p->extendedFeaturesAMD & (1U<<31)) p->has3DNow = 1;
  }
  if((unsigned)p->maxExtendedCPUID >= 0x80000004U) {
    /* Three leaves supply 16 bytes each, in EAX, EBX, ECX, EDX order */
    unsigned leaf;
    char *dst = p->brand;
    for(leaf = 0x80000002U; leaf <= 0x80000004U; leaf++, dst += 16) {
      ProcessorCPUID((int)leaf,p);
      memcpy(dst,      &p->eax, 4);
      memcpy(dst + 4,  &p->ebx, 4);
      memcpy(dst + 8,  &p->ecx, 4);
      memcpy(dst + 12, &p->edx, 4);
    }
    p->brand[48] = 0; /* Already zero from the memset; made explicit */
  }
  if(strcmp(p->manufacturer,"GenuineIntel")==0) {
    /* Intel-specific feature recognition */
  }
  if(strcmp(p->manufacturer,"AuthenticAMD")==0) {
    /* Check extended features for AMD-specific extensions */
    /* if(p->extendedFeaturesAMD & (1<<22)) p->hasAMDMMX = 1;*/
    /* if(p->extendedFeaturesAMD & (1<<31)) p->has3DNow = 1; */
    /* if(p->extendedFeaturesAMD & (1<<30)) p->has3DNowExtensions = 1; */
  }
  /* Clear out register fields before returning */
  p->eax = p->ebx = p->ecx = p->edx = 0;
}


Listing Two

/*
 * Zero a block of memory starting at 'work' using 8-byte MMX stores.
 *
 * Each pass of the loop issues 32 stores of the zeroed MM0 register:
 * 16 through ESI covering offsets 0..127 and 16 through EDI covering
 * the following 128 bytes, i.e. 256 bytes per pass.  The loop runs 15
 * times, so 15 * 256 = 3840 bytes are cleared in total.
 * NOTE(review): 3840 rather than 4096 -- confirm against the size of
 * the caller's work area.
 *
 * The pointer updates and the counter decrement are placed between the
 * stores rather than grouped at the end of the loop, presumably for
 * instruction scheduling.  EMMS is executed before returning.
 */
void ClearWorkArea_MMX(char *work) {
  _asm {
    mov   esi,work
    mov   edi,esi
    sub   edi,128       /* EDI starts 128 bytes behind ESI */
    pxor  mm0,mm0       /* MM0 = 0, the value stored everywhere below */
    mov   cl,15         /* Loop counter: 15 passes */
loopTop:
    movq  qword ptr [esi],mm0
    movq  qword ptr [esi+8],mm0
    movq  qword ptr [esi+16],mm0
    movq  qword ptr [esi+24],mm0
    movq  qword ptr [esi+32],mm0
    movq  qword ptr [esi+40],mm0
    movq  qword ptr [esi+48],mm0
    movq  qword ptr [esi+56],mm0
    movq  qword ptr [esi+64],mm0
    movq  qword ptr [esi+72],mm0
    movq  qword ptr [esi+80],mm0
    movq  qword ptr [esi+88],mm0
    add   edi,256       /* EDI now points 128 bytes past ESI */
    movq  qword ptr [esi+96],mm0
    movq  qword ptr [esi+104],mm0
    movq  qword ptr [esi+112],mm0
    movq  qword ptr [esi+120],mm0
    movq  qword ptr [edi],mm0
    movq  qword ptr [edi+8],mm0
    movq  qword ptr [edi+16],mm0
    movq  qword ptr [edi+24],mm0
    movq  qword ptr [edi+32],mm0
    movq  qword ptr [edi+40],mm0
    movq  qword ptr [edi+48],mm0
    movq  qword ptr [edi+56],mm0
    movq  qword ptr [edi+64],mm0
    movq  qword ptr [edi+72],mm0
    add   esi,256       /* Advance ESI to the next 256-byte chunk */
    movq  qword ptr [edi+80],mm0
    movq  qword ptr [edi+88],mm0
    movq  qword ptr [edi+96],mm0
    dec   cl            /* Sets the zero flag tested by JNE below */
    movq  qword ptr [edi+104],mm0
    movq  qword ptr [edi+112],mm0
    movq  qword ptr [edi+120],mm0
    jne   loopTop       /* The MOVQ stores leave the flags from DEC intact */
    emms
  }
}







5


