Optimizing for Intel Architecture CPUs
by Victor J. Duvanenko

Listing One
(a)

// Three independent fragments (each one re-declares a, b and c).
// Every "c = a + b" below adds a single pair of scalar operands.

// One add of two unsigned char values.
unsigned char a = 0, b = 1, c;
c = a + b;
// One add of two unsigned short values.
unsigned short a = 0, b = 1, c;
c = a + b;
// One add of two unsigned long values.
unsigned long a = 0, b = 1, c;
c = a + b;

(b)
// Vector counterparts of the scalar fragments in (a): the same
// "c = a + b" statement, written with vector types as operands.
// Each fragment is independent (a, b and c are re-declared each time).
Iu8vec8  a, b, c;   // Iu8vec8 is an array of 8 unsigned char elements
c = a + b;
Iu16vec4  a, b, c;  // Iu16vec4 is an array of 4 unsigned short elements
c = a + b;

Iu32vec2  a, b, c;  // Iu32vec2 is an array of 2 unsigned long elements
c = a + b;

(c)
Iu8vec8 is really equivalent to "unsigned char  tmp[8]".
Iu16vec4 is really equivalent to "unsigned short tmp[4]".
Iu32vec2 is really equivalent to "unsigned long tmp[2]".
(The last equivalence assumes a 32-bit unsigned long, so that all three types occupy 64 bits.)

Listing Two

(a)
// Fill the first `length` bytes of `memoryBuffer` with `value`,
// storing exactly one byte per loop iteration.
// A `length` of 0 leaves the buffer untouched.
void initMemoryByByte( unsigned char * memoryBuffer,
                       unsigned long length, unsigned char value )
{
    unsigned char * cursor    = memoryBuffer;
    unsigned long   remaining = length;

    while( remaining != 0 )
    {
        *cursor++ = value;
        remaining--;
    }
}

(b)

// Fill `length` bytes of `memoryBuffer` with `value`, storing
// sizeof( Iu8vec8 ) bytes at a time through an Iu8vec8 vector.
// Bytes left over when `length` is not a multiple of sizeof( Iu8vec8 )
// are written one at a time, so the whole range is initialized just as
// it is by initMemoryByByte() and initMemoryByMemset().
void initMemoryByMMX( unsigned char * memoryBuffer,
                      unsigned long length, unsigned char value )
{
    Iu8vec8            initValue;
    unsigned long  i;
    const unsigned long  vectorCount = length / sizeof( Iu8vec8 );

    // Replicate the fill byte into every element of the vector.
    for( i = 0; i < sizeof( Iu8vec8 ); i++ )
        initValue[ i ] = value;
    // Bulk of the buffer: one whole-vector store per iteration.
    for( i = 0; i < vectorCount; i++ )
        ((Iu8vec8 *)memoryBuffer )[ i ] = initValue;
    // End the MMX section before running ordinary code again
    // (empty() is the class library's EMMS wrapper).
    empty();
    // Tail: the remaining ( length % sizeof( Iu8vec8 )) bytes.
    for( i = vectorCount * sizeof( Iu8vec8 ); i < length; i++ )
        memoryBuffer[ i ] = value;
}

(c)

// Fill `length` bytes of `memoryBuffer` with `value` by handing the whole
// job to the C library's memset(); its return value is not needed.
void initMemoryByMemset( unsigned char * memoryBuffer,
                         unsigned long length, unsigned char value )
{
    (void) memset( memoryBuffer, value, length );
}


Listing Three
(a)
// Fill `length` bytes of `memoryBuffer` with `value`, storing one
// unsigned short (the fill byte replicated into each of its bytes) per
// iteration.
// When `length` is not a multiple of sizeof( unsigned short ) the leftover
// byte is written individually, so the whole range is initialized.
// The buffer is written through an unsigned short pointer, so
// `memoryBuffer` must be suitably aligned for unsigned short on targets
// that enforce alignment.
void initMemoryByShort( unsigned char * memoryBuffer,
                        unsigned long length, unsigned char value )
{
    const unsigned long  shortCount = length / sizeof( unsigned short );
    unsigned short       initValue  = 0;
    unsigned long        i;

    // Replicate the fill byte into every byte of the store pattern.
    for( i = 0; i < sizeof( unsigned short ); i++ )
        initValue = (unsigned short)(( initValue << 8 ) | value );
    // Bulk of the buffer: one unsigned short store per iteration.
    for( i = 0; i < shortCount; i++ )
        ((unsigned short *)memoryBuffer )[ i ] = initValue;
    // Tail: the remaining ( length % sizeof( unsigned short )) bytes.
    for( i = shortCount * sizeof( unsigned short ); i < length; i++ )
        memoryBuffer[ i ] = value;
}
// Fill `length` bytes of `memoryBuffer` with `value`, storing one
// unsigned long per iteration.
// The store pattern is built by replicating the fill byte into every byte
// of an unsigned long, whatever its size, so the routine is also correct
// where unsigned long is wider than 32 bits.
// Bytes left over when `length` is not a multiple of
// sizeof( unsigned long ) are written individually, so the whole range is
// initialized.
// The buffer is written through an unsigned long pointer, so
// `memoryBuffer` must be suitably aligned for unsigned long on targets
// that enforce alignment.
void initMemoryByLong( unsigned char * memoryBuffer,
                       unsigned long length, unsigned char value )
{
    const unsigned long  longCount = length / sizeof( unsigned long );
    unsigned long        initValue = 0;
    unsigned long        i;

    // Replicate the fill byte into every byte of the store pattern.
    for( i = 0; i < sizeof( unsigned long ); i++ )
        initValue = ( initValue << 8 ) | value;
    // Bulk of the buffer: one unsigned long store per iteration.
    for( i = 0; i < longCount; i++ )
        ((unsigned long *)memoryBuffer )[ i ] = initValue;
    // Tail: the remaining ( length % sizeof( unsigned long )) bytes.
    for( i = longCount * sizeof( unsigned long ); i < length; i++ )
        memoryBuffer[ i ] = value;
}

(b)

// Fill `length` bytes of `memoryBuffer` with `value`, storing
// sizeof( Iu8vec16 ) bytes at a time with _mm_store_si128().
// Bytes left over when `length` is not a multiple of sizeof( Iu8vec16 )
// are written one at a time, so the whole range is initialized just as
// it is by initMemoryByByte() and initMemoryByMemset().
// NOTE: per Intel's intrinsics documentation _mm_store_si128() requires a
// 16-byte aligned address, so `memoryBuffer` must be 16-byte aligned.
void initMemoryBySSE2( unsigned char * memoryBuffer,
                       unsigned long length, unsigned char value )
{
    Iu8vec16      initValue;
    __m128i *      memoryBuffer128bitPtr = (__m128i *) memoryBuffer;
    const unsigned long vectorCount = length / sizeof( Iu8vec16 );
    unsigned long i;

    // Replicate the fill byte into every element of the vector.
    for( i = 0; i < sizeof( Iu8vec16 ); i++ )
        initValue[ i ] = value;
    // Bulk of the buffer: one 128-bit store per iteration.
    for( i = 0; i < vectorCount; i++ )
        _mm_store_si128( memoryBuffer128bitPtr++, initValue );
    // Tail: the remaining ( length % sizeof( Iu8vec16 )) bytes.
    for( i = vectorCount * sizeof( Iu8vec16 ); i < length; i++ )
        memoryBuffer[ i ] = value;
}

(c)

// Fill `length` bytes of `memoryBuffer` with `value`, storing
// sizeof( Iu8vec8 ) bytes at a time with store_nta(), the class library's
// non-temporal (cache-bypassing) 64-bit store.
// Bytes left over when `length` is not a multiple of sizeof( Iu8vec8 )
// are written one at a time with ordinary stores, so the whole range is
// initialized just as it is by initMemoryByByte() and initMemoryByMemset().
void initMemoryByMMXaroundCache( unsigned char * memoryBuffer,
                                 unsigned long length, unsigned char value )
{
    Iu8vec8       initValue;
    __m64 *       memoryBuffer64bitPtr = (__m64 *) memoryBuffer;
    const unsigned long vectorCount = length / sizeof( Iu8vec8 );
    unsigned long i;

    // Replicate the fill byte into every element of the vector.
    for( i = 0; i < sizeof( Iu8vec8 ); i++ )
        initValue[ i ] = value;
    // Bulk of the buffer: one non-temporal 64-bit store per iteration.
    for( i = 0; i < vectorCount; i++ )
        store_nta( memoryBuffer64bitPtr++, initValue );
    // End the MMX section before running ordinary code again
    // (empty() is the class library's EMMS wrapper).
    empty();
    // Tail: the remaining ( length % sizeof( Iu8vec8 )) bytes.
    for( i = vectorCount * sizeof( Iu8vec8 ); i < length; i++ )
        memoryBuffer[ i ] = value;
}

(d)

// Fill `length` bytes of `memoryBuffer` with `value`, storing
// sizeof( Iu8vec16 ) bytes at a time with _mm_stream_si128(), the
// non-temporal (cache-bypassing) 128-bit store.
// Bytes left over when `length` is not a multiple of sizeof( Iu8vec16 )
// are written one at a time with ordinary stores, so the whole range is
// initialized just as it is by initMemoryByByte() and initMemoryByMemset().
// NOTE: per Intel's intrinsics documentation _mm_stream_si128() requires a
// 16-byte aligned address, so `memoryBuffer` must be 16-byte aligned.
void initMemoryBySSE2aroundCache( unsigned char * memoryBuffer,
                                  unsigned long length, unsigned char value )
{
    Iu8vec16      initValue;
    __m128i *      memoryBuffer128bitPtr = (__m128i *) memoryBuffer;
    const unsigned long vectorCount = length / sizeof( Iu8vec16 );
    unsigned long i;

    // Replicate the fill byte into every element of the vector.
    for( i = 0; i < sizeof( Iu8vec16 ); i++ )
        initValue[ i ] = value;
    // Bulk of the buffer: one non-temporal 128-bit store per iteration.
    for( i = 0; i < vectorCount; i++ )
        _mm_stream_si128( memoryBuffer128bitPtr++, initValue );
    // Tail: the remaining ( length % sizeof( Iu8vec16 )) bytes.
    for( i = vectorCount * sizeof( Iu8vec16 ); i < length; i++ )
        memoryBuffer[ i ] = value;
}




3


