Integer 64-bit Optimizations

by Anatoliy Kuznetsov



Listing One



{

    int   a1[2048];

    int   a2[2048];

    int   a3[2048];



    for (int i = 0; i < 2048; ++i)

    {

         a3[i] = a1[i] ^ a2[i];

    }

}



    

Listing Two



{

    long long  a1[1024];

    long long  a2[1024];

    long long  a3[1024];



    for (int i = 0; i < 1024; ++i)

    {

         a3[i] = a1[i] ^ a2[i];

    }

}

    



Listing Three



{

    int   a1[2048];

    int   a2[2048];

    int   a3[2048];



    long long* pa1 = (long long*) a1;

    long long* pa2 = (long long*) a2;

    long long* pa3 = (long long*) a3;



    for (int i = 0; i < sizeof(a1) / sizeof(long long); ++i)

    {

         pa3[i] = pa1[i] ^ pa2[i];

    }

}





Listing Four



int popcount(long long b)

{

     b = (b & 0x5555555555555555LU) + (b >> 1 & 0x5555555555555555LU);

     b = (b & 0x3333333333333333LU) + (b >> 2 & 0x3333333333333333LU);

     b = b + (b >> 4) & 0x0F0F0F0F0F0F0F0FLU;

     b = b + (b >> 8);

     b = b + (b >> 16);

     b = b + (b >> 32) & 0x0000007F;



     return (int) b;

}







Listing Five





int bitcmp(int w1, int w2)

{

    while (w1 != w2)

    {

        int res = (w1 & 1) - (w2 & 1);

        if (res != 0) 

             return res;

        w1 >>= 1;

        w2 >>= 1;

    }

    return 0;

}





Listing Six



int compare_bit_string(int a1[2048], int a2[2048])

{

    long long* pa1 = (long long*) a1;

    long long* pa2 = (long long*) a2;



    for (int i = 0; i < sizeof(a1) / sizeof(long long); ++i)

    {

         long long w1, w2, diff;

         w1 = a1[i];

         w2 = a2[i];

         diff = w1 ^ w2;

         if (diff)

         {

             return (w1 & diff & -diff) ? 1 : -1;

         }         

    }

    return 0;

}





Listing Seven



#include <bitset>

using namespace std;



const unsigned BSIZE = 1000;

typedef  bitset<BSIZE>  bset;



unsigned int humming_distance(const bset& set1, const bset& set2)

{

    bset xor_result =  set1 ^ set2;

    return xor_result.count();

}







Listing Eight





{

    unsigned int hamming;

    int   a1[2048];

    int   a2[2048];

    long long* pa1;

    long long* pa2;



    pa1 = (long long*) a1; pa2 = (long long*) a2;

    hamming = 0;



    for (int i = 0; i < sizeof(a1) / sizeof(long long); ++i)

    {

         long long b;

         b = pa1[i] ^ pa2[i];



        b = (b & 0x5555555555555555LU) + (b >> 1 & 0x5555555555555555LU);

        b = (b & 0x3333333333333333LU) + (b >> 2 & 0x3333333333333333LU);

        b = b + (b >> 4) & 0x0F0F0F0F0F0F0F0FLU;

        b = b + (b >> 8);

        b = b + (b >> 16);

        b = b + (b >> 32) & 0x0000007F;



        hamming += b;

    }

}





Listing Nine 



{

    double ti;

    int   a1[2048];

    int   a2[2048];

    long long* pa1;

    long long* pa2;



    pa1 = (long long*) a1; pa2 = (long long*) a2;

    ti = 0;



    for (int i = 0; i < sizeof(a1) / sizeof(long long); ++i)

    {

         long long b1, b2, b3;

         b1 = pa1[i] & pa2[i];

         b2 = pa1[i] & ~pa2[i];

         b3 = pa2[i] & ~pa1[i];



         b1 = popcount(b1);

         b2 = popcount(b2);

         b3 = popcount(b3);



    ti += double(b1) / double(0.4 * b2 + 0.5 * b3 + b1);



    }

}



Listing Ten



void bit_xor(unsigned* dst, const unsigned* src, unsigned block_size)

{

      const __m128i* wrd_ptr = (__m128i*)src;

      const __m128i* wrd_end = (__m128i*)(src + block_size);

      __m128i* dst_ptr = (__m128i*)dst;



      do

      {

           __m128i xmm1 = _mm_load_si128(wrd_ptr);

           __m128i xmm2 = _mm_load_si128(dst_ptr);

                

           __m128i xmm1 = _mm_xor_si128(xmm1, xmm2);           

           __mm_store_si128(dst_ptr, xmm1);

           ++dst_ptr;

           ++wrd_ptr;



      } while (wrd_ptr < wrd_end);

}











4



