Improving Scalability of Multithreaded Dynamic Memory Allocation
by Greg Nakhimovsky

Listing One
% more test1.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

main()
{
  char *str;
  int i;
  char buf[128];

  for (i=0; i<1000000; i++)
    str = (char *)malloc(8);
  sprintf(buf, "/usr/proc/bin/pmap -x %ld "
    "| /bin/egrep 'heap|Kbytes'", getpid());
  system(buf);
}
% cc test1.c
% a.out

Address   Kbytes  Resident  Shared  Private  Permissions       Mapped File
00022000  15742    15752       -     15752    read/write/exec    [ heap ]


Listing Two
/* Originally written by David Palmer, PTC.
Modified by Greg Nakhimovsky, Sun Microsystems.
This program helps measure scalability of an MT-hot malloc.
The single-threaded (ST) version calls function pound() ncpus times in
sequence.
The multi-threaded (MT) versions call pound() from the main thread,
plus create (ncpus-1) additional bound threads which run the same
routine pound() in parallel. In the MT versions, function pound()
allocates all of its dynamic memory separately in each thread.
Therefore, the memory consumption of this program is approximately
ncpus times greater than that of the single-threaded version.
Build the ST version:
 cc -xtarget=ultra -xO4 -DST -o single_threaded mt_hot_malloc_test.c
Build the multithreaded version using Solaris malloc(3C):
 cc -xtarget=ultra -xO4 -DMT -mt -o mt_sys mt_hot_malloc_test.c \
   -lpthread
Build the MT version using mt_hot malloc:
 cc -xtarget=ultra -xO4 -DMT -mt -o mt_hot mt_hot_malloc_test.c \
   mt_hot_malloc.o -lpthread

Run:
single_threaded <iterations>
mt_sys <iterations>
mt_hot  <iterations>

ncpus is the number of available CPUs (determined dynamically).
Function pound() executes a loop each iteration of which allocates
many blocks of memory, fills them with data, then frees some of that
memory and reallocates it for blocks with different sizes.
All this is repeated "iterations" times.
*/

#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>
#define MAX_BLOCKS 80
#define SIZE_BLOCK 80
#define BIG_BLOCK 100

static int iterations;
static void *pound();

main(int argc, char *argv[])
{
  pthread_attr_t attr;
  pthread_t pthr_ids[64];
  int i;
  int num_main;
  int ncpus;
  hrtime_t start, stop;

  if (argc > 1)
    iterations = atoi(argv[1]);
  else
    iterations = 5000;

  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
  printf("Detected %d CPUs\n", ncpus);
  start = gethrtime();

#ifdef MT
  if(pthread_attr_init(&attr))
  {
    printf("Error: pthread_attr_init() failed\n");
    exit(1);
  }
  /* make it a bound thread */
  if(pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM))
  {
    printf("Error: pthread_attr_setscope() failed\n");
    exit(1);
  }
  for( i = 0; i<ncpus-1; i++ )
  {
    if(pthread_create(&pthr_ids[i], &attr, (void *(*)(void *))pound, NULL))
    {
      printf("Error: Thread #%d cannot be created.\n", i);
      exit(1);
    }
    printf("Additional thread %d has been launched.\n", pthr_ids[i]);
  }
  num_main = 1;
#else
  num_main = ncpus;
#endif

  for( i = 0; i<num_main; i++ )
  {
    printf("Calling pound() in the main thread: %d\n", i+1);
    pound();
  }

#ifdef MT
  /* wait until all threads are finished */
  for( i = 0; i<ncpus-1; i++ )
    if(pthread_join(pthr_ids[i], NULL))
    {
      printf("Error: pthread_join() failed\n");
      exit(1);
    }
#endif
  stop = gethrtime();
  printf("Elapsed time: %10.2f sec\n", 1.e-9*(stop - start) );
}
void *pound()
{
  char *blocks[MAX_BLOCKS];
  int i, j, count;
  char lastc = '!' + MAX_BLOCKS - 1;
  for( count=0; count < iterations; count++ )
  {
    for( i = 0; i < MAX_BLOCKS; i++ )          /* Allocate all blocks */
    {
      blocks[i] = (char *)malloc(i + SIZE_BLOCK);
      for( j = 0; j < i + SIZE_BLOCK; j++ )   /* Fill with data */
     blocks[i][j] = '!' + i;
    }
    for( i = 1; i < MAX_BLOCKS; i += 2)    /* Free odd numbered blocks */
      free(blocks[i]);
    for( i = MAX_BLOCKS-1; i > 0; i -= 2)  /* Allocate odd numbered blocks */
    {
      blocks[i] = (char *)malloc(SIZE_BLOCK - i);
      for( j = 0; j < SIZE_BLOCK - i; j++ )
        blocks[i][j] = lastc - i;
    }
    for( i = MAX_BLOCKS-1; i > 0; i -= 2 ) /* Reallocate odd numbered blocks */
    {
      blocks[i] = (char *)realloc(blocks[i], SIZE_BLOCK + i);
      for( j = 0; j < SIZE_BLOCK + i; j++ )    /* Fill with data */
        blocks[i][j] = '!' + i;
    }
    for( i = 1; i <= BIG_BLOCK; i++ )         /* Reallocate one big block */
    {
      blocks[MAX_BLOCKS-1] =
                      (char *)realloc(blocks[MAX_BLOCKS-1],SIZE_BLOCK * i);
      for( j = 0; j < SIZE_BLOCK * i; j++ )
        blocks[MAX_BLOCKS-1][j] = lastc;
    }
    for( i = 0; i < MAX_BLOCKS; i++ )         /* Free all blocks */
    {
      free(blocks[i]);
      blocks[i] = NULL;
    }
    for( i = 0; i < MAX_BLOCKS; i++ )          /* Free NULL poin */
      free(blocks[i]);
  }
}





