Algorithm Alley
by Paul F. Hultquist and William R. Mahoney

Listing One
/* ===========================================================================
Demonstration of reservoir sampling.
Author:  Bill Mahoney
=========================================================================== */

#include <iostream.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#define  BIG_FILE_SIZE 967  /* Any old number will do */
void reservoir_sample( int *big_file, int *sample, int N );

int main()
{
    int *sample, N, i;
    int big_file[ BIG_FILE_SIZE ];

    srand( (unsigned int) time( (time_t *) NULL ) );
    for( i = 0; i < BIG_FILE_SIZE; i++ )
        /* We could put in random numbers, or put in numbers that are the */ 
        /* same as the record number. This'll make it simpler for us to   */
        /* see which records are in fact selected.                        */
#if 0
        big_file[ i ] = rand() % 1000;
#else
        big_file[ i ] = i;
#endif
    /* For simplicity, assume the user enters a valid number. */
    cout << "How many elements would you like to retrieve? ";
    cin >> N;
    cout << "Selecting " << N << " elements...\n";
    sample = new int[ N ];

    reservoir_sample( big_file, sample, N );

    for( i = 0; i < N; i++ )
        cout << sample[ i ] << ( ( ( i + 1 ) % 5 == 0 ) ? "\n" : " " );
    cout << "Done!\n";

    delete [] sample;
    return( 0 );
}
/* ===========================================================================
reservoir_sample
Here we do the actual guts. Select the first N records from the front of the 
"file", then selectively knock out candidates and replace them with new ones.
Obviously, since we have just an array of int's there is no need for the 
temporary reservour array at all. We could just keep the index into the array 
"file" instead of an index into "res", the reservoir. Likewise, we could omit
"active" and just use "sample" to hold indices into the array. In an actual 
application, though, you would need to allocate the reservoir and fill it from
the disk file; thus the use of the dynamic array "res" - to illustrate this. 
Same thing with "active". An alternative in a real world application would be 
to have "res" maintain the file position (i.e. ftell() it) of record on disk.
=========================================================================== */

void reservoir_sample( int *file, int *sample, int N )
{
    int end_res, file_record, *res, res_size, *active, i;
    /* Here we create a reservoir with sufficient space.  */
    /* See the formula in the article for an explanation. */
    res_size = (int) ( 1.25 * (double) N * 
               ( 1.0 + log( (double) BIG_FILE_SIZE / (double) N ) ) );
    cout << "Creating a temporary 
                      storage area of " << res_size << " elements.\n";
    res = new int[ res_size ];
    active = new int[ N ];
    /* First off, copy the first N records into the reservoir. */
    for( i = 0; i < N; i++ )
    {
        res[ i ] = file[ i ];
        active[ i ] = i;
    }
    /* end_res is the # of elements in the reservoir, */
    /* file_record is the next record in the "file".  */
    end_res = file_record = N;

    while ( file_record < BIG_FILE_SIZE )
    {
        /* Position is 0..file_record-1 */
        int position = rand() % file_record;
        if ( position < N )
        {
            /* Copy the record into the reservoir */
            res[ end_res ] = file[ file_record ];
            /* Save the index to this record.     */
            active[ position ] = end_res;
            end_res++;
            if ( end_res >= res_size )
               cerr << "Out of space in the reservoir!\n";
               /* Do some kind of error recovery here. */
        }
        file_record++;
    }
    /* Give the caller the desired records. */
    for( i = 0; i < N; i++ )
        sample[ i ] = res[ active[ i ] ];
    delete[] res;
    delete [] active;
}








2

