Algorithm Alley: Yet Another Record Selection Algorithm
by William R. Mahoney

Listing One
/* =================================================================
random record selection demo
Author: Bill Mahoney
================================================================== */
#include <stdlib.h> // for rand/random
#include <iostream>
const int N_sub_t = 967; // Some "known length" for our "file"
struct file_s {
              int data; // the "data" in the "file"
              char more[ 3 ]; // "More" data in the "file"
              };
void sample( file_s *file, file_s *selected, int N );
int main()
{
     int N, i;
     file_s *file = new file_s[ N_sub_t ]; // pretend file
     file_s *selected; // The ones we pick at random
     // Fill up the file with data; of course, for a real application one 
     // would be accessing records from a file in the "select" function
     // below; the file would already have data.
     for( i = 0; i < N_sub_t; i++ )
        // Just number them sequentially so that
        // it is easy to see which are selected.
        file[ i ].data = i;
    // Ask how many the user wants; we'll assume for now that 0 <= N < N_sub_t
    cout << "How many records do you need? ";
    cin >> N;
    cout << "Selecting " << N << " elements...\n";
    selected = new file_s[ N ];
    // Select the sample of N records from N_sub_t
    sample( file, selected, N );
    for( i = 0; i < N; i++ )
        cout << selected[ i ].data
             << ( ( ( i + 1 ) % 5 == 0 ) ? "\n" : " " );
    cout << "Done!\n";
    delete [] selected;
    delete [] file;
    return( 0 );
}

/* =================================================================
sample
Randomly select N records from the "file" and copy records into "selected".
================================================================= */
void sample( file_s *file, file_s *selected, int N )
{
    int s, k; // See article
    float R; // probability to test
    float limit; // (N-s)/(N_sub_t-k)
    for( k = s = 0; k < N_sub_t; k++ )
    {
       // random(3) returns 0..RAND_MAX; the older C function rand(3) would 
       // also work. Since the algorithm is explained in terms of floating 
       // point, we will do this program to match. Integer math would, 
       // of course, be faster.
       R = ( (float) random() ) / ( (float) RAND_MAX );
       limit = ( (float)( N - s ) ) / ( (float)( N_sub_t - k ) );
       if ( R < limit )
           // R < (N-s)/(N_sub_t-k) -- include the record.
           selected[ s++ ] = file[ k ];
       if ( s == N )
          break; // get out early
    }
}

   



1

