Algorithm Alley Column
by  Lynn Monson


Example 1: 

< name of  primary class (e.g. "Urgent") >
< name of  default class (e.g. "Not_Urgent") >
<class>
<open ended text>
<end-of-class-marker>
<class>
<open ended text>
<end-of-class-marker>

Example 2: 

If body_ssl is an element.
  CLASS: __service_provider__ (5/0)
else If body_ssl is not an element.
  If body_with is NOT an element.
    If body_ssl/winsock2 is an element.
      CLASS: __service_provider__ (3/0)
    else If body_ssl/winsock2 is not an element.
      If body_available is an element.
        CLASS: __service_provider__ (2/0)
      else If body_available is not an element.
        CLASS: __something_else__ (32/4)
  else If body_with is an element.
    CLASS: __something_else__ (50/0)


Listing One
/* Calculate maximum gain test (C4.5) from list of examples, 
 * TotalWordMap, and two class names. */
static TreeNode  MaxGainTest( Vector vExamples, TotalWordMap twm,
                  String sClassA, String sClassB )
{
  int         iTotalMain = 0;
  int         iTotalNotMain = 0;
  boolean     bPosTest = true;

  Enumeration e = vExamples.elements();
  while ( e.hasMoreElements() ) {
    BagOWords   bow = (BagOWords)e.nextElement();
    if ( bow.sClass.equalsIgnoreCase( sClassA ) )
      iTotalMain = iTotalMain +1;
    else
      iTotalNotMain = iTotalNotMain +1;
  }
  if ( iTotalMain<iSmallestNodeSize |
       iTotalNotMain<iSmallestNodeSize |
       (iTotalMain + iTotalNotMain < iMinExamples ) )
    {
      int iTotal = iTotalMain + iTotalNotMain;
      if ( iTotalMain < iTotalNotMain )
    return new TreeNode( sClassB, iTotal, iTotalMain );
      else
    return new TreeNode( sClassA, iTotal, iTotalNotMain );
    }
  double dbBestGain = -1.0;
  double dbBaseEntropy = Entropy((double)iTotalMain/(iTotalMain+iTotalNotMain),
           (double)iTotalNotMain/(iTotalMain+iTotalNotMain) );
  e = twm.enumWords();
  String sBestWord = "";
  int     pBest=0, nBest=0;
  while ( e.hasMoreElements() )  {
    String  sWord = (String)e.nextElement();
    int     p=0, n=0;
    // Number of true positives
    p = twm.getClassCnt( sWord, sClassA );
    // Number of false positives
    n = twm.getNotClassCnt( sWord, sClassA );
    double dbEntropy = CondEntropy( p, n, iTotalMain-p, iTotalNotMain-n );
    double dbGain = dbBaseEntropy - dbEntropy;
    if ( dbGain > dbBestGain ) {
      dbBestGain = dbGain;
      sBestWord = sWord;
      pBest = p;
      nBest = n;
      bPosTest = true;
      if ( p<n ) {
    pBest = iTotalMain-p;
    nBest = iTotalNotMain-n;
    bPosTest = false;
      }
    }
  }
  int iTotal = iTotalMain + iTotalNotMain;
  boolean bIsLeaf = false;
  if ( dbBestGain <= 0 ) bIsLeaf = true;
  if (pBest==0 && nBest==iTotal ) bIsLeaf = true;
  if ( nBest==0 && pBest==iTotal ) bIsLeaf = true;
  if ( bIsLeaf )  {
    if ( iTotalMain < iTotalNotMain )
      return new TreeNode( sClassB, iTotal, iTotalMain );
    else
      return new TreeNode( sClassA, iTotal, iTotalNotMain );
  }
  return new TreeNode( sClassA, sBestWord, bPosTest );
}
// ID3/(subset C4.5) algorithm
TreeNode   ID3( Vector vExamples, TotalWordMap twm,
        String sMainClass, String sNotMainClass )
{
  Vector       vLeft = new Vector();
  TotalWordMap twmLeft = new TotalWordMap();
  String       sRuleLeft, sRuleRight;
  TreeNode tn = MaxGainTest( vExamples, twm, sMainClass, sNotMainClass );

  if ( tn.bIsLeaf ) return tn;
  // Build the left branch data structures
  Enumeration e = vExamples.elements();
  while ( e.hasMoreElements() ) {
    BagOWords  b = (BagOWords)e.nextElement();
    if ( b.hasWord( tn.sTest ) == tn.bPosTest ) {
      twmLeft.addBOW( b );
      vLeft.addElement( b );
    }
  }
  // Delete the word from all examples
  e = vExamples.elements();
  while( e.hasMoreElements() )  {
    BagOWords b = (BagOWords)e.nextElement();
    if ( b.hasWord( tn.sTest ) )
      b.removeWord( tn.sTest );
  }
  twm.deleteWord( tn.sTest );
  twmLeft.deleteWord( tn.sTest );
  // Remove examples from the right branch data
  e = vLeft.elements();
  while( e.hasMoreElements() ) {
    BagOWords b = (BagOWords)e.nextElement();
    vExamples.removeElement( b );
    twm.fixupRemove( b );
  }
  // Recursively build the left and right branches.
  tn.Left = ID3( vLeft, twmLeft, sMainClass, sNotMainClass );
  tn.Right = ID3( vExamples, twm, sMainClass, sNotMainClass );
  return tn;
}
// Prune a decision tree
TreeNode PruneTree( TreeNode tn, String sClassA, String sClassB )
{
  if ( tn.bIsLeaf ) return tn;
  // Prune lower levels of tree first
  if ( !tn.Left.bIsLeaf )
    tn.Left = PruneTree( tn.Left, sClassA, sClassB );
  if ( !tn.Right.bIsLeaf )
    tn.Right = PruneTree( tn.Right, sClassA, sClassB );
  int iActualA = tn.getActualCnt( sClassA );
  int iActualB = tn.getActualCnt( sClassB );
  int iTotalOverall = iActualA + iActualB;
  double dbNumErrLeafA = tn.calcPredictedErrs( iTotalOverall, iActualB );
  double dbNumErrLeafB = tn.calcPredictedErrs( iTotalOverall, iActualA );
  double dbTreeRate = tn.getPredictedErrs();
  // Replace with a leaf of type "A"?
  if ( dbNumErrLeafA <= dbNumErrLeafB && dbNumErrLeafA <= dbTreeRate )
    return new TreeNode( sClassA, iTotalOverall, iActualB );
  // Replace with a leaf of type "B"?
  if ( dbNumErrLeafB <= dbTreeRate )
    return new TreeNode( sClassB, iTotalOverall, iActualA );
  return tn;
}



