by Algorithm Alley
Tim Kientzle


Figure 1: 
(a)
C(i) = a(i)/2 SUM from x=0 to (N-1) s(x) cos(PI i (2x+1)/ 2N)

(b)
C(j,i) = a(j)/2 a(i)/2 SUM from y=0 to (N-1) SUM from x=0 to (N-1)
       s(y,x) cos(PI i (2x+1)/2N) cos(PI j (2y+1)/2N)

(c)
C(j,i) = a(j)/2 SUM from y=0 to (N-1) cos(PI j (2y+1)/2N)
       [ a(i)/2 SUM from x=0 to (N-1)  s(y,x) cos(PI i (2x+1)/2N) ]

Figure 2: 

(a)

s(x) = SUM from i=0 to (N-1) a(i)/2 C(i) cos(PI i (2x+1)/2N)

(b)

s(y,x) = SUM from j=0 to (N-1) SUM from i=0 to (N-1)
       a(j)/2 a(i)/2 C(j,i) cos(PI i (2x+1)/2N) cos(PI j (2y+1)/2N)

(c)

s(y,x) = SUM from j=0 to (N-1) a(j)/2  cos(PI j (2y+1)/2N)
       [ SUM from i=0 to (N-1) a(i)/2 C(j,i) cos(PI i (2x+1)/2N) ]


Figure 6: 

(a)

A --+-----------+-- C    C =  A * k * cos(3pi/8) + B * k * sin(3pi/8)
    |  kR3      |
B --+-----------+-- D    D = -A * k * sin(3pi/8) + B * k * cos(3pi/8)

(b) 

temp = (A+B) * k*cos(3pi/8)
C = temp  + B * (k*sin(3pi/8) - k*cos(3pi/8))
D = temp  + A * (-k*sin(3pi/8) - k*cos(3pi/8))

(c) 

static const int r2c3=554;  /* sqrt(2)*cos(3pi/8)*1024 */
static const int r2s3=1337; /* sqrt(2)*cos(3pi/8)*1024 */

x3=r2c3*(x1+x0);  x0=(-r2c3+r2s3)*x0+x3;  x1=(-r2c3-r2s3)*x1+x3;


Example 1: 

dct1d4PtTest(int *dctBlock) {
  static const int r2c3=554, r2s3=1337;
  int x0=dctBlock[0], x1=dctBlock[1], x2=dctBlock[2], x3=dctBlock[3];
  int x4=x0+x3; x0-=x3; x3=x1+x2; x1-=x2; /* Stage 1 */

  x2=x4+x3;  x4-=x3;  /* Stage 2 */
  x3=r2c3*(x1+x0)+512;  x0=(-r2c3+r2s3)*x0+x3;  x1=(-r2c3-r2s3)*x1+x3;

  dctBlock[0] = x2;  dctBlock[2] = x4;  /* Round and output */
  dctBlock[1] = x0>>10;  dctBlock[3] = x1>>10;
}


Example 2: 

C =  A  + B * ( sin(3pi/8) / cos(3pi/8) )
D = -A * (sin(3pi/8)/cos(3pi/8)) + B

C *= cos(3pi/8)
D *= cos(3pi/8)


Table 1: 

Implementation         Timing

4-Element DCT           12.5 microseconds
Example 1                0.18 microseconds
8-Element DCT           46.9 microseconds
Listing One              0.625 microseconds
8(8-Element DCT       4375.0 microseconds (Direct 2-D calculation)
8(8-Element DCT        640.0 microseconds (Using 16 1-D calculations)
Listing Two              7.6 microseconds


Listing One
static void
dct1dTest(int *dctBlock) {
  static const int c1=1004 /*cos(pi/16)<<10*/, s1=200 /*sin(pi/16)<<10*/;
  static const int c3=851 /*cos(3pi/16)<<10*/, s3=569 /*sin(3pi/16)<<10*/;
  static const int r2c6=554 /*sqrt(2)*cos(6pi/16)<<10*/, r2s6=1337;
  static const int r2=181; /* sqrt(2)<<7 */
  int x0=dctBlock[0], x1=dctBlock[1], x2=dctBlock[2], x3=dctBlock[3],
    x4=dctBlock[4], x5=dctBlock[5], x6=dctBlock[6], x7=dctBlock[7];
  int x8;

  /* Stage 1 */
  x8=x7+x0; x0-=x7;  x7=x1+x6; x1-=x6;
  x6=x2+x5; x2-=x5;  x5=x3+x4; x3-=x4;

  /* Stage 2 */
  x4=x8+x5; x8-=x5;  x5=x7+x6; x7-=x6;
  x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6;
  x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6;

  /* Stage 3 */
  x6=x4+x5; x4-=x5;
  x5=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x5; x8=(r2s6-r2c6)*x8+x5;
  x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1;

  /* Stage 4, round, and output */
  dctBlock[0]=x6;  dctBlock[4]=x4;
  dctBlock[2]=(x8+512)>>10; dctBlock[6] = (x7+512)>>10;
  dctBlock[7]=(x2-x5+512)>>10; dctBlock[1]=(x2+x5+512)>>10;
  dctBlock[3]=(x3*r2+65536)>>17; dctBlock[5]=(x0*r2+65536)>>17;
}
static void
dct2dTest(int (*dctBlock)[8]) {
  static const int c1=1004 /*cos(pi/16)<<10*/, s1=200 /*sin(pi/16)<<10*/;
  static const int c3=851 /*cos(3pi/16)<<10*/, s3=569 /*sin(3pi/16)<<10*/;
  static const int r2c6=554 /*sqrt(2)*cos(6pi/16)<<10*/, r2s6=1337;
  static const int r2=181; /* sqrt(2)<<7 */
  int row,col;

  for(row=0;row<8;row++) {
    int x0=dctBlock[row][0], x1=dctBlock[row][1], x2=dctBlock[row][2],
      x3=dctBlock[row][3], x4=dctBlock[row][4], x5=dctBlock[row][5],
      x6=dctBlock[row][6], x7=dctBlock[row][7], x8;

    /* Stage 1 */
    x8=x7+x0; x0-=x7; x7=x1+x6; x1-=x6; x6=x2+x5; x2-=x5; x5=x3+x4; x3-=x4;

    /* Stage 2 */
    x4=x8+x5; x8-=x5; x5=x7+x6; x7-=x6;
    x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6;
    x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6;


    /* Stage 3 */
    x6=x4+x5; x4-=x5; x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1;
    x1=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x1; x8=(r2s6-r2c6)*x8+x1;

    /* Stage 4 and output */
    dctBlock[row][0]=x6;  dctBlock[row][4]=x4;
    dctBlock[row][2]=x8>>10; dctBlock[row][6] = x7>>10;
    dctBlock[row][7]=(x2-x5)>>10; dctBlock[row][1]=(x2+x5)>>10;
    dctBlock[row][3]=(x3*r2)>>17; dctBlock[row][5]=(x0*r2)>>17;
  }
   
Listing Two
  for(col=0;col<8;col++) {
    int x0=dctBlock[0][col], x1=dctBlock[1][col], x2=dctBlock[2][col],
      x3=dctBlock[3][col], x4=dctBlock[4][col], x5=dctBlock[5][col],
      x6=dctBlock[6][col], x7=dctBlock[7][col], x8;

    /* Stage 1 */
    x8=x7+x0; x0-=x7; x7=x1+x6; x1-=x6; x6=x2+x5; x2-=x5; x5=x3+x4; x3-=x4;

    /* Stage 2 */
    x4=x8+x5; x8-=x5; x5=x7+x6; x7-=x6;
    x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6;
    x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6;

    /* Stage 3 */
    x6=x4+x5; x4-=x5; x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1;
    x1=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x1; x8=(r2s6-r2c6)*x8+x1;

    /* Stage 4 and output */
    dctBlock[0][col]=(x6+16)>>5;  dctBlock[4][col]=(x4+16)>>5;
    dctBlock[2][col]=(x8+16384)>>15; dctBlock[6][col] = (x7+16384)>>15;
    dctBlock[7][col]=(x2-x5+16384)>>15; dctBlock[1][col]=(x2+x5+16384)>>15;
    dctBlock[3][col]=((x3>>8)*r2+8192)>>14;
    dctBlock[5][col]=((x0>>8)*r2+8192)>>14;
  }
}






1


