{ fpcsrc/packages/pasjpeg/src/jfdctfst.pas }

Unit JFDctFst;

{ This file contains a fast, not so accurate integer implementation of the
  forward DCT (Discrete Cosine Transform).

  A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  on each column.  Direct algorithms are also available, but they are
  much more complex and seem not to be any faster when reduced to code.

  This implementation is based on Arai, Agui, and Nakajima's algorithm for
  scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  Japanese, but the algorithm is described in the Pennebaker & Mitchell
  JPEG textbook (see REFERENCES section in file README).  The following code
  is based directly on figure 4-8 in P&M.
  While an 8-point DCT cannot be done in less than 11 multiplies, it is
  possible to arrange the computation so that many of the multiplies are
  simple scalings of the final outputs.  These multiplies can then be
  folded into the multiplications or divisions by the JPEG quantization
  table entries.  The AA&N method leaves only 5 multiplies and 29 adds
  to be done in the DCT itself.
  The primary disadvantage of this method is that with fixed-point math,
  accuracy is lost due to imprecise representation of the scaled
  quantization values.  The smaller the quantization table entry, the less
  precise the scaled value, so this implementation does worse with high-
  quality-setting files than with low-quality ones. }

{ Original: jfdctfst.c ; Copyright (C) 1994-1996, Thomas G. Lane. }


interface

{$I jconfig.inc}

uses
  jmorecfg,
  jinclude,
  jpeglib,
  jdct;         { Private declarations for DCT subsystem }


{ Perform the forward DCT on one block of samples. }

{GLOBAL}
procedure jpeg_fdct_ifast (var data : array of DCTELEM);

implementation

{ This module is specialized to the case DCTSIZE = 8. }

{$ifndef DCTSIZE_IS_8}
  Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
{$endif}


{ Scaling decisions are generally the same as in the LL&M algorithm;
  see jfdctint.c for more details.  However, we choose to descale
  (right shift) multiplication products as soon as they are formed,
  rather than carrying additional fractional bits into subsequent additions.
  This compromises accuracy slightly, but it lets us save a few shifts.
  More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  everywhere except in the multiplications proper; this saves a good deal
  of work on 16-bit-int machines.

  Again to save a few shifts, the intermediate results between pass 1 and
  pass 2 are not upscaled, but are represented only to integral precision.

  A final compromise is to represent the multiplicative constants to only
  8 fractional bits, rather than 13.  This saves some shifting work on some
  machines, and may also reduce the cost of multiplication (since there
  are fewer one-bits in the constants). }

const
  { Number of fractional bits carried by the fixed-point multipliers. }
  CONST_BITS = 8;

  { Fixed-point representation of 1.0, i.e. 1 shl CONST_BITS. }
  CONST_SCALE = (INT32(1) shl CONST_BITS);

  { Rotation multipliers pre-scaled by CONST_SCALE and rounded to the
    nearest integer.  The value in braces after each declaration is the
    resulting integer for CONST_BITS = 8. }
  FIX_0_382683433 = INT32(Round(CONST_SCALE * 0.382683433)); {98}
  FIX_0_541196100 = INT32(Round(CONST_SCALE * 0.541196100)); {139}
  FIX_0_707106781 = INT32(Round(CONST_SCALE * 0.707106781)); {181}
  FIX_1_306562965 = INT32(Round(CONST_SCALE * 1.306562965)); {334}

{ Remove n fractional bits from the fixed-point value x.

  x : value scaled by 2**n.
  n : number of bits to shift out; the only caller in this unit passes
      CONST_BITS.  The shift counts (n-1) and (32-n) used below are
      presumably only meaningful for 0 < n < 32 - not checked here.

  Two compile-time switches select the exact behaviour:
    USE_ACCURATE_ROUNDING   - when defined, half of the discarded range
                              (1 shl (n-1)) is added before shifting so the
                              result is rounded; when not defined, x is
                              shifted as is, which is faster but leaves the
                              result incorrectly rounded about half the time.
    RIGHT_SHIFT_IS_UNSIGNED - when defined, a negative operand has its top
                              n bits set again after the shift, so the
                              result matches a sign-propagating shift.
                              When not defined, "shr" itself is assumed to
                              round towards minus infinity. }

function DESCALE(x : INT32; n : int) : INT32;
var
  biased : INT32;
begin
{$ifdef USE_ACCURATE_ROUNDING}
  { Add the rounding fudge factor before the shift. }
  biased := x + (INT32(1) shl (n-1));
{$else}
  { Speed over accuracy: shift the raw value. }
  biased := x;
{$endif}

{$ifdef RIGHT_SHIFT_IS_UNSIGNED}
  if biased >= 0 then
    DESCALE := biased shr n
  else
    { Refill the n vacated high bits with ones to keep the sign. }
    DESCALE := (biased shr n) or ((not INT32(0)) shl (32-n));
{$else}
  DESCALE := biased shr n;
{$endif}
end;

{ Fixed-point multiply: form the product of the DCTELEM value X and the
  INT32 constant Y (one of the FIX_* multipliers, scaled by CONST_SCALE),
  then immediately drop the CONST_BITS fractional bits with DESCALE so the
  result is a DCTELEM again. }

function MULTIPLY(X : DCTELEM; Y: INT32): DCTELEM;
var
  product : INT32;
begin
  product := X * Y;
  MULTIPLY := DESCALE(product, CONST_BITS);
end;


{ Forward DCT, fast integer (AA&N) variant, computed in place on one block.

  data is accessed as DCTSIZE rows of DCTSIZE elements stored one row after
  another; the first DCTSIZE2 elements are read and overwritten.  The length
  of the open array is not checked here.

  Pass 1 runs the 1-D transform over every row, pass 2 runs the same
  transform over every column.  Each product is descaled by MULTIPLY as soon
  as it is formed, and nothing is upscaled between the two passes. }

{GLOBAL}
procedure jpeg_fdct_ifast (var data : array of DCTELEM);
type
  PBlock = ^TBlock;
  TBlock = array [0..DCTSIZE2-1] of DCTELEM;
var
  { Sums and differences of the mirrored input pairs (0,7) (1,6) (2,5) (3,4). }
  sum07, sum16, sum25, sum34 : DCTELEM;
  dif07, dif16, dif25, dif34 : DCTELEM;
  { Second-stage terms of the even and of the odd half. }
  evn0, evn1, evn2, evn3 : DCTELEM;
  od0, od1, od2 : DCTELEM;
  { Descaled products and the values derived from them. }
  rotEven, rotShared, rotLow, rotHigh, rotMid : DCTELEM;
  outerSum, outerDif : DCTELEM;
  blk : PBlock;
  lineNo : int;
begin
  { ---- Pass 1: rows.  blk points at the first element of the row. ---- }

  blk := PBlock(@data);
  for lineNo := 1 to DCTSIZE do
  begin
    { All eight inputs are consumed here, before any output is stored. }
    sum07 := blk^[0] + blk^[7];
    sum16 := blk^[1] + blk^[6];
    sum25 := blk^[2] + blk^[5];
    sum34 := blk^[3] + blk^[4];
    dif07 := blk^[0] - blk^[7];
    dif16 := blk^[1] - blk^[6];
    dif25 := blk^[2] - blk^[5];
    dif34 := blk^[3] - blk^[4];

    { Even half: outputs 0, 4, 2, 6. }

    evn0 := sum07 + sum34;                            { phase 2 }
    evn1 := sum16 + sum25;
    evn2 := sum16 - sum25;
    evn3 := sum07 - sum34;

    blk^[0] := evn0 + evn1;                           { phase 3 }
    blk^[4] := evn0 - evn1;

    rotEven := MULTIPLY(evn2 + evn3, FIX_0_707106781);   { c4 }
    blk^[2] := evn3 + rotEven;                        { phase 5 }
    blk^[6] := evn3 - rotEven;

    { Odd half: outputs 5, 3, 1, 7. }

    od0 := dif34 + dif25;                             { phase 2 }
    od1 := dif25 + dif16;
    od2 := dif16 + dif07;

    { Rotator, rearranged from fig 4-8 so that no extra negations are
      needed; rotShared is the term common to both rotated outputs. }
    rotShared := MULTIPLY(od0 - od2, FIX_0_382683433);   { c6 }
    rotLow := MULTIPLY(od0, FIX_0_541196100) + rotShared;  { c2-c6 }
    rotHigh := MULTIPLY(od2, FIX_1_306562965) + rotShared; { c2+c6 }
    rotMid := MULTIPLY(od1, FIX_0_707106781);            { c4 }

    outerSum := dif07 + rotMid;                       { phase 5 }
    outerDif := dif07 - rotMid;

    blk^[5] := outerDif + rotLow;                     { phase 6 }
    blk^[3] := outerDif - rotLow;
    blk^[1] := outerSum + rotHigh;
    blk^[7] := outerSum - rotHigh;

    Inc(DCTELEMPTR(blk), DCTSIZE);  { step to the first element of the next row }
  end;

  { ---- Pass 2: columns.  blk points at the top element of the column,
         successive column elements lie DCTSIZE entries apart. ---- }

  blk := PBlock(@data);
  for lineNo := 1 to DCTSIZE do
  begin
    { All eight inputs are consumed here, before any output is stored. }
    sum07 := blk^[DCTSIZE*0] + blk^[DCTSIZE*7];
    sum16 := blk^[DCTSIZE*1] + blk^[DCTSIZE*6];
    sum25 := blk^[DCTSIZE*2] + blk^[DCTSIZE*5];
    sum34 := blk^[DCTSIZE*3] + blk^[DCTSIZE*4];
    dif07 := blk^[DCTSIZE*0] - blk^[DCTSIZE*7];
    dif16 := blk^[DCTSIZE*1] - blk^[DCTSIZE*6];
    dif25 := blk^[DCTSIZE*2] - blk^[DCTSIZE*5];
    dif34 := blk^[DCTSIZE*3] - blk^[DCTSIZE*4];

    { Even half: outputs 0, 4, 2, 6. }

    evn0 := sum07 + sum34;                            { phase 2 }
    evn1 := sum16 + sum25;
    evn2 := sum16 - sum25;
    evn3 := sum07 - sum34;

    blk^[DCTSIZE*0] := evn0 + evn1;                   { phase 3 }
    blk^[DCTSIZE*4] := evn0 - evn1;

    rotEven := MULTIPLY(evn2 + evn3, FIX_0_707106781);   { c4 }
    blk^[DCTSIZE*2] := evn3 + rotEven;                { phase 5 }
    blk^[DCTSIZE*6] := evn3 - rotEven;

    { Odd half: outputs 5, 3, 1, 7. }

    od0 := dif34 + dif25;                             { phase 2 }
    od1 := dif25 + dif16;
    od2 := dif16 + dif07;

    { Same rearranged rotator as in pass 1. }
    rotShared := MULTIPLY(od0 - od2, FIX_0_382683433);   { c6 }
    rotLow := MULTIPLY(od0, FIX_0_541196100) + rotShared;  { c2-c6 }
    rotHigh := MULTIPLY(od2, FIX_1_306562965) + rotShared; { c2+c6 }
    rotMid := MULTIPLY(od1, FIX_0_707106781);            { c4 }

    outerSum := dif07 + rotMid;                       { phase 5 }
    outerDif := dif07 - rotMid;

    blk^[DCTSIZE*5] := outerDif + rotLow;             { phase 6 }
    blk^[DCTSIZE*3] := outerDif - rotLow;
    blk^[DCTSIZE*1] := outerSum + rotHigh;
    blk^[DCTSIZE*7] := outerSum - rotHigh;

    Inc(DCTELEMPTR(blk));   { step to the top element of the next column }
  end;
end;

end.