3 files changed, 217 insertions, 183 deletions
diff --git a/src/pkg/image/jpeg/idct.go b/src/pkg/image/jpeg/idct.go
index e5a2f40f5..b387dfdff 100644
--- a/src/pkg/image/jpeg/idct.go
+++ b/src/pkg/image/jpeg/idct.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+package jpeg
+
 // This is a Go translation of idct.c from
 //
 // http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
@@ -35,8 +37,6 @@
  *
  */
 
-package jpeg
-
 const (
 	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
 	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
@@ -55,41 +55,45 @@ const (
 	r2 = 181 // 256/sqrt(2)
 )
 
-// 2-D Inverse Discrete Cosine Transformation, followed by a +128 level shift.
+// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
+// +128 level shift and a clip to [0, 255], writing the results to dst.
+// stride is the number of elements between successive rows of dst.
 //
-// The input coefficients should already have been multiplied by the appropriate quantization table.
-// We use fixed-point computation, with the number of bits for the fractional component varying over the
-// intermediate stages. The final values are expected to range within [0, 255], after a +128 level shift.
+// The input coefficients should already have been multiplied by the
+// appropriate quantization table. We use fixed-point computation, with the
+// number of bits for the fractional component varying over the intermediate
+// stages.
 //
-// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the discrete W transform and
-// for the discrete Fourier transform", IEEE Trans. on ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(b *block) {
+// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
+func idct(dst []byte, stride int, src *block) {
 	// Horizontal 1-D IDCT.
 	for y := 0; y < 8; y++ {
 		// If all the AC components are zero, then the IDCT is trivial.
-		if b[y*8+1] == 0 && b[y*8+2] == 0 && b[y*8+3] == 0 &&
-			b[y*8+4] == 0 && b[y*8+5] == 0 && b[y*8+6] == 0 && b[y*8+7] == 0 {
-			dc := b[y*8+0] << 3
-			b[y*8+0] = dc
-			b[y*8+1] = dc
-			b[y*8+2] = dc
-			b[y*8+3] = dc
-			b[y*8+4] = dc
-			b[y*8+5] = dc
-			b[y*8+6] = dc
-			b[y*8+7] = dc
+		if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
+			src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
+			dc := src[y*8+0] << 3
+			src[y*8+0] = dc
+			src[y*8+1] = dc
+			src[y*8+2] = dc
+			src[y*8+3] = dc
+			src[y*8+4] = dc
+			src[y*8+5] = dc
+			src[y*8+6] = dc
+			src[y*8+7] = dc
 			continue
 		}
 
 		// Prescale.
-		x0 := (b[y*8+0] << 11) + 128
-		x1 := b[y*8+4] << 11
-		x2 := b[y*8+6]
-		x3 := b[y*8+2]
-		x4 := b[y*8+1]
-		x5 := b[y*8+7]
-		x6 := b[y*8+5]
-		x7 := b[y*8+3]
+		x0 := (src[y*8+0] << 11) + 128
+		x1 := src[y*8+4] << 11
+		x2 := src[y*8+6]
+		x3 := src[y*8+2]
+		x4 := src[y*8+1]
+		x5 := src[y*8+7]
+		x6 := src[y*8+5]
+		x7 := src[y*8+3]
 
 		// Stage 1.
 		x8 := w7 * (x4 + x5)
@@ -119,14 +123,14 @@ func idct(b *block) {
 		x4 = (r2*(x4-x5) + 128) >> 8
 
 		// Stage 4.
-		b[8*y+0] = (x7 + x1) >> 8
-		b[8*y+1] = (x3 + x2) >> 8
-		b[8*y+2] = (x0 + x4) >> 8
-		b[8*y+3] = (x8 + x6) >> 8
-		b[8*y+4] = (x8 - x6) >> 8
-		b[8*y+5] = (x0 - x4) >> 8
-		b[8*y+6] = (x3 - x2) >> 8
-		b[8*y+7] = (x7 - x1) >> 8
+		src[8*y+0] = (x7 + x1) >> 8
+		src[8*y+1] = (x3 + x2) >> 8
+		src[8*y+2] = (x0 + x4) >> 8
+		src[8*y+3] = (x8 + x6) >> 8
+		src[8*y+4] = (x8 - x6) >> 8
+		src[8*y+5] = (x0 - x4) >> 8
+		src[8*y+6] = (x3 - x2) >> 8
+		src[8*y+7] = (x7 - x1) >> 8
 	}
 
 	// Vertical 1-D IDCT.
@@ -136,14 +140,14 @@ func idct(b *block) {
 		// we do not bother to check for the all-zero case.
 
 		// Prescale.
-		y0 := (b[8*0+x] << 8) + 8192
-		y1 := b[8*4+x] << 8
-		y2 := b[8*6+x]
-		y3 := b[8*2+x]
-		y4 := b[8*1+x]
-		y5 := b[8*7+x]
-		y6 := b[8*5+x]
-		y7 := b[8*3+x]
+		y0 := (src[8*0+x] << 8) + 8192
+		y1 := src[8*4+x] << 8
+		y2 := src[8*6+x]
+		y3 := src[8*2+x]
+		y4 := src[8*1+x]
+		y5 := src[8*7+x]
+		y6 := src[8*5+x]
+		y7 := src[8*3+x]
 
 		// Stage 1.
 		y8 := w7*(y4+y5) + 4
@@ -173,18 +177,28 @@ func idct(b *block) {
 		y4 = (r2*(y4-y5) + 128) >> 8
 
 		// Stage 4.
-		b[8*0+x] = (y7 + y1) >> 14
-		b[8*1+x] = (y3 + y2) >> 14
-		b[8*2+x] = (y0 + y4) >> 14
-		b[8*3+x] = (y8 + y6) >> 14
-		b[8*4+x] = (y8 - y6) >> 14
-		b[8*5+x] = (y0 - y4) >> 14
-		b[8*6+x] = (y3 - y2) >> 14
-		b[8*7+x] = (y7 - y1) >> 14
+		src[8*0+x] = (y7 + y1) >> 14
+		src[8*1+x] = (y3 + y2) >> 14
+		src[8*2+x] = (y0 + y4) >> 14
+		src[8*3+x] = (y8 + y6) >> 14
+		src[8*4+x] = (y8 - y6) >> 14
+		src[8*5+x] = (y0 - y4) >> 14
+		src[8*6+x] = (y3 - y2) >> 14
+		src[8*7+x] = (y7 - y1) >> 14
 	}
 
-	// Level shift.
-	for i := range *b {
-		b[i] += 128
+	// Level shift by +128, clip to [0, 255], and write to dst.
+	for y := 0; y < 8; y++ {
+		for x := 0; x < 8; x++ {
+			c := src[y*8+x]
+			if c < -128 {
+				c = 0
+			} else if c > 127 {
+				c = 255
+			} else {
+				c += 128
+			}
+			dst[y*stride+x] = uint8(c)
+		}
 	}
 }
diff --git a/src/pkg/image/jpeg/reader.go b/src/pkg/image/jpeg/reader.go
index 21a6fff96..ef8383a35 100644
--- a/src/pkg/image/jpeg/reader.go
+++ b/src/pkg/image/jpeg/reader.go
@@ -41,16 +41,22 @@ type block [blockSize]int
 const (
 	blockSize = 64 // A DCT block is 8x8.
 
-	dcTableClass = 0
-	acTableClass = 1
-	maxTc        = 1
-	maxTh        = 3
-	maxTq        = 3
-
-	// We only support 4:4:4, 4:2:2 and 4:2:0 downsampling, and assume that the components are Y, Cb, Cr.
-	nComponent = 3
-	maxH       = 2
-	maxV       = 2
+	dcTable = 0
+	acTable = 1
+	maxTc   = 1
+	maxTh   = 3
+	maxTq   = 3
+
+	// A grayscale JPEG image has only a Y component.
+	nGrayComponent = 1
+	// A color JPEG image has Y, Cb and Cr components.
+	nColorComponent = 3
+
+	// We only support 4:4:4, 4:2:2 and 4:2:0 downsampling, and therefore the
+	// number of luma samples per chroma sample is at most 2 in the horizontal
+	// and 2 in the vertical direction.
+	maxH = 2
+	maxV = 2
 )
 
 const (
@@ -90,13 +96,14 @@ type Reader interface {
 type decoder struct {
 	r             Reader
 	width, height int
-	img           *ycbcr.YCbCr
+	img1          *image.Gray
+	img3          *ycbcr.YCbCr
 	ri            int // Restart Interval.
-	comps         [nComponent]component
+	nComp         int
+	comp          [nColorComponent]component
 	huff          [maxTc + 1][maxTh + 1]huffman
 	quant         [maxTq + 1]block
 	b             bits
-	blocks        [nComponent][maxH * maxV]block
 	tmp           [1024]byte
 }
 
@@ -118,10 +125,15 @@ func (d *decoder) ignore(n int) os.Error {
 
 // Specified in section B.2.2.
 func (d *decoder) processSOF(n int) os.Error {
-	if n != 6+3*nComponent {
+	switch n {
+	case 6 + 3*nGrayComponent:
+		d.nComp = nGrayComponent
+	case 6 + 3*nColorComponent:
+		d.nComp = nColorComponent
+	default:
 		return UnsupportedError("SOF has wrong length")
 	}
-	_, err := io.ReadFull(d.r, d.tmp[0:6+3*nComponent])
+	_, err := io.ReadFull(d.r, d.tmp[:n])
 	if err != nil {
 		return err
 	}
@@ -131,26 +143,28 @@ func (d *decoder) processSOF(n int) os.Error {
 	}
 	d.height = int(d.tmp[1])<<8 + int(d.tmp[2])
 	d.width = int(d.tmp[3])<<8 + int(d.tmp[4])
-	if d.tmp[5] != nComponent {
+	if int(d.tmp[5]) != d.nComp {
 		return UnsupportedError("SOF has wrong number of image components")
 	}
-	for i := 0; i < nComponent; i++ {
+	for i := 0; i < d.nComp; i++ {
 		hv := d.tmp[7+3*i]
-		d.comps[i].h = int(hv >> 4)
-		d.comps[i].v = int(hv & 0x0f)
-		d.comps[i].c = d.tmp[6+3*i]
-		d.comps[i].tq = d.tmp[8+3*i]
-		// We only support YCbCr images, and 4:4:4, 4:2:2 or 4:2:0 chroma downsampling ratios. This implies that
-		// the (h, v) values for the Y component are either (1, 1), (2, 1) or (2, 2), and the
-		// (h, v) values for the Cr and Cb components must be (1, 1).
+		d.comp[i].h = int(hv >> 4)
+		d.comp[i].v = int(hv & 0x0f)
+		d.comp[i].c = d.tmp[6+3*i]
+		d.comp[i].tq = d.tmp[8+3*i]
+		if d.nComp == nGrayComponent {
+			continue
+		}
+		// For color images, we only support 4:4:4, 4:2:2 or 4:2:0 chroma
+		// downsampling ratios. This implies that the (h, v) values for the Y
+		// component are either (1, 1), (2, 1) or (2, 2), and the (h, v)
+		// values for the Cr and Cb components must be (1, 1).
 		if i == 0 {
 			if hv != 0x11 && hv != 0x21 && hv != 0x22 {
 				return UnsupportedError("luma downsample ratio")
 			}
-		} else {
-			if hv != 0x11 {
-				return UnsupportedError("chroma downsample ratio")
-			}
+		} else if hv != 0x11 {
+			return UnsupportedError("chroma downsample ratio")
 		}
 	}
 	return nil
@@ -182,110 +196,88 @@ func (d *decoder) processDQT(n int) os.Error {
 	return nil
 }
 
-// Clip x to the range [0, 255] inclusive.
-func clip(x int) uint8 {
-	if x < 0 {
-		return 0
+// makeImg allocates and initializes the destination image.
+func (d *decoder) makeImg(h0, v0, mxx, myy int) {
+	if d.nComp == nGrayComponent {
+		d.img1 = image.NewGray(8*mxx, 8*myy)
+		d.img1.Rect = image.Rect(0, 0, d.width, d.height)
+		return
 	}
-	if x > 255 {
-		return 255
+	var subsampleRatio ycbcr.SubsampleRatio
+	n := h0 * v0
+	switch n {
+	case 1:
+		subsampleRatio = ycbcr.SubsampleRatio444
+	case 2:
+		subsampleRatio = ycbcr.SubsampleRatio422
+	case 4:
+		subsampleRatio = ycbcr.SubsampleRatio420
+	default:
+		panic("unreachable")
 	}
-	return uint8(x)
-}
-
-// Store the MCU to the image.
-func (d *decoder) storeMCU(mx, my int) {
-	h0, v0 := d.comps[0].h, d.comps[0].v
-	// Store the luma blocks.
-	for v := 0; v < v0; v++ {
-		for h := 0; h < h0; h++ {
-			p := 8 * ((v0*my+v)*d.img.YStride + (h0*mx + h))
-			for y := 0; y < 8; y++ {
-				for x := 0; x < 8; x++ {
-					d.img.Y[p] = clip(d.blocks[0][h0*v+h][8*y+x])
-					p++
-				}
-				p += d.img.YStride - 8
-			}
-		}
-	}
-	// Store the chroma blocks.
-	p := 8 * (my*d.img.CStride + mx)
-	for y := 0; y < 8; y++ {
-		for x := 0; x < 8; x++ {
-			d.img.Cb[p] = clip(d.blocks[1][0][8*y+x])
-			d.img.Cr[p] = clip(d.blocks[2][0][8*y+x])
-			p++
-		}
-		p += d.img.CStride - 8
+	b := make([]byte, mxx*myy*(1*8*8*n+2*8*8))
+	d.img3 = &ycbcr.YCbCr{
+		Y:              b[mxx*myy*(0*8*8*n+0*8*8) : mxx*myy*(1*8*8*n+0*8*8)],
+		Cb:             b[mxx*myy*(1*8*8*n+0*8*8) : mxx*myy*(1*8*8*n+1*8*8)],
+		Cr:             b[mxx*myy*(1*8*8*n+1*8*8) : mxx*myy*(1*8*8*n+2*8*8)],
+		SubsampleRatio: subsampleRatio,
+		YStride:        mxx * 8 * h0,
+		CStride:        mxx * 8,
+		Rect:           image.Rect(0, 0, d.width, d.height),
 	}
 }
 
 // Specified in section B.2.3.
 func (d *decoder) processSOS(n int) os.Error {
-	if n != 4+2*nComponent {
+	if d.nComp == 0 {
+		return FormatError("missing SOF marker")
+	}
+	if n != 4+2*d.nComp {
 		return UnsupportedError("SOS has wrong length")
 	}
-	_, err := io.ReadFull(d.r, d.tmp[0:4+2*nComponent])
+	_, err := io.ReadFull(d.r, d.tmp[0:4+2*d.nComp])
 	if err != nil {
 		return err
 	}
-	if d.tmp[0] != nComponent {
+	if int(d.tmp[0]) != d.nComp {
 		return UnsupportedError("SOS has wrong number of image components")
 	}
-	var scanComps [nComponent]struct {
+	var scan [nColorComponent]struct {
 		td uint8 // DC table selector.
 		ta uint8 // AC table selector.
 	}
-	for i := 0; i < nComponent; i++ {
+	for i := 0; i < d.nComp; i++ {
 		cs := d.tmp[1+2*i] // Component selector.
-		if cs != d.comps[i].c {
+		if cs != d.comp[i].c {
 			return UnsupportedError("scan components out of order")
 		}
-		scanComps[i].td = d.tmp[2+2*i] >> 4
-		scanComps[i].ta = d.tmp[2+2*i] & 0x0f
+		scan[i].td = d.tmp[2+2*i] >> 4
+		scan[i].ta = d.tmp[2+2*i] & 0x0f
 	}
 	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
-	h0, v0 := d.comps[0].h, d.comps[0].v // The h and v values from the Y components.
+	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
 	mxx := (d.width + 8*h0 - 1) / (8 * h0)
 	myy := (d.height + 8*v0 - 1) / (8 * v0)
-	if d.img == nil {
-		var subsampleRatio ycbcr.SubsampleRatio
-		n := h0 * v0
-		switch n {
-		case 1:
-			subsampleRatio = ycbcr.SubsampleRatio444
-		case 2:
-			subsampleRatio = ycbcr.SubsampleRatio422
-		case 4:
-			subsampleRatio = ycbcr.SubsampleRatio420
-		default:
-			panic("unreachable")
-		}
-		b := make([]byte, mxx*myy*(1*8*8*n+2*8*8))
-		d.img = &ycbcr.YCbCr{
-			Y:              b[mxx*myy*(0*8*8*n+0*8*8) : mxx*myy*(1*8*8*n+0*8*8)],
-			Cb:             b[mxx*myy*(1*8*8*n+0*8*8) : mxx*myy*(1*8*8*n+1*8*8)],
-			Cr:             b[mxx*myy*(1*8*8*n+1*8*8) : mxx*myy*(1*8*8*n+2*8*8)],
-			SubsampleRatio: subsampleRatio,
-			YStride:        mxx * 8 * h0,
-			CStride:        mxx * 8,
-			Rect:           image.Rect(0, 0, d.width, d.height),
-		}
+	if d.img1 == nil && d.img3 == nil {
+		d.makeImg(h0, v0, mxx, myy)
 	}
 
 	mcu, expectedRST := 0, uint8(rst0Marker)
-	var allZeroes block
-	var dc [nComponent]int
+	var (
+		b  block
+		dc [nColorComponent]int
+	)
 	for my := 0; my < myy; my++ {
 		for mx := 0; mx < mxx; mx++ {
-			for i := 0; i < nComponent; i++ {
-				qt := &d.quant[d.comps[i].tq]
-				for j := 0; j < d.comps[i].h*d.comps[i].v; j++ {
-					d.blocks[i][j] = allZeroes
+			for i := 0; i < d.nComp; i++ {
+				qt := &d.quant[d.comp[i].tq]
+				for j := 0; j < d.comp[i].h*d.comp[i].v; j++ {
+					// TODO(nigeltao): make this a "var b block" once the compiler's escape
+					// analysis is good enough to allocate it on the stack, not the heap.
+					b = block{}
 
 					// Decode the DC coefficient, as specified in section F.2.2.1.
-					value, err := d.decodeHuffman(&d.huff[dcTableClass][scanComps[i].td])
+					value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
 					if err != nil {
 						return err
 					}
@@ -297,11 +289,11 @@ func (d *decoder) processSOS(n int) os.Error {
 						return err
 					}
 					dc[i] += dcDelta
-					d.blocks[i][j][0] = dc[i] * qt[0]
+					b[0] = dc[i] * qt[0]
 
 					// Decode the AC coefficients, as specified in section F.2.2.2.
 					for k := 1; k < blockSize; k++ {
-						value, err := d.decodeHuffman(&d.huff[acTableClass][scanComps[i].ta])
+						value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
 						if err != nil {
 							return err
 						}
@@ -316,7 +308,7 @@ func (d *decoder) processSOS(n int) os.Error {
 							if err != nil {
 								return err
 							}
-							d.blocks[i][j][unzig[k]] = ac * qt[k]
+							b[unzig[k]] = ac * qt[k]
 						} else {
 							if val0 != 0x0f {
 								break
@@ -325,10 +317,32 @@ func (d *decoder) processSOS(n int) os.Error {
 						}
 					}
 
-					idct(&d.blocks[i][j])
+					// Perform the inverse DCT and store the MCU component to the image.
+					if d.nComp == nGrayComponent {
+						idct(d.tmp[:64], 8, &b)
+						// Convert from []uint8 to []image.GrayColor.
+						p := d.img1.Pix[8*(my*d.img1.Stride+mx):]
+						for y := 0; y < 8; y++ {
+							dst := p[y*d.img1.Stride:]
+							src := d.tmp[8*y:]
+							for x := 0; x < 8; x++ {
+								dst[x] = image.GrayColor{src[x]}
+							}
+						}
+					} else {
+						switch i {
+						case 0:
+							mx0 := h0*mx + (j % 2)
+							my0 := v0*my + (j / 2)
+							idct(d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride, &b)
+						case 1:
+							idct(d.img3.Cb[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
+						case 2:
+							idct(d.img3.Cr[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
+						}
+					}
 				} // for j
 			} // for i
-			d.storeMCU(mx, my)
 			mcu++
 			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
 				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
@@ -347,9 +361,7 @@ func (d *decoder) processSOS(n int) os.Error {
 				// Reset the Huffman decoder.
 				d.b = bits{}
 				// Reset the DC components, as per section F.2.1.3.1.
-				for i := 0; i < nComponent; i++ {
-					dc[i] = 0
-				}
+				dc = [nColorComponent]int{}
 			}
 		} // for mx
 	} // for my
@@ -437,7 +449,13 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, os.Error) {
 			return nil, err
 		}
 	}
-	return d.img, nil
+	if d.img1 != nil {
+		return d.img1, nil
+	}
+	if d.img3 != nil {
+		return d.img3, nil
+	}
+	return nil, FormatError("missing SOS marker")
 }
 
 // Decode reads a JPEG image from r and returns it as an image.Image.
@@ -453,7 +471,13 @@ func DecodeConfig(r io.Reader) (image.Config, os.Error) {
 	if _, err := d.decode(r, true); err != nil {
 		return image.Config{}, err
 	}
-	return image.Config{image.RGBAColorModel, d.width, d.height}, nil
+	switch d.nComp {
+	case nGrayComponent:
+		return image.Config{image.GrayColorModel, d.width, d.height}, nil
+	case nColorComponent:
+		return image.Config{ycbcr.YCbCrColorModel, d.width, d.height}, nil
+	}
+	return image.Config{}, FormatError("missing SOF marker")
 }
 
 func init() {
diff --git a/src/pkg/image/jpeg/writer.go b/src/pkg/image/jpeg/writer.go
index 52b3dc4e2..eddaaefb6 100644
--- a/src/pkg/image/jpeg/writer.go
+++ b/src/pkg/image/jpeg/writer.go
@@ -221,8 +221,7 @@ type encoder struct {
 	// buf is a scratch buffer.
 	buf [16]byte
 	// bits and nBits are accumulated bits to write to w.
-	bits  uint32
-	nBits uint8
+	bits, nBits uint32
 	// quant is the scaled quantization tables.
 	quant [nQuantIndex][blockSize]byte
 }
@@ -250,7 +249,7 @@ func (e *encoder) writeByte(b byte) {
 
 // emit emits the least significant nBits bits of bits to the bitstream.
 // The precondition is bits < 1<<nBits && nBits <= 16.
-func (e *encoder) emit(bits uint32, nBits uint8) {
+func (e *encoder) emit(bits, nBits uint32) {
 	nBits += e.nBits
 	bits <<= 32 - nBits
 	bits |= e.bits
@@ -269,7 +268,7 @@ func (e *encoder) emit(bits uint32, nBits uint8) {
 // emitHuff emits the given value with the given Huffman encoder.
 func (e *encoder) emitHuff(h huffIndex, value int) {
 	x := theHuffmanLUT[h][value]
-	e.emit(x&(1<<24-1), uint8(x>>24))
+	e.emit(x&(1<<24-1), x>>24)
 }
 
 // emitHuffRLE emits a run of runLength copies of value encoded with the given
@@ -279,11 +278,11 @@ func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int) {
 	if a < 0 {
 		a, b = -value, value-1
 	}
-	var nBits uint8
+	var nBits uint32
 	if a < 0x100 {
-		nBits = bitCount[a]
+		nBits = uint32(bitCount[a])
 	} else {
-		nBits = 8 + bitCount[a>>8]
+		nBits = 8 + uint32(bitCount[a>>8])
 	}
 	e.emitHuff(h, runLength<<4|int(nBits))
 	if nBits > 0 {
@@ -302,34 +301,31 @@ func (e *encoder) writeMarkerHeader(marker uint8, markerlen int) {
 
 // writeDQT writes the Define Quantization Table marker.
 func (e *encoder) writeDQT() {
-	markerlen := 2
-	for _, q := range e.quant {
-		markerlen += 1 + len(q)
-	}
+	markerlen := 2 + int(nQuantIndex)*(1+blockSize)
 	e.writeMarkerHeader(dqtMarker, markerlen)
-	for i, q := range e.quant {
+	for i := range e.quant {
 		e.writeByte(uint8(i))
-		e.write(q[:])
+		e.write(e.quant[i][:])
 	}
 }
 
 // writeSOF0 writes the Start Of Frame (Baseline) marker.
 func (e *encoder) writeSOF0(size image.Point) {
-	markerlen := 8 + 3*nComponent
+	markerlen := 8 + 3*nColorComponent
 	e.writeMarkerHeader(sof0Marker, markerlen)
 	e.buf[0] = 8 // 8-bit color.
 	e.buf[1] = uint8(size.Y >> 8)
 	e.buf[2] = uint8(size.Y & 0xff)
 	e.buf[3] = uint8(size.X >> 8)
 	e.buf[4] = uint8(size.X & 0xff)
-	e.buf[5] = nComponent
-	for i := 0; i < nComponent; i++ {
+	e.buf[5] = nColorComponent
+	for i := 0; i < nColorComponent; i++ {
 		e.buf[3*i+6] = uint8(i + 1)
 		// We use 4:2:0 chroma subsampling.
 		e.buf[3*i+7] = "\x22\x11\x11"[i]
 		e.buf[3*i+8] = "\x00\x01\x01"[i]
 	}
-	e.write(e.buf[:3*(nComponent-1)+9])
+	e.write(e.buf[:3*(nColorComponent-1)+9])
 }
 
 // writeDHT writes the Define Huffman Table marker.