8 files changed, 1151 insertions, 291 deletions
diff --git a/src/pkg/image/jpeg/dct_test.go b/src/pkg/image/jpeg/dct_test.go
new file mode 100644
index 000000000..7389f7e4f
--- /dev/null
+++ b/src/pkg/image/jpeg/dct_test.go
@@ -0,0 +1,299 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"math/rand"
+	"testing"
+)
+
+func benchmarkDCT(b *testing.B, f func(*block)) {
+	b.StopTimer()
+	blocks := make([]block, 0, b.N*len(testBlocks))
+	for i := 0; i < b.N; i++ {
+		blocks = append(blocks, testBlocks[:]...)
+	}
+	b.StartTimer()
+	for i := range blocks {
+		f(&blocks[i])
+	}
+}
+
+func BenchmarkFDCT(b *testing.B) {
+	benchmarkDCT(b, fdct)
+}
+
+func BenchmarkIDCT(b *testing.B) {
+	benchmarkDCT(b, idct)
+}
+
+func TestDCT(t *testing.T) {
+	blocks := make([]block, len(testBlocks))
+	copy(blocks, testBlocks[:])
+
+	// Append some randomly generated blocks of varying sparseness.
+	r := rand.New(rand.NewSource(123))
+	for i := 0; i < 100; i++ {
+		b := block{}
+		n := r.Int() % 64
+		for j := 0; j < n; j++ {
+			b[r.Int()%len(b)] = r.Int31() % 256
+		}
+		blocks = append(blocks, b)
+	}
+
+	// Check that the FDCT and IDCT functions are inverses, after a scale and
+	// level shift. Scaling reduces the rounding errors in the conversion from
+	// floats to ints.
+	for i, b := range blocks {
+		got, want := b, b
+		for j := range got {
+			got[j] = (got[j] - 128) * 8
+		}
+		slowFDCT(&got)
+		slowIDCT(&got)
+		for j := range got {
+			got[j] = got[j]/8 + 128
+		}
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT(FDCT)\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow FDCT implementations agree.
+	// The fdct function already does a scale and level shift.
+	for i, b := range blocks {
+		got, want := b, b
+		fdct(&got)
+		for j := range want {
+			want[j] = (want[j] - 128) * 8
+		}
+		slowFDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: FDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow IDCT implementations agree.
+	for i, b := range blocks {
+		got, want := b, b
+		idct(&got)
+		slowIDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+}
+
+// differ returns whether any pair-wise elements in b0 and b1 differ by 2 or
+// more. That tolerance is because there isn't a single definitive decoding of
+// a given JPEG image, even before the YCbCr to RGB conversion; implementations
+// can have different IDCT rounding errors.
+func differ(b0, b1 *block) bool {
+	for i := range b0 {
+		delta := b0[i] - b1[i]
+		if delta < -2 || +2 < delta {
+			return true
+		}
+	}
+	return false
+}
+
+// alpha returns 1 if i is 0 and returns √2 otherwise.
+func alpha(i int) float64 {
+	if i == 0 {
+		return 1
+	}
+	return math.Sqrt2
+}
+
+var cosines [32]float64 // cosines[k] = cos(π/2 * k/8)
+
+func init() {
+	for k := range cosines {
+		cosines[k] = math.Cos(math.Pi * float64(k) / 16)
+	}
+}
+
+// slowFDCT performs the 8*8 2-dimensional forward discrete cosine transform:
+//
+//	dst[u,v] = (1/8) * Σ_x Σ_y alpha(u) * alpha(v) * src[x,y] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowFDCT(b *block) {
+	var dst [blockSize]float64
+	for v := 0; v < 8; v++ {
+		for u := 0; u < 8; u++ {
+			sum := 0.0
+			for y := 0; y < 8; y++ {
+				for x := 0; x < 8; x++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*y+x]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*v+u] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+// slowIDCT performs the 8*8 2-dimensional inverse discrete cosine transform:
+//
+//	dst[x,y] = (1/8) * Σ_u Σ_v alpha(u) * alpha(v) * src[u,v] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowIDCT(b *block) {
+	var dst [blockSize]float64
+	for y := 0; y < 8; y++ {
+		for x := 0; x < 8; x++ {
+			sum := 0.0
+			for v := 0; v < 8; v++ {
+				for u := 0; u < 8; u++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*v+u]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*y+x] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+func (b *block) String() string {
+	s := bytes.NewBuffer(nil)
+	fmt.Fprintf(s, "{\n")
+	for y := 0; y < 8; y++ {
+		fmt.Fprintf(s, "\t")
+		for x := 0; x < 8; x++ {
+			fmt.Fprintf(s, "0x%04x, ", uint16(b[8*y+x]))
+		}
+		fmt.Fprintln(s)
+	}
+	fmt.Fprintf(s, "}")
+	return s.String()
+}
+
+// testBlocks are the first 10 pre-IDCT blocks from ../testdata/video-001.jpeg.
+var testBlocks = [10]block{
+	{
+		0x7f, 0xf6, 0x01, 0x07, 0xff, 0x00, 0x00, 0x00,
+		0xf5, 0x01, 0xfa, 0x01, 0xfe, 0x00, 0x01, 0x00,
+		0x05, 0x05, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xff, 0xf8, 0x00, 0x01, 0xff, 0x00, 0x00,
+		0x00, 0x01, 0x00, 0x01, 0x00, 0xff, 0xff, 0x00,
+		0xff, 0x0c, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x01, 0xff, 0x01, 0x00, 0xfe,
+	},
+	{
+		0x29, 0x07, 0x00, 0xfc, 0x01, 0x01, 0x00, 0x00,
+		0x07, 0x00, 0x03, 0x00, 0x01, 0x00, 0xff, 0xff,
+		0xff, 0xfd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x04, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xfa, 0x01, 0x00, 0x01, 0x00, 0x01, 0xff,
+		0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x02,
+	},
+	{
+		0xc5, 0xfa, 0x01, 0x00, 0x00, 0x01, 0x00, 0xff,
+		0x02, 0xff, 0x01, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0xff, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	{
+		0x86, 0x05, 0x00, 0x02, 0x00, 0x00, 0x01, 0x00,
+		0xf2, 0x06, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00,
+		0xf6, 0xfa, 0xf9, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0xf9, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x01, 0x00, 0xff, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x01, 0xff, 0x01, 0x00, 0xff, 0x00, 0x00,
+	},
+	{
+		0x24, 0xfe, 0x00, 0xff, 0x00, 0xff, 0xff, 0x00,
+		0x08, 0xfd, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00,
+		0x06, 0x03, 0x03, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x04, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x01, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x01,
+	},
+	{
+		0xcd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+		0x03, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0xff,
+	},
+	{
+		0x81, 0xfe, 0x05, 0xff, 0x01, 0xff, 0x01, 0x00,
+		0xef, 0xf9, 0x00, 0xf9, 0x00, 0xff, 0x00, 0xff,
+		0x05, 0xf9, 0x00, 0xf8, 0x01, 0xff, 0x01, 0xff,
+		0x00, 0xff, 0x07, 0x00, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x01,
+		0xff, 0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+	},
+	{
+		0x28, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x0b, 0x02, 0x01, 0x03, 0x00, 0xff, 0x00, 0x01,
+		0xfe, 0x02, 0x01, 0x03, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0xfd, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0x01, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x01, 0x01, 0x00, 0xff,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x01,
+	},
+	{
+		0xdf, 0xf9, 0xfe, 0x00, 0x03, 0x01, 0xff, 0xff,
+		0x04, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x01,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+	},
+	{
+		0x88, 0xfd, 0x00, 0x00, 0xff, 0x00, 0x01, 0xff,
+		0xe1, 0x06, 0x06, 0x01, 0xff, 0x00, 0x01, 0x00,
+		0x08, 0x00, 0xfa, 0x00, 0xff, 0xff, 0xff, 0xff,
+		0x08, 0x01, 0x00, 0xff, 0x01, 0xff, 0x00, 0x00,
+		0xf5, 0xff, 0x00, 0x01, 0xff, 0x01, 0x01, 0x00,
+		0xff, 0xff, 0x01, 0xff, 0x01, 0x00, 0x01, 0x00,
+		0x00, 0x01, 0x01, 0xff, 0x00, 0xff, 0x00, 0x01,
+		0x02, 0x00, 0x00, 0xff, 0xff, 0x00, 0xff, 0x00,
+	},
+}
diff --git a/src/pkg/image/jpeg/huffman.go b/src/pkg/image/jpeg/huffman.go
index d2382490f..9b731fdc4 100644
--- a/src/pkg/image/jpeg/huffman.go
+++ b/src/pkg/image/jpeg/huffman.go
@@ -15,9 +15,9 @@ const maxNumValues = 256
 // Bit stream for the Huffman decoder.
 // The n least significant bits of a form the unread bits, to be read in MSB to LSB order.
 type bits struct {
-	a int // accumulator.
-	n int // the number of unread bits in a.
-	m int // mask. m==1<<(n-1) when n>0, with m==0 when n==0.
+	a uint32 // accumulator.
+	m uint32 // mask. m==1<<(n-1) when n>0, with m==0 when n==0.
+	n int    // the number of unread bits in a.
 }
 
 // Huffman table decoder, specified in section C.
@@ -39,7 +39,7 @@ func (d *decoder) ensureNBits(n int) error {
 		if err != nil {
 			return err
 		}
-		d.b.a = d.b.a<<8 | int(c)
+		d.b.a = d.b.a<<8 | uint32(c)
 		d.b.n += 8
 		if d.b.m == 0 {
 			d.b.m = 1 << 7
@@ -61,15 +61,16 @@ func (d *decoder) ensureNBits(n int) error {
 }
 
 // The composition of RECEIVE and EXTEND, specified in section F.2.2.1.
-func (d *decoder) receiveExtend(t uint8) (int, error) {
-	err := d.ensureNBits(int(t))
-	if err != nil {
-		return 0, err
+func (d *decoder) receiveExtend(t uint8) (int32, error) {
+	if d.b.n < int(t) {
+		if err := d.ensureNBits(int(t)); err != nil {
+			return 0, err
+		}
 	}
 	d.b.n -= int(t)
 	d.b.m >>= t
-	s := 1 << t
-	x := (d.b.a >> uint8(d.b.n)) & (s - 1)
+	s := int32(1) << t
+	x := int32(d.b.a>>uint8(d.b.n)) & (s - 1)
 	if x < s>>1 {
 		x += ((-1) << t) + 1
 	}
@@ -92,8 +93,7 @@ func (d *decoder) processDHT(n int) error {
 			return FormatError("bad Tc value")
 		}
 		th := d.tmp[0] & 0x0f
-		const isBaseline = true // Progressive mode is not yet supported.
-		if th > maxTh || isBaseline && th > 1 {
+		if th > maxTh || !d.progressive && th > 1 {
 			return FormatError("bad Th value")
 		}
 		h := &d.huff[tc][th]
@@ -163,15 +163,16 @@ func (d *decoder) processDHT(n int) error {
 
 // Returns the next Huffman-coded value from the bit stream, decoded according to h.
 // TODO(nigeltao): This decoding algorithm is simple, but slow. A lookahead table, instead of always
-// peeling off only 1 bit at at time, ought to be faster.
+// peeling off only 1 bit at time, ought to be faster.
 func (d *decoder) decodeHuffman(h *huffman) (uint8, error) {
 	if h.length == 0 {
 		return 0, FormatError("uninitialized Huffman table")
 	}
 	for i, code := 0, 0; i < maxCodeLength; i++ {
-		err := d.ensureNBits(1)
-		if err != nil {
-			return 0, err
+		if d.b.n == 0 {
+			if err := d.ensureNBits(1); err != nil {
+				return 0, err
+			}
 		}
 		if d.b.a&d.b.m != 0 {
 			code |= 1
@@ -185,3 +186,28 @@ func (d *decoder) decodeHuffman(h *huffman) (uint8, error) {
 	}
 	return 0, FormatError("bad Huffman code")
 }
+
+func (d *decoder) decodeBit() (bool, error) {
+	if d.b.n == 0 {
+		if err := d.ensureNBits(1); err != nil {
+			return false, err
+		}
+	}
+	ret := d.b.a&d.b.m != 0
+	d.b.n--
+	d.b.m >>= 1
+	return ret, nil
+}
+
+func (d *decoder) decodeBits(n int) (uint32, error) {
+	if d.b.n < n {
+		if err := d.ensureNBits(n); err != nil {
+			return 0, err
+		}
+	}
+	ret := d.b.a >> uint(d.b.n-n)
+	ret &= (1 << uint(n)) - 1
+	d.b.n -= n
+	d.b.m >>= uint(n)
+	return ret, nil
+}
diff --git a/src/pkg/image/jpeg/idct.go b/src/pkg/image/jpeg/idct.go
index b387dfdff..46fcaecb7 100644
--- a/src/pkg/image/jpeg/idct.go
+++ b/src/pkg/image/jpeg/idct.go
@@ -37,6 +37,10 @@ package jpeg
  *
  */
 
+const blockSize = 64 // A DCT block is 8x8.
+
+type block [blockSize]int32
+
 const (
 	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
 	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
@@ -55,9 +59,7 @@ const (
 	r2 = 181 // 256/sqrt(2)
 )
 
-// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
-// +128 level shift and a clip to [0, 255], writing the results to dst.
-// stride is the number of elements between successive rows of dst.
+// idct performs a 2-D Inverse Discrete Cosine Transformation.
 //
 // The input coefficients should already have been multiplied by the
 // appropriate quantization table. We use fixed-point computation, with the
@@ -67,33 +69,34 @@ const (
 // For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
 // discrete W transform and for the discrete Fourier transform", IEEE Trans. on
 // ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(dst []byte, stride int, src *block) {
+func idct(src *block) {
 	// Horizontal 1-D IDCT.
 	for y := 0; y < 8; y++ {
+		y8 := y * 8
 		// If all the AC components are zero, then the IDCT is trivial.
-		if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
-			src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
-			dc := src[y*8+0] << 3
-			src[y*8+0] = dc
-			src[y*8+1] = dc
-			src[y*8+2] = dc
-			src[y*8+3] = dc
-			src[y*8+4] = dc
-			src[y*8+5] = dc
-			src[y*8+6] = dc
-			src[y*8+7] = dc
+		if src[y8+1] == 0 && src[y8+2] == 0 && src[y8+3] == 0 &&
+			src[y8+4] == 0 && src[y8+5] == 0 && src[y8+6] == 0 && src[y8+7] == 0 {
+			dc := src[y8+0] << 3
+			src[y8+0] = dc
+			src[y8+1] = dc
+			src[y8+2] = dc
+			src[y8+3] = dc
+			src[y8+4] = dc
+			src[y8+5] = dc
+			src[y8+6] = dc
+			src[y8+7] = dc
 			continue
 		}
 
 		// Prescale.
-		x0 := (src[y*8+0] << 11) + 128
-		x1 := src[y*8+4] << 11
-		x2 := src[y*8+6]
-		x3 := src[y*8+2]
-		x4 := src[y*8+1]
-		x5 := src[y*8+7]
-		x6 := src[y*8+5]
-		x7 := src[y*8+3]
+		x0 := (src[y8+0] << 11) + 128
+		x1 := src[y8+4] << 11
+		x2 := src[y8+6]
+		x3 := src[y8+2]
+		x4 := src[y8+1]
+		x5 := src[y8+7]
+		x6 := src[y8+5]
+		x7 := src[y8+3]
 
 		// Stage 1.
 		x8 := w7 * (x4 + x5)
@@ -123,14 +126,14 @@ func idct(dst []byte, stride int, src *block) {
 		x4 = (r2*(x4-x5) + 128) >> 8
 
 		// Stage 4.
-		src[8*y+0] = (x7 + x1) >> 8
-		src[8*y+1] = (x3 + x2) >> 8
-		src[8*y+2] = (x0 + x4) >> 8
-		src[8*y+3] = (x8 + x6) >> 8
-		src[8*y+4] = (x8 - x6) >> 8
-		src[8*y+5] = (x0 - x4) >> 8
-		src[8*y+6] = (x3 - x2) >> 8
-		src[8*y+7] = (x7 - x1) >> 8
+		src[y8+0] = (x7 + x1) >> 8
+		src[y8+1] = (x3 + x2) >> 8
+		src[y8+2] = (x0 + x4) >> 8
+		src[y8+3] = (x8 + x6) >> 8
+		src[y8+4] = (x8 - x6) >> 8
+		src[y8+5] = (x0 - x4) >> 8
+		src[y8+6] = (x3 - x2) >> 8
+		src[y8+7] = (x7 - x1) >> 8
 	}
 
 	// Vertical 1-D IDCT.
@@ -186,19 +189,4 @@ func idct(dst []byte, stride int, src *block) {
 		src[8*6+x] = (y3 - y2) >> 14
 		src[8*7+x] = (y7 - y1) >> 14
 	}
-
-	// Level shift by +128, clip to [0, 255], and write to dst.
-	for y := 0; y < 8; y++ {
-		for x := 0; x < 8; x++ {
-			c := src[y*8+x]
-			if c < -128 {
-				c = 0
-			} else if c > 127 {
-				c = 255
-			} else {
-				c += 128
-			}
-			dst[y*stride+x] = uint8(c)
-		}
-	}
 }
diff --git a/src/pkg/image/jpeg/reader.go b/src/pkg/image/jpeg/reader.go
index d9adf6e58..1ee6bbcd1 100644
--- a/src/pkg/image/jpeg/reader.go
+++ b/src/pkg/image/jpeg/reader.go
@@ -35,11 +35,7 @@ type component struct {
 	tq uint8 // Quantization table destination selector.
 }
 
-type block [blockSize]int
-
 const (
-	blockSize = 64 // A DCT block is 8x8.
-
 	dcTable = 0
 	acTable = 1
 	maxTc   = 1
@@ -51,7 +47,7 @@ const (
 	// A color JPEG image has Y, Cb and Cr components.
 	nColorComponent = 3
 
-	// We only support 4:4:4, 4:2:2 and 4:2:0 downsampling, and therefore the
+	// We only support 4:4:4, 4:4:0, 4:2:2 and 4:2:0 downsampling, and therefore the
 	// number of luma samples per chroma sample is at most 2 in the horizontal
 	// and 2 in the vertical direction.
 	maxH = 2
@@ -74,7 +70,9 @@ const (
 	comMarker   = 0xfe // COMment.
 )
 
-// Maps from the zig-zag ordering to the natural ordering.
+// unzig maps from the zig-zag ordering to the natural ordering. For example,
+// unzig[3] is the column and row of the fourth element in zig-zag order. The
+// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
 var unzig = [blockSize]int{
 	0, 1, 8, 16, 9, 2, 3, 10,
 	17, 24, 32, 25, 18, 11, 4, 5,
@@ -94,15 +92,18 @@ type Reader interface {
 
 type decoder struct {
 	r             Reader
+	b             bits
 	width, height int
 	img1          *image.Gray
 	img3          *image.YCbCr
 	ri            int // Restart Interval.
 	nComp         int
+	progressive   bool
+	eobRun        uint16 // End-of-Band run, specified in section G.1.2.2.
 	comp          [nColorComponent]component
+	progCoeffs    [nColorComponent][]block // Saved state between progressive-mode scans.
 	huff          [maxTc + 1][maxTh + 1]huffman
-	quant         [maxTq + 1]block
-	b             bits
+	quant         [maxTq + 1]block // Quantization tables, in zig-zag order.
 	tmp           [1024]byte
 }
 
@@ -146,20 +147,33 @@ func (d *decoder) processSOF(n int) error {
 		return UnsupportedError("SOF has wrong number of image components")
 	}
 	for i := 0; i < d.nComp; i++ {
-		hv := d.tmp[7+3*i]
-		d.comp[i].h = int(hv >> 4)
-		d.comp[i].v = int(hv & 0x0f)
 		d.comp[i].c = d.tmp[6+3*i]
 		d.comp[i].tq = d.tmp[8+3*i]
 		if d.nComp == nGrayComponent {
+			// If a JPEG image has only one component, section A.2 says "this data
+			// is non-interleaved by definition" and section A.2.2 says "[in this
+			// case...] the order of data units within a scan shall be left-to-right
+			// and top-to-bottom... regardless of the values of H_1 and V_1". Section
+			// 4.8.2 also says "[for non-interleaved data], the MCU is defined to be
+			// one data unit". Similarly, section A.1.1 explains that it is the ratio
+			// of H_i to max_j(H_j) that matters, and similarly for V. For grayscale
+			// images, H_1 is the maximum H_j for all components j, so that ratio is
+			// always 1. The component's (h, v) is effectively always (1, 1): even if
+			// the nominal (h, v) is (2, 1), a 20x5 image is encoded in three 8x8
+			// MCUs, not two 16x8 MCUs.
+			d.comp[i].h = 1
+			d.comp[i].v = 1
 			continue
 		}
-		// For color images, we only support 4:4:4, 4:2:2 or 4:2:0 chroma
+		hv := d.tmp[7+3*i]
+		d.comp[i].h = int(hv >> 4)
+		d.comp[i].v = int(hv & 0x0f)
+		// For color images, we only support 4:4:4, 4:4:0, 4:2:2 or 4:2:0 chroma
 		// downsampling ratios. This implies that the (h, v) values for the Y
-		// component are either (1, 1), (2, 1) or (2, 2), and the (h, v)
+		// component are either (1, 1), (1, 2), (2, 1) or (2, 2), and the (h, v)
 		// values for the Cr and Cb components must be (1, 1).
 		if i == 0 {
-			if hv != 0x11 && hv != 0x21 && hv != 0x22 {
+			if hv != 0x11 && hv != 0x21 && hv != 0x22 && hv != 0x12 {
 				return UnsupportedError("luma downsample ratio")
 			}
 		} else if hv != 0x11 {
@@ -186,7 +200,7 @@ func (d *decoder) processDQT(n int) error {
 			return FormatError("bad Tq value")
 		}
 		for i := range d.quant[tq] {
-			d.quant[tq][i] = int(d.tmp[i+1])
+			d.quant[tq][i] = int32(d.tmp[i+1])
 		}
 	}
 	if n != 0 {
@@ -195,161 +209,6 @@ func (d *decoder) processDQT(n int) error {
 	return nil
 }
 
-// makeImg allocates and initializes the destination image.
-func (d *decoder) makeImg(h0, v0, mxx, myy int) {
-	if d.nComp == nGrayComponent {
-		m := image.NewGray(image.Rect(0, 0, 8*mxx, 8*myy))
-		d.img1 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.Gray)
-		return
-	}
-	var subsampleRatio image.YCbCrSubsampleRatio
-	switch h0 * v0 {
-	case 1:
-		subsampleRatio = image.YCbCrSubsampleRatio444
-	case 2:
-		subsampleRatio = image.YCbCrSubsampleRatio422
-	case 4:
-		subsampleRatio = image.YCbCrSubsampleRatio420
-	default:
-		panic("unreachable")
-	}
-	m := image.NewYCbCr(image.Rect(0, 0, 8*h0*mxx, 8*v0*myy), subsampleRatio)
-	d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
-}
-
-// Specified in section B.2.3.
-func (d *decoder) processSOS(n int) error {
-	if d.nComp == 0 {
-		return FormatError("missing SOF marker")
-	}
-	if n != 4+2*d.nComp {
-		return UnsupportedError("SOS has wrong length")
-	}
-	_, err := io.ReadFull(d.r, d.tmp[0:4+2*d.nComp])
-	if err != nil {
-		return err
-	}
-	if int(d.tmp[0]) != d.nComp {
-		return UnsupportedError("SOS has wrong number of image components")
-	}
-	var scan [nColorComponent]struct {
-		td uint8 // DC table selector.
-		ta uint8 // AC table selector.
-	}
-	for i := 0; i < d.nComp; i++ {
-		cs := d.tmp[1+2*i] // Component selector.
-		if cs != d.comp[i].c {
-			return UnsupportedError("scan components out of order")
-		}
-		scan[i].td = d.tmp[2+2*i] >> 4
-		scan[i].ta = d.tmp[2+2*i] & 0x0f
-	}
-	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
-	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
-	mxx := (d.width + 8*h0 - 1) / (8 * h0)
-	myy := (d.height + 8*v0 - 1) / (8 * v0)
-	if d.img1 == nil && d.img3 == nil {
-		d.makeImg(h0, v0, mxx, myy)
-	}
-
-	mcu, expectedRST := 0, uint8(rst0Marker)
-	var (
-		b  block
-		dc [nColorComponent]int
-	)
-	for my := 0; my < myy; my++ {
-		for mx := 0; mx < mxx; mx++ {
-			for i := 0; i < d.nComp; i++ {
-				qt := &d.quant[d.comp[i].tq]
-				for j := 0; j < d.comp[i].h*d.comp[i].v; j++ {
-					// TODO(nigeltao): make this a "var b block" once the compiler's escape
-					// analysis is good enough to allocate it on the stack, not the heap.
-					b = block{}
-
-					// Decode the DC coefficient, as specified in section F.2.2.1.
-					value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
-					if err != nil {
-						return err
-					}
-					if value > 16 {
-						return UnsupportedError("excessive DC component")
-					}
-					dcDelta, err := d.receiveExtend(value)
-					if err != nil {
-						return err
-					}
-					dc[i] += dcDelta
-					b[0] = dc[i] * qt[0]
-
-					// Decode the AC coefficients, as specified in section F.2.2.2.
-					for k := 1; k < blockSize; k++ {
-						value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
-						if err != nil {
-							return err
-						}
-						val0 := value >> 4
-						val1 := value & 0x0f
-						if val1 != 0 {
-							k += int(val0)
-							if k > blockSize {
-								return FormatError("bad DCT index")
-							}
-							ac, err := d.receiveExtend(val1)
-							if err != nil {
-								return err
-							}
-							b[unzig[k]] = ac * qt[k]
-						} else {
-							if val0 != 0x0f {
-								break
-							}
-							k += 0x0f
-						}
-					}
-
-					// Perform the inverse DCT and store the MCU component to the image.
-					if d.nComp == nGrayComponent {
-						idct(d.img1.Pix[8*(my*d.img1.Stride+mx):], d.img1.Stride, &b)
-					} else {
-						switch i {
-						case 0:
-							mx0 := h0*mx + (j % 2)
-							my0 := v0*my + (j / 2)
-							idct(d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride, &b)
-						case 1:
-							idct(d.img3.Cb[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
-						case 2:
-							idct(d.img3.Cr[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
-						}
-					}
-				} // for j
-			} // for i
-			mcu++
-			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
-				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
-				// but this one assumes well-formed input, and hence the restart marker follows immediately.
-				_, err := io.ReadFull(d.r, d.tmp[0:2])
-				if err != nil {
-					return err
-				}
-				if d.tmp[0] != 0xff || d.tmp[1] != expectedRST {
-					return FormatError("bad RST marker")
-				}
-				expectedRST++
-				if expectedRST == rst7Marker+1 {
-					expectedRST = rst0Marker
-				}
-				// Reset the Huffman decoder.
-				d.b = bits{}
-				// Reset the DC components, as per section F.2.1.3.1.
-				dc = [nColorComponent]int{}
-			}
-		} // for mx
-	} // for my
-
-	return nil
-}
-
 // Specified in section B.2.4.4.
 func (d *decoder) processDRI(n int) error {
 	if n != 2 {
@@ -390,9 +249,26 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 			return nil, FormatError("missing 0xff marker start")
 		}
 		marker := d.tmp[1]
+		for marker == 0xff {
+			// Section B.1.1.2 says, "Any marker may optionally be preceded by any
+			// number of fill bytes, which are bytes assigned code X'FF'".
+			marker, err = d.r.ReadByte()
+			if err != nil {
+				return nil, err
+			}
+		}
 		if marker == eoiMarker { // End Of Image.
 			break
 		}
+		if rst0Marker <= marker && marker <= rst7Marker {
+			// Figures B.2 and B.16 of the specification suggest that restart markers should
+			// only occur between Entropy Coded Segments and not after the final ECS.
+			// However, some encoders may generate incorrect JPEGs with a final restart
+			// marker. That restart marker will be seen here instead of inside the processSOS
+			// method, and is ignored as a harmless error. Restart markers have no extra data,
+			// so we check for this before we read the 16-bit length of the segment.
+			continue
+		}
 
 		// Read the 16-bit length of the segment. The value includes the 2 bytes for the
 		// length itself, so we subtract 2 to get the number of remaining bytes.
@@ -406,13 +282,12 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 		}
 
 		switch {
-		case marker == sof0Marker: // Start Of Frame (Baseline).
+		case marker == sof0Marker || marker == sof2Marker: // Start Of Frame.
+			d.progressive = marker == sof2Marker
 			err = d.processSOF(n)
 			if configOnly {
 				return nil, err
 			}
-		case marker == sof2Marker: // Start Of Frame (Progressive).
-			err = UnsupportedError("progressive mode")
 		case marker == dhtMarker: // Define Huffman Table.
 			err = d.processDHT(n)
 		case marker == dqtMarker: // Define Quantization Table.
@@ -421,7 +296,7 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 			err = d.processSOS(n)
 		case marker == driMarker: // Define Restart Interval.
 			err = d.processDRI(n)
-		case marker >= app0Marker && marker <= app15Marker || marker == comMarker: // APPlication specific, or COMment.
+		case app0Marker <= marker && marker <= app15Marker || marker == comMarker: // APPlication specific, or COMment.
 			err = d.ignore(n)
 		default:
 			err = UnsupportedError("unknown marker")
diff --git a/src/pkg/image/jpeg/reader_test.go b/src/pkg/image/jpeg/reader_test.go
new file mode 100644
index 000000000..b520a8ab1
--- /dev/null
+++ b/src/pkg/image/jpeg/reader_test.go
@@ -0,0 +1,157 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+// TestDecodeProgressive tests that decoding the baseline and progressive
+// versions of the same image result in exactly the same pixel data, in YCbCr
+// space for color images, and Y space for grayscale images.
+func TestDecodeProgressive(t *testing.T) {
+	testCases := []string{
+		"../testdata/video-001",
+		"../testdata/video-001.q50.420",
+		"../testdata/video-001.q50.422",
+		"../testdata/video-001.q50.440",
+		"../testdata/video-001.q50.444",
+		"../testdata/video-005.gray.q50",
+		"../testdata/video-005.gray.q50.2x2",
+	}
+	for _, tc := range testCases {
+		m0, err := decodeFile(tc + ".jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".jpeg", err)
+			continue
+		}
+		m1, err := decodeFile(tc + ".progressive.jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".progressive.jpeg", err)
+			continue
+		}
+		if m0.Bounds() != m1.Bounds() {
+			t.Errorf("%s: bounds differ: %v and %v", tc, m0.Bounds(), m1.Bounds())
+			continue
+		}
+		switch m0 := m0.(type) {
+		case *image.YCbCr:
+			m1 := m1.(*image.YCbCr)
+			if err := check(m0.Bounds(), m0.Y, m1.Y, m0.YStride, m1.YStride); err != nil {
+				t.Errorf("%s (Y): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cb, m1.Cb, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cb): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cr, m1.Cr, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cr): %v", tc, err)
+				continue
+			}
+		case *image.Gray:
+			m1 := m1.(*image.Gray)
+			if err := check(m0.Bounds(), m0.Pix, m1.Pix, m0.Stride, m1.Stride); err != nil {
+				t.Errorf("%s: %v", tc, err)
+				continue
+			}
+		default:
+			t.Errorf("%s: unexpected image type %T", tc, m0)
+			continue
+		}
+	}
+}
+
+func decodeFile(filename string) (image.Image, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return Decode(f)
+
+}
+
+// check checks that the two pix data are equal, within the given bounds.
+func check(bounds image.Rectangle, pix0, pix1 []byte, stride0, stride1 int) error {
+	if len(pix0) != len(pix1) {
+		return fmt.Errorf("len(pix) %d and %d differ", len(pix0), len(pix1))
+	}
+	if stride0 != stride1 {
+		return fmt.Errorf("strides %d and %d differ", stride0, stride1)
+	}
+	if stride0%8 != 0 {
+		return fmt.Errorf("stride %d is not a multiple of 8", stride0)
+	}
+	// Compare the two pix data, one 8x8 block at a time.
+	for y := 0; y < len(pix0)/stride0; y += 8 {
+		for x := 0; x < stride0; x += 8 {
+			if x >= bounds.Max.X || y >= bounds.Max.Y {
+				// We don't care if the two pix data differ if the 8x8 block is
+				// entirely outside of the image's bounds. For example, this can
+				// occur with a 4:2:0 chroma subsampling and a 1x1 image. Baseline
+				// decoding works on the one 16x16 MCU as a whole; progressive
+				// decoding's first pass works on that 16x16 MCU as a whole but
+				// refinement passes only process one 8x8 block within the MCU.
+				continue
+			}
+
+			for j := 0; j < 8; j++ {
+				for i := 0; i < 8; i++ {
+					index := (y+j)*stride0 + (x + i)
+					if pix0[index] != pix1[index] {
+						return fmt.Errorf("blocks at (%d, %d) differ:\n%sand\n%s", x, y,
+							pixString(pix0, stride0, x, y),
+							pixString(pix1, stride1, x, y),
+						)
+					}
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func pixString(pix []byte, stride, x, y int) string {
+	s := bytes.NewBuffer(nil)
+	for j := 0; j < 8; j++ {
+		fmt.Fprintf(s, "\t")
+		for i := 0; i < 8; i++ {
+			fmt.Fprintf(s, "%02x ", pix[(y+j)*stride+(x+i)])
+		}
+		fmt.Fprintf(s, "\n")
+	}
+	return s.String()
+}
+
+func benchmarkDecode(b *testing.B, filename string) {
+	b.StopTimer()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+	cfg, err := DecodeConfig(bytes.NewReader(data))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(cfg.Width * cfg.Height * 4))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Decode(bytes.NewReader(data))
+	}
+}
+
+func BenchmarkDecodeBaseline(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.jpeg")
+}
+
+func BenchmarkDecodeProgressive(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.progressive.jpeg")
+}
diff --git a/src/pkg/image/jpeg/scan.go b/src/pkg/image/jpeg/scan.go
new file mode 100644
index 000000000..e3ae8ae44
--- /dev/null
+++ b/src/pkg/image/jpeg/scan.go
@@ -0,0 +1,432 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"image"
+	"io"
+)
+
+// makeImg allocates and initializes the destination image.
+func (d *decoder) makeImg(h0, v0, mxx, myy int) {
+	if d.nComp == nGrayComponent {
+		m := image.NewGray(image.Rect(0, 0, 8*mxx, 8*myy))
+		d.img1 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.Gray)
+		return
+	}
+	var subsampleRatio image.YCbCrSubsampleRatio
+	switch {
+	case h0 == 1 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio444
+	case h0 == 1 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio440
+	case h0 == 2 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio422
+	case h0 == 2 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio420
+	default:
+		panic("unreachable")
+	}
+	m := image.NewYCbCr(image.Rect(0, 0, 8*h0*mxx, 8*v0*myy), subsampleRatio)
+	d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
+}
+
+// Specified in section B.2.3.
+func (d *decoder) processSOS(n int) error {
+	if d.nComp == 0 {
+		return FormatError("missing SOF marker")
+	}
+	if n < 6 || 4+2*d.nComp < n || n%2 != 0 {
+		return FormatError("SOS has wrong length")
+	}
+	_, err := io.ReadFull(d.r, d.tmp[:n])
+	if err != nil {
+		return err
+	}
+	nComp := int(d.tmp[0])
+	if n != 4+2*nComp {
+		return FormatError("SOS length inconsistent with number of components")
+	}
+	var scan [nColorComponent]struct {
+		compIndex uint8
+		td        uint8 // DC table selector.
+		ta        uint8 // AC table selector.
+	}
+	for i := 0; i < nComp; i++ {
+		cs := d.tmp[1+2*i] // Component selector.
+		compIndex := -1
+		for j, comp := range d.comp {
+			if cs == comp.c {
+				compIndex = j
+			}
+		}
+		if compIndex < 0 {
+			return FormatError("unknown component selector")
+		}
+		scan[i].compIndex = uint8(compIndex)
+		scan[i].td = d.tmp[2+2*i] >> 4
+		scan[i].ta = d.tmp[2+2*i] & 0x0f
+	}
+
+	// zigStart and zigEnd are the spectral selection bounds.
+	// ah and al are the successive approximation high and low values.
+	// The spec calls these values Ss, Se, Ah and Al.
+	//
+	// For progressive JPEGs, these are the two more-or-less independent
+	// aspects of progression. Spectral selection progression is when not
+	// all of a block's 64 DCT coefficients are transmitted in one pass.
+	// For example, three passes could transmit coefficient 0 (the DC
+	// component), coefficients 1-5, and coefficients 6-63, in zig-zag
+	// order. Successive approximation is when not all of the bits of a
+	// band of coefficients are transmitted in one pass. For example,
+	// three passes could transmit the 6 most significant bits, followed
+	// by the second-least significant bit, followed by the least
+	// significant bit.
+	//
+	// For baseline JPEGs, these parameters are hard-coded to 0/63/0/0.
+	zigStart, zigEnd, ah, al := int32(0), int32(blockSize-1), uint32(0), uint32(0)
+	if d.progressive {
+		zigStart = int32(d.tmp[1+2*nComp])
+		zigEnd = int32(d.tmp[2+2*nComp])
+		ah = uint32(d.tmp[3+2*nComp] >> 4)
+		al = uint32(d.tmp[3+2*nComp] & 0x0f)
+		if (zigStart == 0 && zigEnd != 0) || zigStart > zigEnd || blockSize <= zigEnd {
+			return FormatError("bad spectral selection bounds")
+		}
+		if zigStart != 0 && nComp != 1 {
+			return FormatError("progressive AC coefficients for more than one component")
+		}
+		if ah != 0 && ah != al+1 {
+			return FormatError("bad successive approximation values")
+		}
+	}
+
+	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
+	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
+	mxx := (d.width + 8*h0 - 1) / (8 * h0)
+	myy := (d.height + 8*v0 - 1) / (8 * v0)
+	if d.img1 == nil && d.img3 == nil {
+		d.makeImg(h0, v0, mxx, myy)
+		if d.progressive {
+			for i := 0; i < nComp; i++ {
+				compIndex := scan[i].compIndex
+				d.progCoeffs[compIndex] = make([]block, mxx*myy*d.comp[compIndex].h*d.comp[compIndex].v)
+			}
+		}
+	}
+
+	d.b = bits{}
+	mcu, expectedRST := 0, uint8(rst0Marker)
+	var (
+		// b is the decoded coefficients, in natural (not zig-zag) order.
+		b  block
+		dc [nColorComponent]int32
+		// mx0 and my0 are the location of the current (in terms of 8x8 blocks).
+		// For example, with 4:2:0 chroma subsampling, the block whose top left
+		// pixel co-ordinates are (16, 8) is the third block in the first row:
+		// mx0 is 2 and my0 is 0, even though the pixel is in the second MCU.
+		// TODO(nigeltao): rename mx0 and my0 to bx and by?
+		mx0, my0   int
+		blockCount int
+	)
+	for my := 0; my < myy; my++ {
+		for mx := 0; mx < mxx; mx++ {
+			for i := 0; i < nComp; i++ {
+				compIndex := scan[i].compIndex
+				qt := &d.quant[d.comp[compIndex].tq]
+				for j := 0; j < d.comp[compIndex].h*d.comp[compIndex].v; j++ {
+					// The blocks are traversed one MCU at a time. For 4:2:0 chroma
+					// subsampling, there are four Y 8x8 blocks in every 16x16 MCU.
+					// For a baseline 32x16 pixel image, the Y blocks visiting order is:
+					//	0 1 4 5
+					//	2 3 6 7
+					//
+					// For progressive images, the DC data blocks (zigStart == 0) are traversed
+					// as above, but AC data blocks are traversed left to right, top to bottom:
+					//	0 1 2 3
+					//	4 5 6 7
+					//
+					// To further complicate matters, there is no AC data for any blocks that
+					// are inside the image at the MCU level but outside the image at the pixel
+					// level. For example, a 24x16 pixel 4:2:0 progressive image consists of
+					// two 16x16 MCUs. The earlier scans will process 8 Y blocks:
+					//	0 1 4 5
+					//	2 3 6 7
+					// The later scans will process only 6 Y blocks:
+					//	0 1 2
+					//	3 4 5
+					if zigStart == 0 {
+						mx0, my0 = d.comp[compIndex].h*mx, d.comp[compIndex].v*my
+						if h0 == 1 {
+							my0 += j
+						} else {
+							mx0 += j % 2
+							my0 += j / 2
+						}
+					} else {
+						q := mxx * d.comp[compIndex].h
+						mx0 = blockCount % q
+						my0 = blockCount / q
+						blockCount++
+						if mx0*8 >= d.width || my0*8 >= d.height {
+							continue
+						}
+					}
+
+					// Load the previous partially decoded coefficients, if applicable.
+					if d.progressive {
+						b = d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0]
+					} else {
+						b = block{}
+					}
+
+					if ah != 0 {
+						if err := d.refine(&b, &d.huff[acTable][scan[i].ta], zigStart, zigEnd, 1<<al); err != nil {
+							return err
+						}
+					} else {
+						zig := zigStart
+						if zig == 0 {
+							zig++
+							// Decode the DC coefficient, as specified in section F.2.2.1.
+							value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
+							if err != nil {
+								return err
+							}
+							if value > 16 {
+								return UnsupportedError("excessive DC component")
+							}
+							dcDelta, err := d.receiveExtend(value)
+							if err != nil {
+								return err
+							}
+							dc[compIndex] += dcDelta
+							b[0] = dc[compIndex] << al
+						}
+
+						if zig <= zigEnd && d.eobRun > 0 {
+							d.eobRun--
+						} else {
+							// Decode the AC coefficients, as specified in section F.2.2.2.
+							for ; zig <= zigEnd; zig++ {
+								value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
+								if err != nil {
+									return err
+								}
+								val0 := value >> 4
+								val1 := value & 0x0f
+								if val1 != 0 {
+									zig += int32(val0)
+									if zig > zigEnd {
+										break
+									}
+									ac, err := d.receiveExtend(val1)
+									if err != nil {
+										return err
+									}
+									b[unzig[zig]] = ac << al
+								} else {
+									if val0 != 0x0f {
+										d.eobRun = uint16(1 << val0)
+										if val0 != 0 {
+											bits, err := d.decodeBits(int(val0))
+											if err != nil {
+												return err
+											}
+											d.eobRun |= uint16(bits)
+										}
+										d.eobRun--
+										break
+									}
+									zig += 0x0f
+								}
+							}
+						}
+					}
+
+					if d.progressive {
+						if zigEnd != blockSize-1 || al != 0 {
+							// We haven't completely decoded this 8x8 block. Save the coefficients.
+							d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0] = b
+							// At this point, we could execute the rest of the loop body to dequantize and
+							// perform the inverse DCT, to save early stages of a progressive image to the
+							// *image.YCbCr buffers (the whole point of progressive encoding), but in Go,
+							// the jpeg.Decode function does not return until the entire image is decoded,
+							// so we "continue" here to avoid wasted computation.
+							continue
+						}
+					}
+
+					// Dequantize, perform the inverse DCT and store the block to the image.
+					for zig := 0; zig < blockSize; zig++ {
+						b[unzig[zig]] *= qt[zig]
+					}
+					idct(&b)
+					dst, stride := []byte(nil), 0
+					if d.nComp == nGrayComponent {
+						dst, stride = d.img1.Pix[8*(my0*d.img1.Stride+mx0):], d.img1.Stride
+					} else {
+						switch compIndex {
+						case 0:
+							dst, stride = d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride
+						case 1:
+							dst, stride = d.img3.Cb[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
+						case 2:
+							dst, stride = d.img3.Cr[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
+						default:
+							return UnsupportedError("too many components")
+						}
+					}
+					// Level shift by +128, clip to [0, 255], and write to dst.
+					for y := 0; y < 8; y++ {
+						y8 := y * 8
+						yStride := y * stride
+						for x := 0; x < 8; x++ {
+							c := b[y8+x]
+							if c < -128 {
+								c = 0
+							} else if c > 127 {
+								c = 255
+							} else {
+								c += 128
+							}
+							dst[yStride+x] = uint8(c)
+						}
+					}
+				} // for j
+			} // for i
+			mcu++
+			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
+				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
+				// but this one assumes well-formed input, and hence the restart marker follows immediately.
+				_, err := io.ReadFull(d.r, d.tmp[0:2])
+				if err != nil {
+					return err
+				}
+				if d.tmp[0] != 0xff || d.tmp[1] != expectedRST {
+					return FormatError("bad RST marker")
+				}
+				expectedRST++
+				if expectedRST == rst7Marker+1 {
+					expectedRST = rst0Marker
+				}
+				// Reset the Huffman decoder.
+				d.b = bits{}
+				// Reset the DC components, as per section F.2.1.3.1.
+				dc = [nColorComponent]int32{}
+				// Reset the progressive decoder state, as per section G.1.2.2.
+				d.eobRun = 0
+			}
+		} // for mx
+	} // for my
+
+	return nil
+}
+
+// refine decodes a successive approximation refinement block, as specified in
+// section G.1.2.
+func (d *decoder) refine(b *block, h *huffman, zigStart, zigEnd, delta int32) error {
+	// Refining a DC component is trivial.
+	if zigStart == 0 {
+		if zigEnd != 0 {
+			panic("unreachable")
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return err
+		}
+		if bit {
+			b[0] |= delta
+		}
+		return nil
+	}
+
+	// Refining AC components is more complicated; see sections G.1.2.2 and G.1.2.3.
+	zig := zigStart
+	if d.eobRun == 0 {
+	loop:
+		for ; zig <= zigEnd; zig++ {
+			z := int32(0)
+			value, err := d.decodeHuffman(h)
+			if err != nil {
+				return err
+			}
+			val0 := value >> 4
+			val1 := value & 0x0f
+
+			switch val1 {
+			case 0:
+				if val0 != 0x0f {
+					d.eobRun = uint16(1 << val0)
+					if val0 != 0 {
+						bits, err := d.decodeBits(int(val0))
+						if err != nil {
+							return err
+						}
+						d.eobRun |= uint16(bits)
+					}
+					break loop
+				}
+			case 1:
+				z = delta
+				bit, err := d.decodeBit()
+				if err != nil {
+					return err
+				}
+				if !bit {
+					z = -z
+				}
+			default:
+				return FormatError("unexpected Huffman code")
+			}
+
+			zig, err = d.refineNonZeroes(b, zig, zigEnd, int32(val0), delta)
+			if err != nil {
+				return err
+			}
+			if zig > zigEnd {
+				return FormatError("too many coefficients")
+			}
+			if z != 0 {
+				b[unzig[zig]] = z
+			}
+		}
+	}
+	if d.eobRun > 0 {
+		d.eobRun--
+		if _, err := d.refineNonZeroes(b, zig, zigEnd, -1, delta); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// refineNonZeroes refines non-zero entries of b in zig-zag order. If nz >= 0,
+// the first nz zero entries are skipped over.
+func (d *decoder) refineNonZeroes(b *block, zig, zigEnd, nz, delta int32) (int32, error) {
+	for ; zig <= zigEnd; zig++ {
+		u := unzig[zig]
+		if b[u] == 0 {
+			if nz == 0 {
+				break
+			}
+			nz--
+			continue
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return 0, err
+		}
+		if !bit {
+			continue
+		}
+		if b[u] >= 0 {
+			b[u] += delta
+		} else {
+			b[u] -= delta
+		}
+	}
+	return zig, nil
+}
diff --git a/src/pkg/image/jpeg/writer.go b/src/pkg/image/jpeg/writer.go
index 3322c09fe..c58fbf305 100644
--- a/src/pkg/image/jpeg/writer.go
+++ b/src/pkg/image/jpeg/writer.go
@@ -21,7 +21,7 @@ func min(x, y int) int {
 }
 
 // div returns a/b rounded to the nearest integer, instead of rounded to zero.
-func div(a int, b int) int {
+func div(a, b int32) int32 {
 	if a >= 0 {
 		return (a + (b >> 1)) / b
 	}
@@ -56,26 +56,28 @@ const (
 	nQuantIndex
 )
 
-// unscaledQuant are the unscaled quantization tables. Each encoder copies and
-// scales the tables according to its quality parameter.
+// unscaledQuant are the unscaled quantization tables in zig-zag order. Each
+// encoder copies and scales the tables according to its quality parameter.
+// The values are derived from section K.1 after converting from natural to
+// zig-zag order.
 var unscaledQuant = [nQuantIndex][blockSize]byte{
 	// Luminance.
 	{
-		16, 11, 10, 16, 24, 40, 51, 61,
-		12, 12, 14, 19, 26, 58, 60, 55,
-		14, 13, 16, 24, 40, 57, 69, 56,
-		14, 17, 22, 29, 51, 87, 80, 62,
-		18, 22, 37, 56, 68, 109, 103, 77,
-		24, 35, 55, 64, 81, 104, 113, 92,
-		49, 64, 78, 87, 103, 121, 120, 101,
-		72, 92, 95, 98, 112, 100, 103, 99,
+		16, 11, 12, 14, 12, 10, 16, 14,
+		13, 14, 18, 17, 16, 19, 24, 40,
+		26, 24, 22, 22, 24, 49, 35, 37,
+		29, 40, 58, 51, 61, 60, 57, 51,
+		56, 55, 64, 72, 92, 78, 64, 68,
+		87, 69, 55, 56, 80, 109, 81, 87,
+		95, 98, 103, 104, 103, 62, 77, 113,
+		121, 112, 100, 120, 92, 101, 103, 99,
 	},
 	// Chrominance.
 	{
-		17, 18, 24, 47, 99, 99, 99, 99,
-		18, 21, 26, 66, 99, 99, 99, 99,
-		24, 26, 56, 99, 99, 99, 99, 99,
-		47, 66, 99, 99, 99, 99, 99, 99,
+		17, 18, 18, 24, 21, 24, 47, 26,
+		26, 47, 99, 66, 56, 66, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
@@ -208,8 +210,8 @@ func init() {
 // writer is a buffered writer.
 type writer interface {
 	Flush() error
-	Write([]byte) (int, error)
-	WriteByte(byte) error
+	io.Writer
+	io.ByteWriter
 }
 
 // encoder encodes an image to the JPEG format.
@@ -222,7 +224,7 @@ type encoder struct {
 	buf [16]byte
 	// bits and nBits are accumulated bits to write to w.
 	bits, nBits uint32
-	// quant is the scaled quantization tables.
+	// quant is the scaled quantization tables, in zig-zag order.
 	quant [nQuantIndex][blockSize]byte
 }
 
@@ -266,14 +268,14 @@ func (e *encoder) emit(bits, nBits uint32) {
 }
 
 // emitHuff emits the given value with the given Huffman encoder.
-func (e *encoder) emitHuff(h huffIndex, value int) {
+func (e *encoder) emitHuff(h huffIndex, value int32) {
 	x := theHuffmanLUT[h][value]
 	e.emit(x&(1<<24-1), x>>24)
 }
 
 // emitHuffRLE emits a run of runLength copies of value encoded with the given
 // Huffman encoder.
-func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int) {
+func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int32) {
 	a, b := value, value
 	if a < 0 {
 		a, b = -value, value-1
@@ -284,7 +286,7 @@ func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int) {
 	} else {
 		nBits = 8 + uint32(bitCount[a>>8])
 	}
-	e.emitHuff(h, runLength<<4|int(nBits))
+	e.emitHuff(h, runLength<<4|int32(nBits))
 	if nBits > 0 {
 		e.emit(uint32(b)&(1<<nBits-1), nBits)
 	}
@@ -301,7 +303,7 @@ func (e *encoder) writeMarkerHeader(marker uint8, markerlen int) {
 
 // writeDQT writes the Define Quantization Table marker.
 func (e *encoder) writeDQT() {
-	markerlen := 2 + int(nQuantIndex)*(1+blockSize)
+	const markerlen = 2 + int(nQuantIndex)*(1+blockSize)
 	e.writeMarkerHeader(dqtMarker, markerlen)
 	for i := range e.quant {
 		e.writeByte(uint8(i))
@@ -311,7 +313,7 @@ func (e *encoder) writeDQT() {
 
 // writeSOF0 writes the Start Of Frame (Baseline) marker.
 func (e *encoder) writeSOF0(size image.Point) {
-	markerlen := 8 + 3*nColorComponent
+	const markerlen = 8 + 3*nColorComponent
 	e.writeMarkerHeader(sof0Marker, markerlen)
 	e.buf[0] = 8 // 8-bit color.
 	e.buf[1] = uint8(size.Y >> 8)
@@ -344,15 +346,16 @@ func (e *encoder) writeDHT() {
 
 // writeBlock writes a block of pixel data using the given quantization table,
 // returning the post-quantized DC value of the DCT-transformed block.
-func (e *encoder) writeBlock(b *block, q quantIndex, prevDC int) int {
+// b is in natural (not zig-zag) order.
+func (e *encoder) writeBlock(b *block, q quantIndex, prevDC int32) int32 {
 	fdct(b)
 	// Emit the DC delta.
-	dc := div(b[0], (8 * int(e.quant[q][0])))
+	dc := div(b[0], 8*int32(e.quant[q][0]))
 	e.emitHuffRLE(huffIndex(2*q+0), 0, dc-prevDC)
 	// Emit the AC components.
-	h, runLength := huffIndex(2*q+1), 0
-	for k := 1; k < blockSize; k++ {
-		ac := div(b[unzig[k]], (8 * int(e.quant[q][k])))
+	h, runLength := huffIndex(2*q+1), int32(0)
+	for zig := 1; zig < blockSize; zig++ {
+		ac := div(b[unzig[zig]], 8*int32(e.quant[q][zig]))
 		if ac == 0 {
 			runLength++
 		} else {
@@ -380,9 +383,9 @@ func toYCbCr(m image.Image, p image.Point, yBlock, cbBlock, crBlock *block) {
 		for i := 0; i < 8; i++ {
 			r, g, b, _ := m.At(min(p.X+i, xmax), min(p.Y+j, ymax)).RGBA()
 			yy, cb, cr := color.RGBToYCbCr(uint8(r>>8), uint8(g>>8), uint8(b>>8))
-			yBlock[8*j+i] = int(yy)
-			cbBlock[8*j+i] = int(cb)
-			crBlock[8*j+i] = int(cr)
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
 		}
 	}
 }
@@ -405,9 +408,9 @@ func rgbaToYCbCr(m *image.RGBA, p image.Point, yBlock, cbBlock, crBlock *block)
 			}
 			pix := m.Pix[offset+sx*4:]
 			yy, cb, cr := color.RGBToYCbCr(pix[0], pix[1], pix[2])
-			yBlock[8*j+i] = int(yy)
-			cbBlock[8*j+i] = int(cb)
-			crBlock[8*j+i] = int(cr)
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
 		}
 	}
 }
@@ -433,10 +436,12 @@ func scale(dst *block, src *[4]block) {
 //	- component 1 uses DC table 0 and AC table 0 "\x01\x00",
 //	- component 2 uses DC table 1 and AC table 1 "\x02\x11",
 //	- component 3 uses DC table 1 and AC table 1 "\x03\x11",
-//	- padding "\x00\x00\x00".
+//	- the bytes "\x00\x3f\x00". Section B.2.3 of the spec says that for
+//	  sequential DCTs, those bytes (8-bit Ss, 8-bit Se, 4-bit Ah, 4-bit Al)
+//	  should be 0x00, 0x3f, 0x00<<4 | 0x00.
 var sosHeader = []byte{
 	0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
-	0x11, 0x03, 0x11, 0x00, 0x00, 0x00,
+	0x11, 0x03, 0x11, 0x00, 0x3f, 0x00,
 }
 
 // writeSOS writes the StartOfScan marker.
@@ -444,12 +449,11 @@ func (e *encoder) writeSOS(m image.Image) {
 	e.write(sosHeader)
 	var (
 		// Scratch buffers to hold the YCbCr values.
-		yBlock  block
-		cbBlock [4]block
-		crBlock [4]block
-		cBlock  block
+		// The blocks are in natural (not zig-zag) order.
+		b      block
+		cb, cr [4]block
 		// DC components are delta-encoded.
-		prevDCY, prevDCCb, prevDCCr int
+		prevDCY, prevDCCb, prevDCCr int32
 	)
 	bounds := m.Bounds()
 	rgba, _ := m.(*image.RGBA)
@@ -460,16 +464,16 @@ func (e *encoder) writeSOS(m image.Image) {
 				yOff := (i & 2) * 4
 				p := image.Pt(x+xOff, y+yOff)
 				if rgba != nil {
-					rgbaToYCbCr(rgba, p, &yBlock, &cbBlock[i], &crBlock[i])
+					rgbaToYCbCr(rgba, p, &b, &cb[i], &cr[i])
 				} else {
-					toYCbCr(m, p, &yBlock, &cbBlock[i], &crBlock[i])
+					toYCbCr(m, p, &b, &cb[i], &cr[i])
 				}
-				prevDCY = e.writeBlock(&yBlock, 0, prevDCY)
+				prevDCY = e.writeBlock(&b, 0, prevDCY)
 			}
-			scale(&cBlock, &cbBlock)
-			prevDCCb = e.writeBlock(&cBlock, 1, prevDCCb)
-			scale(&cBlock, &crBlock)
-			prevDCCr = e.writeBlock(&cBlock, 1, prevDCCr)
+			scale(&b, &cb)
+			prevDCCb = e.writeBlock(&b, 1, prevDCCb)
+			scale(&b, &cr)
+			prevDCCr = e.writeBlock(&b, 1, prevDCCr)
 		}
 	}
 	// Pad the last byte with 1's.
diff --git a/src/pkg/image/jpeg/writer_test.go b/src/pkg/image/jpeg/writer_test.go
index b8e8fa34e..0b2143f5b 100644
--- a/src/pkg/image/jpeg/writer_test.go
+++ b/src/pkg/image/jpeg/writer_test.go
@@ -6,6 +6,7 @@ package jpeg
 
 import (
 	"bytes"
+	"fmt"
 	"image"
 	"image/color"
 	"image/png"
@@ -15,6 +16,87 @@ import (
 	"testing"
 )
 
+// zigzag maps from the natural ordering to the zig-zag ordering. For example,
+// zigzag[0*8 + 3] is the zig-zag sequence number of the element in the fourth
+// column and first row.
+var zigzag = [blockSize]int{
+	0, 1, 5, 6, 14, 15, 27, 28,
+	2, 4, 7, 13, 16, 26, 29, 42,
+	3, 8, 12, 17, 25, 30, 41, 43,
+	9, 11, 18, 24, 31, 40, 44, 53,
+	10, 19, 23, 32, 39, 45, 52, 54,
+	20, 22, 33, 38, 46, 51, 55, 60,
+	21, 34, 37, 47, 50, 56, 59, 61,
+	35, 36, 48, 49, 57, 58, 62, 63,
+}
+
+func TestZigUnzig(t *testing.T) {
+	for i := 0; i < blockSize; i++ {
+		if unzig[zigzag[i]] != i {
+			t.Errorf("unzig[zigzag[%d]] == %d", i, unzig[zigzag[i]])
+		}
+		if zigzag[unzig[i]] != i {
+			t.Errorf("zigzag[unzig[%d]] == %d", i, zigzag[unzig[i]])
+		}
+	}
+}
+
+// unscaledQuantInNaturalOrder are the unscaled quantization tables in
+// natural (not zig-zag) order, as specified in section K.1.
+var unscaledQuantInNaturalOrder = [nQuantIndex][blockSize]byte{
+	// Luminance.
+	{
+		16, 11, 10, 16, 24, 40, 51, 61,
+		12, 12, 14, 19, 26, 58, 60, 55,
+		14, 13, 16, 24, 40, 57, 69, 56,
+		14, 17, 22, 29, 51, 87, 80, 62,
+		18, 22, 37, 56, 68, 109, 103, 77,
+		24, 35, 55, 64, 81, 104, 113, 92,
+		49, 64, 78, 87, 103, 121, 120, 101,
+		72, 92, 95, 98, 112, 100, 103, 99,
+	},
+	// Chrominance.
+	{
+		17, 18, 24, 47, 99, 99, 99, 99,
+		18, 21, 26, 66, 99, 99, 99, 99,
+		24, 26, 56, 99, 99, 99, 99, 99,
+		47, 66, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+	},
+}
+
+func TestUnscaledQuant(t *testing.T) {
+	bad := false
+	for i := quantIndex(0); i < nQuantIndex; i++ {
+		for zig := 0; zig < blockSize; zig++ {
+			got := unscaledQuant[i][zig]
+			want := unscaledQuantInNaturalOrder[i][unzig[zig]]
+			if got != want {
+				t.Errorf("i=%d, zig=%d: got %d, want %d", i, zig, got, want)
+				bad = true
+			}
+		}
+	}
+	if bad {
+		names := [nQuantIndex]string{"Luminance", "Chrominance"}
+		buf := &bytes.Buffer{}
+		for i, name := range names {
+			fmt.Fprintf(buf, "// %s.\n{\n", name)
+			for zig := 0; zig < blockSize; zig++ {
+				fmt.Fprintf(buf, "%d, ", unscaledQuantInNaturalOrder[i][unzig[zig]])
+				if zig%8 == 7 {
+					buf.WriteString("\n")
+				}
+			}
+			buf.WriteString("},\n")
+		}
+		t.Logf("expected unscaledQuant values:\n%s", buf.String())
+	}
+}
+
 var testCase = []struct {
 	filename  string
 	quality   int
@@ -89,24 +171,21 @@ func TestWriter(t *testing.T) {
 	}
 }
 
-func BenchmarkEncodeRGBOpaque(b *testing.B) {
+func BenchmarkEncode(b *testing.B) {
 	b.StopTimer()
 	img := image.NewRGBA(image.Rect(0, 0, 640, 480))
-	// Set all pixels to 0xFF alpha to force opaque mode.
 	bo := img.Bounds()
 	rnd := rand.New(rand.NewSource(123))
 	for y := bo.Min.Y; y < bo.Max.Y; y++ {
 		for x := bo.Min.X; x < bo.Max.X; x++ {
-			img.Set(x, y, color.RGBA{
+			img.SetRGBA(x, y, color.RGBA{
 				uint8(rnd.Intn(256)),
 				uint8(rnd.Intn(256)),
 				uint8(rnd.Intn(256)),
-				255})
+				255,
+			})
 		}
 	}
-	if !img.Opaque() {
-		b.Fatal("expected image to be opaque")
-	}
 	b.SetBytes(640 * 480 * 4)
 	b.StartTimer()
 	options := &Options{Quality: 90}