38 files changed, 1595 insertions, 418 deletions
diff --git a/src/pkg/image/decode_test.go b/src/pkg/image/decode_test.go
index d65986724..8dee57ee4 100644
--- a/src/pkg/image/decode_test.go
+++ b/src/pkg/image/decode_test.go
@@ -31,6 +31,7 @@ var imageTests = []imageTest{
 	{"testdata/video-001.png", "testdata/video-001.5bpp.gif", 128 << 8},
 	// JPEG is a lossy format and hence needs a non-zero tolerance.
 	{"testdata/video-001.png", "testdata/video-001.jpeg", 8 << 8},
+	{"testdata/video-001.png", "testdata/video-001.progressive.jpeg", 8 << 8},
 	// Grayscale images.
 	{"testdata/video-005.gray.png", "testdata/video-005.gray.jpeg", 8 << 8},
 	{"testdata/video-005.gray.png", "testdata/video-005.gray.png", 0},
diff --git a/src/pkg/image/draw/draw.go b/src/pkg/image/draw/draw.go
index bef325c0c..56d30dd6f 100644
--- a/src/pkg/image/draw/draw.go
+++ b/src/pkg/image/draw/draw.go
@@ -81,8 +81,9 @@ func DrawMask(dst Image, r image.Rectangle, src image.Image, sp image.Point, mas
 					drawNRGBAOver(dst0, r, src0, sp)
 					return
 				case *image.YCbCr:
-					drawYCbCr(dst0, r, src0, sp)
-					return
+					if drawYCbCr(dst0, r, src0, sp) {
+						return
+					}
 				}
 			} else if mask0, ok := mask.(*image.Alpha); ok {
 				switch src0 := src.(type) {
@@ -104,8 +105,9 @@ func DrawMask(dst Image, r image.Rectangle, src image.Image, sp image.Point, mas
 					drawNRGBASrc(dst0, r, src0, sp)
 					return
 				case *image.YCbCr:
-					drawYCbCr(dst0, r, src0, sp)
-					return
+					if drawYCbCr(dst0, r, src0, sp) {
+						return
+					}
 				}
 			}
 		}
@@ -345,7 +347,7 @@ func drawNRGBASrc(dst *image.RGBA, r image.Rectangle, src *image.NRGBA, sp image
 	}
 }
 
-func drawYCbCr(dst *image.RGBA, r image.Rectangle, src *image.YCbCr, sp image.Point) {
+func drawYCbCr(dst *image.RGBA, r image.Rectangle, src *image.YCbCr, sp image.Point) (ok bool) {
 	// An image.YCbCr is always fully opaque, and so if the mask is implicitly nil
 	// (i.e. fully opaque) then the op is effectively always Src.
 	x0 := (r.Min.X - dst.Rect.Min.X) * 4
@@ -353,6 +355,19 @@ func drawYCbCr(dst *image.RGBA, r image.Rectangle, src *image.YCbCr, sp image.Po
 	y0 := r.Min.Y - dst.Rect.Min.Y
 	y1 := r.Max.Y - dst.Rect.Min.Y
 	switch src.SubsampleRatio {
+	case image.YCbCrSubsampleRatio444:
+		for y, sy := y0, sp.Y; y != y1; y, sy = y+1, sy+1 {
+			dpix := dst.Pix[y*dst.Stride:]
+			yi := (sy-src.Rect.Min.Y)*src.YStride + (sp.X - src.Rect.Min.X)
+			ci := (sy-src.Rect.Min.Y)*src.CStride + (sp.X - src.Rect.Min.X)
+			for x := x0; x != x1; x, yi, ci = x+4, yi+1, ci+1 {
+				rr, gg, bb := color.YCbCrToRGB(src.Y[yi], src.Cb[ci], src.Cr[ci])
+				dpix[x+0] = rr
+				dpix[x+1] = gg
+				dpix[x+2] = bb
+				dpix[x+3] = 255
+			}
+		}
 	case image.YCbCrSubsampleRatio422:
 		for y, sy := y0, sp.Y; y != y1; y, sy = y+1, sy+1 {
 			dpix := dst.Pix[y*dst.Stride:]
@@ -381,12 +396,11 @@ func drawYCbCr(dst *image.RGBA, r image.Rectangle, src *image.YCbCr, sp image.Po
 				dpix[x+3] = 255
 			}
 		}
-	default:
-		// Default to 4:4:4 subsampling.
+	case image.YCbCrSubsampleRatio440:
 		for y, sy := y0, sp.Y; y != y1; y, sy = y+1, sy+1 {
 			dpix := dst.Pix[y*dst.Stride:]
 			yi := (sy-src.Rect.Min.Y)*src.YStride + (sp.X - src.Rect.Min.X)
-			ci := (sy-src.Rect.Min.Y)*src.CStride + (sp.X - src.Rect.Min.X)
+			ci := (sy/2-src.Rect.Min.Y/2)*src.CStride + (sp.X - src.Rect.Min.X)
 			for x := x0; x != x1; x, yi, ci = x+4, yi+1, ci+1 {
 				rr, gg, bb := color.YCbCrToRGB(src.Y[yi], src.Cb[ci], src.Cr[ci])
 				dpix[x+0] = rr
@@ -395,7 +409,10 @@ func drawYCbCr(dst *image.RGBA, r image.Rectangle, src *image.YCbCr, sp image.Po
 				dpix[x+3] = 255
 			}
 		}
+	default:
+		return false
 	}
+	return true
 }
 
 func drawGlyphOver(dst *image.RGBA, r image.Rectangle, src *image.Uniform, mask *image.Alpha, mp image.Point) {
diff --git a/src/pkg/image/format.go b/src/pkg/image/format.go
index f93d356b0..36635bcc5 100644
--- a/src/pkg/image/format.go
+++ b/src/pkg/image/format.go
@@ -39,7 +39,7 @@ type reader interface {
 	Peek(int) ([]byte, error)
 }
 
-// AsReader converts an io.Reader to a reader.
+// asReader converts an io.Reader to a reader.
 func asReader(r io.Reader) reader {
 	if rr, ok := r.(reader); ok {
 		return rr
diff --git a/src/pkg/image/jpeg/dct_test.go b/src/pkg/image/jpeg/dct_test.go
new file mode 100644
index 000000000..7389f7e4f
--- /dev/null
+++ b/src/pkg/image/jpeg/dct_test.go
@@ -0,0 +1,299 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"math/rand"
+	"testing"
+)
+
+func benchmarkDCT(b *testing.B, f func(*block)) {
+	b.StopTimer()
+	blocks := make([]block, 0, b.N*len(testBlocks))
+	for i := 0; i < b.N; i++ {
+		blocks = append(blocks, testBlocks[:]...)
+	}
+	b.StartTimer()
+	for i := range blocks {
+		f(&blocks[i])
+	}
+}
+
+func BenchmarkFDCT(b *testing.B) {
+	benchmarkDCT(b, fdct)
+}
+
+func BenchmarkIDCT(b *testing.B) {
+	benchmarkDCT(b, idct)
+}
+
+func TestDCT(t *testing.T) {
+	blocks := make([]block, len(testBlocks))
+	copy(blocks, testBlocks[:])
+
+	// Append some randomly generated blocks of varying sparseness.
+	r := rand.New(rand.NewSource(123))
+	for i := 0; i < 100; i++ {
+		b := block{}
+		n := r.Int() % 64
+		for j := 0; j < n; j++ {
+			b[r.Int()%len(b)] = r.Int31() % 256
+		}
+		blocks = append(blocks, b)
+	}
+
+	// Check that the FDCT and IDCT functions are inverses, after a scale and
+	// level shift. Scaling reduces the rounding errors in the conversion from
+	// floats to ints.
+	for i, b := range blocks {
+		got, want := b, b
+		for j := range got {
+			got[j] = (got[j] - 128) * 8
+		}
+		slowFDCT(&got)
+		slowIDCT(&got)
+		for j := range got {
+			got[j] = got[j]/8 + 128
+		}
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT(FDCT)\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow FDCT implementations agree.
+	// The fdct function already does a scale and level shift.
+	for i, b := range blocks {
+		got, want := b, b
+		fdct(&got)
+		for j := range want {
+			want[j] = (want[j] - 128) * 8
+		}
+		slowFDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: FDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow IDCT implementations agree.
+	for i, b := range blocks {
+		got, want := b, b
+		idct(&got)
+		slowIDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+}
+
+// differ returns whether any pair-wise elements in b0 and b1 differ by more
+// than 2. That tolerance is because there isn't a single definitive decoding of
+// a given JPEG image, even before the YCbCr to RGB conversion; implementations
+// can have different IDCT rounding errors.
+func differ(b0, b1 *block) bool {
+	for i := range b0 {
+		delta := b0[i] - b1[i]
+		if delta < -2 || +2 < delta {
+			return true
+		}
+	}
+	return false
+}
+
+// alpha returns 1 if i is 0 and returns √2 otherwise.
+func alpha(i int) float64 {
+	if i == 0 {
+		return 1
+	}
+	return math.Sqrt2
+}
+
+var cosines [32]float64 // cosines[k] = cos(π/2 * k/8)
+
+func init() {
+	for k := range cosines {
+		cosines[k] = math.Cos(math.Pi * float64(k) / 16)
+	}
+}
+
+// slowFDCT performs the 8*8 2-dimensional forward discrete cosine transform:
+//
+//	dst[u,v] = (1/8) * Σ_x Σ_y alpha(u) * alpha(v) * src[x,y] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowFDCT(b *block) {
+	var dst [blockSize]float64
+	for v := 0; v < 8; v++ {
+		for u := 0; u < 8; u++ {
+			sum := 0.0
+			for y := 0; y < 8; y++ {
+				for x := 0; x < 8; x++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*y+x]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*v+u] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+// slowIDCT performs the 8*8 2-dimensional inverse discrete cosine transform:
+//
+//	dst[x,y] = (1/8) * Σ_u Σ_v alpha(u) * alpha(v) * src[u,v] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowIDCT(b *block) {
+	var dst [blockSize]float64
+	for y := 0; y < 8; y++ {
+		for x := 0; x < 8; x++ {
+			sum := 0.0
+			for v := 0; v < 8; v++ {
+				for u := 0; u < 8; u++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*v+u]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*y+x] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+func (b *block) String() string {
+	s := bytes.NewBuffer(nil)
+	fmt.Fprintf(s, "{\n")
+	for y := 0; y < 8; y++ {
+		fmt.Fprintf(s, "\t")
+		for x := 0; x < 8; x++ {
+			fmt.Fprintf(s, "0x%04x, ", uint16(b[8*y+x]))
+		}
+		fmt.Fprintln(s)
+	}
+	fmt.Fprintf(s, "}")
+	return s.String()
+}
+
+// testBlocks are the first 10 pre-IDCT blocks from ../testdata/video-001.jpeg.
+var testBlocks = [10]block{
+	{
+		0x7f, 0xf6, 0x01, 0x07, 0xff, 0x00, 0x00, 0x00,
+		0xf5, 0x01, 0xfa, 0x01, 0xfe, 0x00, 0x01, 0x00,
+		0x05, 0x05, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xff, 0xf8, 0x00, 0x01, 0xff, 0x00, 0x00,
+		0x00, 0x01, 0x00, 0x01, 0x00, 0xff, 0xff, 0x00,
+		0xff, 0x0c, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x01, 0xff, 0x01, 0x00, 0xfe,
+	},
+	{
+		0x29, 0x07, 0x00, 0xfc, 0x01, 0x01, 0x00, 0x00,
+		0x07, 0x00, 0x03, 0x00, 0x01, 0x00, 0xff, 0xff,
+		0xff, 0xfd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x04, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xfa, 0x01, 0x00, 0x01, 0x00, 0x01, 0xff,
+		0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x02,
+	},
+	{
+		0xc5, 0xfa, 0x01, 0x00, 0x00, 0x01, 0x00, 0xff,
+		0x02, 0xff, 0x01, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0xff, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	{
+		0x86, 0x05, 0x00, 0x02, 0x00, 0x00, 0x01, 0x00,
+		0xf2, 0x06, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00,
+		0xf6, 0xfa, 0xf9, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0xf9, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x01, 0x00, 0xff, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x01, 0xff, 0x01, 0x00, 0xff, 0x00, 0x00,
+	},
+	{
+		0x24, 0xfe, 0x00, 0xff, 0x00, 0xff, 0xff, 0x00,
+		0x08, 0xfd, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00,
+		0x06, 0x03, 0x03, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x04, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x01, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x01,
+	},
+	{
+		0xcd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+		0x03, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0xff,
+	},
+	{
+		0x81, 0xfe, 0x05, 0xff, 0x01, 0xff, 0x01, 0x00,
+		0xef, 0xf9, 0x00, 0xf9, 0x00, 0xff, 0x00, 0xff,
+		0x05, 0xf9, 0x00, 0xf8, 0x01, 0xff, 0x01, 0xff,
+		0x00, 0xff, 0x07, 0x00, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x01,
+		0xff, 0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+	},
+	{
+		0x28, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x0b, 0x02, 0x01, 0x03, 0x00, 0xff, 0x00, 0x01,
+		0xfe, 0x02, 0x01, 0x03, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0xfd, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0x01, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x01, 0x01, 0x00, 0xff,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x01,
+	},
+	{
+		0xdf, 0xf9, 0xfe, 0x00, 0x03, 0x01, 0xff, 0xff,
+		0x04, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x01,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+	},
+	{
+		0x88, 0xfd, 0x00, 0x00, 0xff, 0x00, 0x01, 0xff,
+		0xe1, 0x06, 0x06, 0x01, 0xff, 0x00, 0x01, 0x00,
+		0x08, 0x00, 0xfa, 0x00, 0xff, 0xff, 0xff, 0xff,
+		0x08, 0x01, 0x00, 0xff, 0x01, 0xff, 0x00, 0x00,
+		0xf5, 0xff, 0x00, 0x01, 0xff, 0x01, 0x01, 0x00,
+		0xff, 0xff, 0x01, 0xff, 0x01, 0x00, 0x01, 0x00,
+		0x00, 0x01, 0x01, 0xff, 0x00, 0xff, 0x00, 0x01,
+		0x02, 0x00, 0x00, 0xff, 0xff, 0x00, 0xff, 0x00,
+	},
+}
diff --git a/src/pkg/image/jpeg/huffman.go b/src/pkg/image/jpeg/huffman.go
index d2382490f..9b731fdc4 100644
--- a/src/pkg/image/jpeg/huffman.go
+++ b/src/pkg/image/jpeg/huffman.go
@@ -15,9 +15,9 @@ const maxNumValues = 256
 // Bit stream for the Huffman decoder.
 // The n least significant bits of a form the unread bits, to be read in MSB to LSB order.
 type bits struct {
-	a int // accumulator.
-	n int // the number of unread bits in a.
-	m int // mask. m==1<<(n-1) when n>0, with m==0 when n==0.
+	a uint32 // accumulator.
+	m uint32 // mask. m==1<<(n-1) when n>0, with m==0 when n==0.
+	n int    // the number of unread bits in a.
 }
 
 // Huffman table decoder, specified in section C.
@@ -39,7 +39,7 @@ func (d *decoder) ensureNBits(n int) error {
 		if err != nil {
 			return err
 		}
-		d.b.a = d.b.a<<8 | int(c)
+		d.b.a = d.b.a<<8 | uint32(c)
 		d.b.n += 8
 		if d.b.m == 0 {
 			d.b.m = 1 << 7
@@ -61,15 +61,16 @@ func (d *decoder) ensureNBits(n int) error {
 }
 
 // The composition of RECEIVE and EXTEND, specified in section F.2.2.1.
-func (d *decoder) receiveExtend(t uint8) (int, error) {
-	err := d.ensureNBits(int(t))
-	if err != nil {
-		return 0, err
+func (d *decoder) receiveExtend(t uint8) (int32, error) {
+	if d.b.n < int(t) {
+		if err := d.ensureNBits(int(t)); err != nil {
+			return 0, err
+		}
 	}
 	d.b.n -= int(t)
 	d.b.m >>= t
-	s := 1 << t
-	x := (d.b.a >> uint8(d.b.n)) & (s - 1)
+	s := int32(1) << t
+	x := int32(d.b.a>>uint8(d.b.n)) & (s - 1)
 	if x < s>>1 {
 		x += ((-1) << t) + 1
 	}
@@ -92,8 +93,7 @@ func (d *decoder) processDHT(n int) error {
 			return FormatError("bad Tc value")
 		}
 		th := d.tmp[0] & 0x0f
-		const isBaseline = true // Progressive mode is not yet supported.
-		if th > maxTh || isBaseline && th > 1 {
+		if th > maxTh || !d.progressive && th > 1 {
 			return FormatError("bad Th value")
 		}
 		h := &d.huff[tc][th]
@@ -163,15 +163,16 @@ func (d *decoder) processDHT(n int) error {
 
 // Returns the next Huffman-coded value from the bit stream, decoded according to h.
 // TODO(nigeltao): This decoding algorithm is simple, but slow. A lookahead table, instead of always
-// peeling off only 1 bit at at time, ought to be faster.
+// peeling off only 1 bit at a time, ought to be faster.
 func (d *decoder) decodeHuffman(h *huffman) (uint8, error) {
 	if h.length == 0 {
 		return 0, FormatError("uninitialized Huffman table")
 	}
 	for i, code := 0, 0; i < maxCodeLength; i++ {
-		err := d.ensureNBits(1)
-		if err != nil {
-			return 0, err
+		if d.b.n == 0 {
+			if err := d.ensureNBits(1); err != nil {
+				return 0, err
+			}
 		}
 		if d.b.a&d.b.m != 0 {
 			code |= 1
@@ -185,3 +186,28 @@ func (d *decoder) decodeHuffman(h *huffman) (uint8, error) {
 	}
 	return 0, FormatError("bad Huffman code")
 }
+
+func (d *decoder) decodeBit() (bool, error) {
+	if d.b.n == 0 {
+		if err := d.ensureNBits(1); err != nil {
+			return false, err
+		}
+	}
+	ret := d.b.a&d.b.m != 0
+	d.b.n--
+	d.b.m >>= 1
+	return ret, nil
+}
+
+func (d *decoder) decodeBits(n int) (uint32, error) {
+	if d.b.n < n {
+		if err := d.ensureNBits(n); err != nil {
+			return 0, err
+		}
+	}
+	ret := d.b.a >> uint(d.b.n-n)
+	ret &= (1 << uint(n)) - 1
+	d.b.n -= n
+	d.b.m >>= uint(n)
+	return ret, nil
+}
diff --git a/src/pkg/image/jpeg/idct.go b/src/pkg/image/jpeg/idct.go
index b387dfdff..46fcaecb7 100644
--- a/src/pkg/image/jpeg/idct.go
+++ b/src/pkg/image/jpeg/idct.go
@@ -37,6 +37,10 @@ package jpeg
  *
  */
 
+const blockSize = 64 // A DCT block is 8x8.
+
+type block [blockSize]int32
+
 const (
 	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
 	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
@@ -55,9 +59,7 @@ const (
 	r2 = 181 // 256/sqrt(2)
 )
 
-// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
-// +128 level shift and a clip to [0, 255], writing the results to dst.
-// stride is the number of elements between successive rows of dst.
+// idct performs a 2-D Inverse Discrete Cosine Transformation.
 //
 // The input coefficients should already have been multiplied by the
 // appropriate quantization table. We use fixed-point computation, with the
@@ -67,33 +69,34 @@ const (
 // For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
 // discrete W transform and for the discrete Fourier transform", IEEE Trans. on
 // ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(dst []byte, stride int, src *block) {
+func idct(src *block) {
 	// Horizontal 1-D IDCT.
 	for y := 0; y < 8; y++ {
+		y8 := y * 8
 		// If all the AC components are zero, then the IDCT is trivial.
-		if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
-			src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
-			dc := src[y*8+0] << 3
-			src[y*8+0] = dc
-			src[y*8+1] = dc
-			src[y*8+2] = dc
-			src[y*8+3] = dc
-			src[y*8+4] = dc
-			src[y*8+5] = dc
-			src[y*8+6] = dc
-			src[y*8+7] = dc
+		if src[y8+1] == 0 && src[y8+2] == 0 && src[y8+3] == 0 &&
+			src[y8+4] == 0 && src[y8+5] == 0 && src[y8+6] == 0 && src[y8+7] == 0 {
+			dc := src[y8+0] << 3
+			src[y8+0] = dc
+			src[y8+1] = dc
+			src[y8+2] = dc
+			src[y8+3] = dc
+			src[y8+4] = dc
+			src[y8+5] = dc
+			src[y8+6] = dc
+			src[y8+7] = dc
 			continue
 		}
 
 		// Prescale.
-		x0 := (src[y*8+0] << 11) + 128
-		x1 := src[y*8+4] << 11
-		x2 := src[y*8+6]
-		x3 := src[y*8+2]
-		x4 := src[y*8+1]
-		x5 := src[y*8+7]
-		x6 := src[y*8+5]
-		x7 := src[y*8+3]
+		x0 := (src[y8+0] << 11) + 128
+		x1 := src[y8+4] << 11
+		x2 := src[y8+6]
+		x3 := src[y8+2]
+		x4 := src[y8+1]
+		x5 := src[y8+7]
+		x6 := src[y8+5]
+		x7 := src[y8+3]
 
 		// Stage 1.
 		x8 := w7 * (x4 + x5)
@@ -123,14 +126,14 @@ func idct(dst []byte, stride int, src *block) {
 		x4 = (r2*(x4-x5) + 128) >> 8
 
 		// Stage 4.
-		src[8*y+0] = (x7 + x1) >> 8
-		src[8*y+1] = (x3 + x2) >> 8
-		src[8*y+2] = (x0 + x4) >> 8
-		src[8*y+3] = (x8 + x6) >> 8
-		src[8*y+4] = (x8 - x6) >> 8
-		src[8*y+5] = (x0 - x4) >> 8
-		src[8*y+6] = (x3 - x2) >> 8
-		src[8*y+7] = (x7 - x1) >> 8
+		src[y8+0] = (x7 + x1) >> 8
+		src[y8+1] = (x3 + x2) >> 8
+		src[y8+2] = (x0 + x4) >> 8
+		src[y8+3] = (x8 + x6) >> 8
+		src[y8+4] = (x8 - x6) >> 8
+		src[y8+5] = (x0 - x4) >> 8
+		src[y8+6] = (x3 - x2) >> 8
+		src[y8+7] = (x7 - x1) >> 8
 	}
 
 	// Vertical 1-D IDCT.
@@ -186,19 +189,4 @@ func idct(dst []byte, stride int, src *block) {
 		src[8*6+x] = (y3 - y2) >> 14
 		src[8*7+x] = (y7 - y1) >> 14
 	}
-
-	// Level shift by +128, clip to [0, 255], and write to dst.
-	for y := 0; y < 8; y++ {
-		for x := 0; x < 8; x++ {
-			c := src[y*8+x]
-			if c < -128 {
-				c = 0
-			} else if c > 127 {
-				c = 255
-			} else {
-				c += 128
-			}
-			dst[y*stride+x] = uint8(c)
-		}
-	}
 }
diff --git a/src/pkg/image/jpeg/reader.go b/src/pkg/image/jpeg/reader.go
index d9adf6e58..1ee6bbcd1 100644
--- a/src/pkg/image/jpeg/reader.go
+++ b/src/pkg/image/jpeg/reader.go
@@ -35,11 +35,7 @@ type component struct {
 	tq uint8 // Quantization table destination selector.
 }
 
-type block [blockSize]int
-
 const (
-	blockSize = 64 // A DCT block is 8x8.
-
 	dcTable = 0
 	acTable = 1
 	maxTc   = 1
@@ -51,7 +47,7 @@ const (
 	// A color JPEG image has Y, Cb and Cr components.
 	nColorComponent = 3
 
-	// We only support 4:4:4, 4:2:2 and 4:2:0 downsampling, and therefore the
+	// We only support 4:4:4, 4:4:0, 4:2:2 and 4:2:0 downsampling, and therefore the
 	// number of luma samples per chroma sample is at most 2 in the horizontal
 	// and 2 in the vertical direction.
 	maxH = 2
@@ -74,7 +70,9 @@ const (
 	comMarker   = 0xfe // COMment.
 )
 
-// Maps from the zig-zag ordering to the natural ordering.
+// unzig maps from the zig-zag ordering to the natural ordering. For example,
+// unzig[3] is the column and row of the fourth element in zig-zag order. The
+// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
 var unzig = [blockSize]int{
 	0, 1, 8, 16, 9, 2, 3, 10,
 	17, 24, 32, 25, 18, 11, 4, 5,
@@ -94,15 +92,18 @@ type Reader interface {
 
 type decoder struct {
 	r             Reader
+	b             bits
 	width, height int
 	img1          *image.Gray
 	img3          *image.YCbCr
 	ri            int // Restart Interval.
 	nComp         int
+	progressive   bool
+	eobRun        uint16 // End-of-Band run, specified in section G.1.2.2.
 	comp          [nColorComponent]component
+	progCoeffs    [nColorComponent][]block // Saved state between progressive-mode scans.
 	huff          [maxTc + 1][maxTh + 1]huffman
-	quant         [maxTq + 1]block
-	b             bits
+	quant         [maxTq + 1]block // Quantization tables, in zig-zag order.
 	tmp           [1024]byte
 }
 
@@ -146,20 +147,33 @@ func (d *decoder) processSOF(n int) error {
 		return UnsupportedError("SOF has wrong number of image components")
 	}
 	for i := 0; i < d.nComp; i++ {
-		hv := d.tmp[7+3*i]
-		d.comp[i].h = int(hv >> 4)
-		d.comp[i].v = int(hv & 0x0f)
 		d.comp[i].c = d.tmp[6+3*i]
 		d.comp[i].tq = d.tmp[8+3*i]
 		if d.nComp == nGrayComponent {
+			// If a JPEG image has only one component, section A.2 says "this data
+			// is non-interleaved by definition" and section A.2.2 says "[in this
+			// case...] the order of data units within a scan shall be left-to-right
+			// and top-to-bottom... regardless of the values of H_1 and V_1". Section
+			// 4.8.2 also says "[for non-interleaved data], the MCU is defined to be
+			// one data unit". Similarly, section A.1.1 explains that it is the ratio
+			// of H_i to max_j(H_j) that matters, and similarly for V. For grayscale
+			// images, H_1 is the maximum H_j for all components j, so that ratio is
+			// always 1. The component's (h, v) is effectively always (1, 1): even if
+			// the nominal (h, v) is (2, 1), a 20x5 image is encoded in three 8x8
+			// MCUs, not two 16x8 MCUs.
+			d.comp[i].h = 1
+			d.comp[i].v = 1
 			continue
 		}
-		// For color images, we only support 4:4:4, 4:2:2 or 4:2:0 chroma
+		hv := d.tmp[7+3*i]
+		d.comp[i].h = int(hv >> 4)
+		d.comp[i].v = int(hv & 0x0f)
+		// For color images, we only support 4:4:4, 4:4:0, 4:2:2 or 4:2:0 chroma
 		// downsampling ratios. This implies that the (h, v) values for the Y
-		// component are either (1, 1), (2, 1) or (2, 2), and the (h, v)
+		// component are either (1, 1), (1, 2), (2, 1) or (2, 2), and the (h, v)
 		// values for the Cr and Cb components must be (1, 1).
 		if i == 0 {
-			if hv != 0x11 && hv != 0x21 && hv != 0x22 {
+			if hv != 0x11 && hv != 0x21 && hv != 0x22 && hv != 0x12 {
 				return UnsupportedError("luma downsample ratio")
 			}
 		} else if hv != 0x11 {
@@ -186,7 +200,7 @@ func (d *decoder) processDQT(n int) error {
 			return FormatError("bad Tq value")
 		}
 		for i := range d.quant[tq] {
-			d.quant[tq][i] = int(d.tmp[i+1])
+			d.quant[tq][i] = int32(d.tmp[i+1])
 		}
 	}
 	if n != 0 {
@@ -195,161 +209,6 @@ func (d *decoder) processDQT(n int) error {
 	return nil
 }
 
-// makeImg allocates and initializes the destination image.
-func (d *decoder) makeImg(h0, v0, mxx, myy int) {
-	if d.nComp == nGrayComponent {
-		m := image.NewGray(image.Rect(0, 0, 8*mxx, 8*myy))
-		d.img1 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.Gray)
-		return
-	}
-	var subsampleRatio image.YCbCrSubsampleRatio
-	switch h0 * v0 {
-	case 1:
-		subsampleRatio = image.YCbCrSubsampleRatio444
-	case 2:
-		subsampleRatio = image.YCbCrSubsampleRatio422
-	case 4:
-		subsampleRatio = image.YCbCrSubsampleRatio420
-	default:
-		panic("unreachable")
-	}
-	m := image.NewYCbCr(image.Rect(0, 0, 8*h0*mxx, 8*v0*myy), subsampleRatio)
-	d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
-}
-
-// Specified in section B.2.3.
-func (d *decoder) processSOS(n int) error {
-	if d.nComp == 0 {
-		return FormatError("missing SOF marker")
-	}
-	if n != 4+2*d.nComp {
-		return UnsupportedError("SOS has wrong length")
-	}
-	_, err := io.ReadFull(d.r, d.tmp[0:4+2*d.nComp])
-	if err != nil {
-		return err
-	}
-	if int(d.tmp[0]) != d.nComp {
-		return UnsupportedError("SOS has wrong number of image components")
-	}
-	var scan [nColorComponent]struct {
-		td uint8 // DC table selector.
-		ta uint8 // AC table selector.
-	}
-	for i := 0; i < d.nComp; i++ {
-		cs := d.tmp[1+2*i] // Component selector.
-		if cs != d.comp[i].c {
-			return UnsupportedError("scan components out of order")
-		}
-		scan[i].td = d.tmp[2+2*i] >> 4
-		scan[i].ta = d.tmp[2+2*i] & 0x0f
-	}
-	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
-	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
-	mxx := (d.width + 8*h0 - 1) / (8 * h0)
-	myy := (d.height + 8*v0 - 1) / (8 * v0)
-	if d.img1 == nil && d.img3 == nil {
-		d.makeImg(h0, v0, mxx, myy)
-	}
-
-	mcu, expectedRST := 0, uint8(rst0Marker)
-	var (
-		b  block
-		dc [nColorComponent]int
-	)
-	for my := 0; my < myy; my++ {
-		for mx := 0; mx < mxx; mx++ {
-			for i := 0; i < d.nComp; i++ {
-				qt := &d.quant[d.comp[i].tq]
-				for j := 0; j < d.comp[i].h*d.comp[i].v; j++ {
-					// TODO(nigeltao): make this a "var b block" once the compiler's escape
-					// analysis is good enough to allocate it on the stack, not the heap.
-					b = block{}
-
-					// Decode the DC coefficient, as specified in section F.2.2.1.
-					value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
-					if err != nil {
-						return err
-					}
-					if value > 16 {
-						return UnsupportedError("excessive DC component")
-					}
-					dcDelta, err := d.receiveExtend(value)
-					if err != nil {
-						return err
-					}
-					dc[i] += dcDelta
-					b[0] = dc[i] * qt[0]
-
-					// Decode the AC coefficients, as specified in section F.2.2.2.
-					for k := 1; k < blockSize; k++ {
-						value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
-						if err != nil {
-							return err
-						}
-						val0 := value >> 4
-						val1 := value & 0x0f
-						if val1 != 0 {
-							k += int(val0)
-							if k > blockSize {
-								return FormatError("bad DCT index")
-							}
-							ac, err := d.receiveExtend(val1)
-							if err != nil {
-								return err
-							}
-							b[unzig[k]] = ac * qt[k]
-						} else {
-							if val0 != 0x0f {
-								break
-							}
-							k += 0x0f
-						}
-					}
-
-					// Perform the inverse DCT and store the MCU component to the image.
-					if d.nComp == nGrayComponent {
-						idct(d.img1.Pix[8*(my*d.img1.Stride+mx):], d.img1.Stride, &b)
-					} else {
-						switch i {
-						case 0:
-							mx0 := h0*mx + (j % 2)
-							my0 := v0*my + (j / 2)
-							idct(d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride, &b)
-						case 1:
-							idct(d.img3.Cb[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
-						case 2:
-							idct(d.img3.Cr[8*(my*d.img3.CStride+mx):], d.img3.CStride, &b)
-						}
-					}
-				} // for j
-			} // for i
-			mcu++
-			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
-				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
-				// but this one assumes well-formed input, and hence the restart marker follows immediately.
-				_, err := io.ReadFull(d.r, d.tmp[0:2])
-				if err != nil {
-					return err
-				}
-				if d.tmp[0] != 0xff || d.tmp[1] != expectedRST {
-					return FormatError("bad RST marker")
-				}
-				expectedRST++
-				if expectedRST == rst7Marker+1 {
-					expectedRST = rst0Marker
-				}
-				// Reset the Huffman decoder.
-				d.b = bits{}
-				// Reset the DC components, as per section F.2.1.3.1.
-				dc = [nColorComponent]int{}
-			}
-		} // for mx
-	} // for my
-
-	return nil
-}
-
 // Specified in section B.2.4.4.
 func (d *decoder) processDRI(n int) error {
 	if n != 2 {
@@ -390,9 +249,26 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 			return nil, FormatError("missing 0xff marker start")
 		}
 		marker := d.tmp[1]
+		for marker == 0xff {
+			// Section B.1.1.2 says, "Any marker may optionally be preceded by any
+			// number of fill bytes, which are bytes assigned code X'FF'".
+			marker, err = d.r.ReadByte()
+			if err != nil {
+				return nil, err
+			}
+		}
 		if marker == eoiMarker { // End Of Image.
 			break
 		}
+		if rst0Marker <= marker && marker <= rst7Marker {
+			// Figures B.2 and B.16 of the specification suggest that restart markers should
+			// only occur between Entropy Coded Segments and not after the final ECS.
+			// However, some encoders may generate incorrect JPEGs with a final restart
+			// marker. That restart marker will be seen here instead of inside the processSOS
+			// method, and is ignored as a harmless error. Restart markers have no extra data,
+			// so we check for this before we read the 16-bit length of the segment.
+			continue
+		}
 
 		// Read the 16-bit length of the segment. The value includes the 2 bytes for the
 		// length itself, so we subtract 2 to get the number of remaining bytes.
@@ -406,13 +282,12 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 		}
 
 		switch {
-		case marker == sof0Marker: // Start Of Frame (Baseline).
+		case marker == sof0Marker || marker == sof2Marker: // Start Of Frame.
+			d.progressive = marker == sof2Marker
 			err = d.processSOF(n)
 			if configOnly {
 				return nil, err
 			}
-		case marker == sof2Marker: // Start Of Frame (Progressive).
-			err = UnsupportedError("progressive mode")
 		case marker == dhtMarker: // Define Huffman Table.
 			err = d.processDHT(n)
 		case marker == dqtMarker: // Define Quantization Table.
@@ -421,7 +296,7 @@ func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
 			err = d.processSOS(n)
 		case marker == driMarker: // Define Restart Interval.
 			err = d.processDRI(n)
-		case marker >= app0Marker && marker <= app15Marker || marker == comMarker: // APPlication specific, or COMment.
+		case app0Marker <= marker && marker <= app15Marker || marker == comMarker: // APPlication specific, or COMment.
 			err = d.ignore(n)
 		default:
 			err = UnsupportedError("unknown marker")
diff --git a/src/pkg/image/jpeg/reader_test.go b/src/pkg/image/jpeg/reader_test.go
new file mode 100644
index 000000000..b520a8ab1
--- /dev/null
+++ b/src/pkg/image/jpeg/reader_test.go
@@ -0,0 +1,157 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+// TestDecodeProgressive tests that decoding the baseline and progressive
+// versions of the same image results in exactly the same pixel data, in YCbCr
+// space for color images, and Y space for grayscale images.
+func TestDecodeProgressive(t *testing.T) {
+	testCases := []string{
+		"../testdata/video-001",
+		"../testdata/video-001.q50.420",
+		"../testdata/video-001.q50.422",
+		"../testdata/video-001.q50.440",
+		"../testdata/video-001.q50.444",
+		"../testdata/video-005.gray.q50",
+		"../testdata/video-005.gray.q50.2x2",
+	}
+	for _, tc := range testCases {
+		m0, err := decodeFile(tc + ".jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".jpeg", err)
+			continue
+		}
+		m1, err := decodeFile(tc + ".progressive.jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".progressive.jpeg", err)
+			continue
+		}
+		if m0.Bounds() != m1.Bounds() {
+			t.Errorf("%s: bounds differ: %v and %v", tc, m0.Bounds(), m1.Bounds())
+			continue
+		}
+		switch m0 := m0.(type) {
+		case *image.YCbCr:
+			m1 := m1.(*image.YCbCr)
+			if err := check(m0.Bounds(), m0.Y, m1.Y, m0.YStride, m1.YStride); err != nil {
+				t.Errorf("%s (Y): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cb, m1.Cb, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cb): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cr, m1.Cr, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cr): %v", tc, err)
+				continue
+			}
+		case *image.Gray:
+			m1 := m1.(*image.Gray)
+			if err := check(m0.Bounds(), m0.Pix, m1.Pix, m0.Stride, m1.Stride); err != nil {
+				t.Errorf("%s: %v", tc, err)
+				continue
+			}
+		default:
+			t.Errorf("%s: unexpected image type %T", tc, m0)
+			continue
+		}
+	}
+}
+
+func decodeFile(filename string) (image.Image, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return Decode(f)
+
+}
+
+// check checks that the two pix data are equal, within the given bounds.
+func check(bounds image.Rectangle, pix0, pix1 []byte, stride0, stride1 int) error {
+	if len(pix0) != len(pix1) {
+		return fmt.Errorf("len(pix) %d and %d differ", len(pix0), len(pix1))
+	}
+	if stride0 != stride1 {
+		return fmt.Errorf("strides %d and %d differ", stride0, stride1)
+	}
+	if stride0%8 != 0 {
+		return fmt.Errorf("stride %d is not a multiple of 8", stride0)
+	}
+	// Compare the two pix data, one 8x8 block at a time.
+	for y := 0; y < len(pix0)/stride0; y += 8 {
+		for x := 0; x < stride0; x += 8 {
+			if x >= bounds.Max.X || y >= bounds.Max.Y {
+				// We don't care if the two pix data differ if the 8x8 block is
+				// entirely outside of the image's bounds. For example, this can
+				// occur with a 4:2:0 chroma subsampling and a 1x1 image. Baseline
+				// decoding works on the one 16x16 MCU as a whole; progressive
+				// decoding's first pass works on that 16x16 MCU as a whole but
+				// refinement passes only process one 8x8 block within the MCU.
+				continue
+			}
+
+			for j := 0; j < 8; j++ {
+				for i := 0; i < 8; i++ {
+					index := (y+j)*stride0 + (x + i)
+					if pix0[index] != pix1[index] {
+						return fmt.Errorf("blocks at (%d, %d) differ:\n%sand\n%s", x, y,
+							pixString(pix0, stride0, x, y),
+							pixString(pix1, stride1, x, y),
+						)
+					}
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func pixString(pix []byte, stride, x, y int) string {
+	s := bytes.NewBuffer(nil)
+	for j := 0; j < 8; j++ {
+		fmt.Fprintf(s, "\t")
+		for i := 0; i < 8; i++ {
+			fmt.Fprintf(s, "%02x ", pix[(y+j)*stride+(x+i)])
+		}
+		fmt.Fprintf(s, "\n")
+	}
+	return s.String()
+}
+
+func benchmarkDecode(b *testing.B, filename string) {
+	b.StopTimer()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+	cfg, err := DecodeConfig(bytes.NewReader(data))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(cfg.Width * cfg.Height * 4))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Decode(bytes.NewReader(data))
+	}
+}
+
+func BenchmarkDecodeBaseline(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.jpeg")
+}
+
+func BenchmarkDecodeProgressive(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.progressive.jpeg")
+}
diff --git a/src/pkg/image/jpeg/scan.go b/src/pkg/image/jpeg/scan.go
new file mode 100644
index 000000000..e3ae8ae44
--- /dev/null
+++ b/src/pkg/image/jpeg/scan.go
@@ -0,0 +1,432 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"image"
+	"io"
+)
+
+// makeImg allocates the destination image, d.img1 (gray, 8*mxx by 8*myy) or d.img3 (YCbCr, 8*h0*mxx by 8*v0*myy, subsampling chosen from h0 and v0), and crops it to d.width by d.height.
+func (d *decoder) makeImg(h0, v0, mxx, myy int) {
+	if d.nComp == nGrayComponent {
+		m := image.NewGray(image.Rect(0, 0, 8*mxx, 8*myy))
+		d.img1 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.Gray)
+		return
+	}
+	var subsampleRatio image.YCbCrSubsampleRatio
+	switch {
+	case h0 == 1 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio444
+	case h0 == 1 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio440
+	case h0 == 2 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio422
+	case h0 == 2 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio420
+	default:
+		panic("unreachable")
+	}
+	m := image.NewYCbCr(image.Rect(0, 0, 8*h0*mxx, 8*v0*myy), subsampleRatio)
+	d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
+}
+
+// processSOS reads the n-byte Start Of Scan header and then decodes that scan's entropy-coded blocks into the image (or into d.progCoeffs for unfinished progressive blocks). Specified in section B.2.3.
+func (d *decoder) processSOS(n int) error {
+	if d.nComp == 0 {
+		return FormatError("missing SOF marker")
+	}
+	if n < 6 || 4+2*d.nComp < n || n%2 != 0 {
+		return FormatError("SOS has wrong length")
+	}
+	_, err := io.ReadFull(d.r, d.tmp[:n])
+	if err != nil {
+		return err
+	}
+	nComp := int(d.tmp[0])
+	if n != 4+2*nComp {
+		return FormatError("SOS length inconsistent with number of components")
+	}
+	var scan [nColorComponent]struct {
+		compIndex uint8
+		td        uint8 // DC table selector.
+		ta        uint8 // AC table selector.
+	}
+	for i := 0; i < nComp; i++ {
+		cs := d.tmp[1+2*i] // Component selector.
+		compIndex := -1
+		for j, comp := range d.comp {
+			if cs == comp.c {
+				compIndex = j
+			}
+		}
+		if compIndex < 0 {
+			return FormatError("unknown component selector")
+		}
+		scan[i].compIndex = uint8(compIndex)
+		scan[i].td = d.tmp[2+2*i] >> 4
+		scan[i].ta = d.tmp[2+2*i] & 0x0f
+	}
+
+	// zigStart and zigEnd are the spectral selection bounds.
+	// ah and al are the successive approximation high and low values.
+	// The spec calls these values Ss, Se, Ah and Al.
+	//
+	// For progressive JPEGs, these are the two more-or-less independent
+	// aspects of progression. Spectral selection progression is when not
+	// all of a block's 64 DCT coefficients are transmitted in one pass.
+	// For example, three passes could transmit coefficient 0 (the DC
+	// component), coefficients 1-5, and coefficients 6-63, in zig-zag
+	// order. Successive approximation is when not all of the bits of a
+	// band of coefficients are transmitted in one pass. For example,
+	// three passes could transmit the 6 most significant bits, followed
+	// by the second-least significant bit, followed by the least
+	// significant bit.
+	//
+	// For baseline JPEGs, these parameters are hard-coded to 0/63/0/0.
+	zigStart, zigEnd, ah, al := int32(0), int32(blockSize-1), uint32(0), uint32(0)
+	if d.progressive {
+		zigStart = int32(d.tmp[1+2*nComp])
+		zigEnd = int32(d.tmp[2+2*nComp])
+		ah = uint32(d.tmp[3+2*nComp] >> 4)
+		al = uint32(d.tmp[3+2*nComp] & 0x0f)
+		if (zigStart == 0 && zigEnd != 0) || zigStart > zigEnd || blockSize <= zigEnd {
+			return FormatError("bad spectral selection bounds")
+		}
+		if zigStart != 0 && nComp != 1 {
+			return FormatError("progressive AC coefficients for more than one component")
+		}
+		if ah != 0 && ah != al+1 {
+			return FormatError("bad successive approximation values")
+		}
+	}
+
+	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
+	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
+	mxx := (d.width + 8*h0 - 1) / (8 * h0)
+	myy := (d.height + 8*v0 - 1) / (8 * v0)
+	if d.img1 == nil && d.img3 == nil {
+		d.makeImg(h0, v0, mxx, myy)
+	}
+	// A component's first scan need not be the image's first scan, so allocate its coefficients on first use.
+	for i := 0; d.progressive && i < nComp; i++ {
+		if compIndex := scan[i].compIndex; d.progCoeffs[compIndex] == nil {
+			d.progCoeffs[compIndex] = make([]block, mxx*myy*d.comp[compIndex].h*d.comp[compIndex].v)
+		}
+	}
+
+	d.b = bits{}
+	mcu, expectedRST := 0, uint8(rst0Marker)
+	var (
+		// b is the decoded coefficients, in natural (not zig-zag) order.
+		b  block
+		dc [nColorComponent]int32
+		// mx0 and my0 are the location of the current block (in terms of 8x8 blocks).
+		// For example, with 4:2:0 chroma subsampling, the block whose top left
+		// pixel co-ordinates are (16, 8) is the third block in the first row:
+		// mx0 is 2 and my0 is 0, even though the pixel is in the second MCU.
+		// TODO(nigeltao): rename mx0 and my0 to bx and by?
+		mx0, my0   int
+		blockCount int
+	)
+	for my := 0; my < myy; my++ {
+		for mx := 0; mx < mxx; mx++ {
+			for i := 0; i < nComp; i++ {
+				compIndex := scan[i].compIndex
+				qt := &d.quant[d.comp[compIndex].tq]
+				for j := 0; j < d.comp[compIndex].h*d.comp[compIndex].v; j++ {
+					// The blocks are traversed one MCU at a time. For 4:2:0 chroma
+					// subsampling, there are four Y 8x8 blocks in every 16x16 MCU.
+					// For a baseline 32x16 pixel image, the Y blocks visiting order is:
+					//	0 1 4 5
+					//	2 3 6 7
+					//
+					// For progressive images, the DC data blocks (zigStart == 0) are traversed
+					// as above, but AC data blocks are traversed left to right, top to bottom:
+					//	0 1 2 3
+					//	4 5 6 7
+					//
+					// To further complicate matters, there is no AC data for any blocks that
+					// are inside the image at the MCU level but outside the image at the pixel
+					// level. For example, a 24x16 pixel 4:2:0 progressive image consists of
+					// two 16x16 MCUs. The earlier scans will process 8 Y blocks:
+					//	0 1 4 5
+					//	2 3 6 7
+					// The later scans will process only 6 Y blocks:
+					//	0 1 2
+					//	3 4 5
+					if zigStart == 0 {
+						mx0, my0 = d.comp[compIndex].h*mx, d.comp[compIndex].v*my
+						if h0 == 1 {
+							my0 += j
+						} else {
+							mx0 += j % 2
+							my0 += j / 2
+						}
+					} else {
+						q := mxx * d.comp[compIndex].h
+						mx0 = blockCount % q
+						my0 = blockCount / q
+						blockCount++
+						if mx0*8 >= d.width || my0*8 >= d.height {
+							continue
+						}
+					}
+
+					// Load the previous partially decoded coefficients, if applicable.
+					if d.progressive {
+						b = d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0]
+					} else {
+						b = block{}
+					}
+
+					if ah != 0 {
+						if err := d.refine(&b, &d.huff[acTable][scan[i].ta], zigStart, zigEnd, 1<<al); err != nil {
+							return err
+						}
+					} else {
+						zig := zigStart
+						if zig == 0 {
+							zig++
+							// Decode the DC coefficient, as specified in section F.2.2.1.
+							value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
+							if err != nil {
+								return err
+							}
+							if value > 16 {
+								return UnsupportedError("excessive DC component")
+							}
+							dcDelta, err := d.receiveExtend(value)
+							if err != nil {
+								return err
+							}
+							dc[compIndex] += dcDelta
+							b[0] = dc[compIndex] << al
+						}
+
+						if zig <= zigEnd && d.eobRun > 0 {
+							d.eobRun--
+						} else {
+							// Decode the AC coefficients, as specified in section F.2.2.2.
+							for ; zig <= zigEnd; zig++ {
+								value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
+								if err != nil {
+									return err
+								}
+								val0 := value >> 4
+								val1 := value & 0x0f
+								if val1 != 0 {
+									zig += int32(val0)
+									if zig > zigEnd {
+										break
+									}
+									ac, err := d.receiveExtend(val1)
+									if err != nil {
+										return err
+									}
+									b[unzig[zig]] = ac << al
+								} else {
+									if val0 != 0x0f {
+										d.eobRun = uint16(1 << val0)
+										if val0 != 0 {
+											bits, err := d.decodeBits(int(val0))
+											if err != nil {
+												return err
+											}
+											d.eobRun |= uint16(bits)
+										}
+										d.eobRun--
+										break
+									}
+									zig += 0x0f
+								}
+							}
+						}
+					}
+
+					if d.progressive {
+						if zigEnd != blockSize-1 || al != 0 {
+							// We haven't completely decoded this 8x8 block. Save the coefficients.
+							d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0] = b
+							// At this point, we could execute the rest of the loop body to dequantize and
+							// perform the inverse DCT, to save early stages of a progressive image to the
+							// *image.YCbCr buffers (the whole point of progressive encoding), but in Go,
+							// the jpeg.Decode function does not return until the entire image is decoded,
+							// so we "continue" here to avoid wasted computation.
+							continue
+						}
+					}
+
+					// Dequantize, perform the inverse DCT and store the block to the image.
+					for zig := 0; zig < blockSize; zig++ {
+						b[unzig[zig]] *= qt[zig]
+					}
+					idct(&b)
+					dst, stride := []byte(nil), 0
+					if d.nComp == nGrayComponent {
+						dst, stride = d.img1.Pix[8*(my0*d.img1.Stride+mx0):], d.img1.Stride
+					} else {
+						switch compIndex {
+						case 0:
+							dst, stride = d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride
+						case 1:
+							dst, stride = d.img3.Cb[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
+						case 2:
+							dst, stride = d.img3.Cr[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
+						default:
+							return UnsupportedError("too many components")
+						}
+					}
+					// Level shift by +128, clip to [0, 255], and write to dst.
+					for y := 0; y < 8; y++ {
+						y8 := y * 8
+						yStride := y * stride
+						for x := 0; x < 8; x++ {
+							c := b[y8+x]
+							if c < -128 {
+								c = 0
+							} else if c > 127 {
+								c = 255
+							} else {
+								c += 128
+							}
+							dst[yStride+x] = uint8(c)
+						}
+					}
+				} // for j
+			} // for i
+			mcu++
+			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
+				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
+				// but this one assumes well-formed input, and hence the restart marker follows immediately.
+				_, err := io.ReadFull(d.r, d.tmp[0:2])
+				if err != nil {
+					return err
+				}
+				if d.tmp[0] != 0xff || d.tmp[1] != expectedRST {
+					return FormatError("bad RST marker")
+				}
+				expectedRST++
+				if expectedRST == rst7Marker+1 {
+					expectedRST = rst0Marker
+				}
+				// Reset the Huffman decoder.
+				d.b = bits{}
+				// Reset the DC components, as per section F.2.1.3.1.
+				dc = [nColorComponent]int32{}
+				// Reset the progressive decoder state, as per section G.1.2.2.
+				d.eobRun = 0
+			}
+		} // for mx
+	} // for my
+
+	return nil
+}
+
+// refine decodes a successive approximation refinement block, as specified in section G.1.2, updating b in place.
+// h is the AC Huffman table, zigStart..zigEnd is the zig-zag band, and delta is the magnitude applied to refined coefficients.
+func (d *decoder) refine(b *block, h *huffman, zigStart, zigEnd, delta int32) error {
+	// Refining a DC component is trivial.
+	if zigStart == 0 {
+		if zigEnd != 0 {
+			panic("unreachable")
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return err
+		}
+		if bit {
+			b[0] |= delta
+		}
+		return nil
+	}
+
+	// Refining AC components is more complicated; see sections G.1.2.2 and G.1.2.3.
+	zig := zigStart
+	if d.eobRun == 0 {
+	loop:
+		for ; zig <= zigEnd; zig++ {
+			z := int32(0)
+			value, err := d.decodeHuffman(h)
+			if err != nil {
+				return err
+			}
+			val0 := value >> 4
+			val1 := value & 0x0f
+
+			switch val1 {
+			case 0:
+				if val0 != 0x0f {
+					d.eobRun = uint16(1 << val0)
+					if val0 != 0 {
+						bits, err := d.decodeBits(int(val0))
+						if err != nil {
+							return err
+						}
+						d.eobRun |= uint16(bits)
+					}
+					break loop
+				}
+			case 1:
+				z = delta
+				bit, err := d.decodeBit()
+				if err != nil {
+					return err
+				}
+				if !bit {
+					z = -z
+				}
+			default:
+				return FormatError("unexpected Huffman code")
+			}
+
+			zig, err = d.refineNonZeroes(b, zig, zigEnd, int32(val0), delta)
+			if err != nil {
+				return err
+			}
+			if zig > zigEnd {
+				return FormatError("too many coefficients")
+			}
+			if z != 0 {
+				b[unzig[zig]] = z
+			}
+		}
+	}
+	if d.eobRun > 0 {
+		d.eobRun--
+		if _, err := d.refineNonZeroes(b, zig, zigEnd, -1, delta); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// refineNonZeroes refines non-zero entries of b in zig-zag order: one bit is read per non-zero entry and, if set, the entry moves away from zero by delta.
+// If nz >= 0, the first nz zero entries are skipped over and the walk stops at the next zero entry. It returns the zig index at which it stopped.
+func (d *decoder) refineNonZeroes(b *block, zig, zigEnd, nz, delta int32) (int32, error) {
+	for ; zig <= zigEnd; zig++ {
+		u := unzig[zig]
+		if b[u] == 0 {
+			if nz == 0 {
+				break
+			}
+			nz--
+			continue
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return 0, err
+		}
+		if !bit {
+			continue
+		}
+		if b[u] >= 0 {
+			b[u] += delta
+		} else {
+			b[u] -= delta
+		}
+	}
+	return zig, nil
+}
diff --git a/src/pkg/image/jpeg/writer.go b/src/pkg/image/jpeg/writer.go
index 3322c09fe..c58fbf305 100644
--- a/src/pkg/image/jpeg/writer.go
+++ b/src/pkg/image/jpeg/writer.go
@@ -21,7 +21,7 @@ func min(x, y int) int {
 }
 
 // div returns a/b rounded to the nearest integer, instead of rounded to zero.
-func div(a int, b int) int {
+func div(a, b int32) int32 {
 	if a >= 0 {
 		return (a + (b >> 1)) / b
 	}
@@ -56,26 +56,28 @@ const (
 	nQuantIndex
 )
 
-// unscaledQuant are the unscaled quantization tables. Each encoder copies and
-// scales the tables according to its quality parameter.
+// unscaledQuant are the unscaled quantization tables in zig-zag order. Each
+// encoder copies and scales the tables according to its quality parameter.
+// The values are derived from section K.1 after converting from natural to
+// zig-zag order.
 var unscaledQuant = [nQuantIndex][blockSize]byte{
 	// Luminance.
 	{
-		16, 11, 10, 16, 24, 40, 51, 61,
-		12, 12, 14, 19, 26, 58, 60, 55,
-		14, 13, 16, 24, 40, 57, 69, 56,
-		14, 17, 22, 29, 51, 87, 80, 62,
-		18, 22, 37, 56, 68, 109, 103, 77,
-		24, 35, 55, 64, 81, 104, 113, 92,
-		49, 64, 78, 87, 103, 121, 120, 101,
-		72, 92, 95, 98, 112, 100, 103, 99,
+		16, 11, 12, 14, 12, 10, 16, 14,
+		13, 14, 18, 17, 16, 19, 24, 40,
+		26, 24, 22, 22, 24, 49, 35, 37,
+		29, 40, 58, 51, 61, 60, 57, 51,
+		56, 55, 64, 72, 92, 78, 64, 68,
+		87, 69, 55, 56, 80, 109, 81, 87,
+		95, 98, 103, 104, 103, 62, 77, 113,
+		121, 112, 100, 120, 92, 101, 103, 99,
 	},
 	// Chrominance.
 	{
-		17, 18, 24, 47, 99, 99, 99, 99,
-		18, 21, 26, 66, 99, 99, 99, 99,
-		24, 26, 56, 99, 99, 99, 99, 99,
-		47, 66, 99, 99, 99, 99, 99, 99,
+		17, 18, 18, 24, 21, 24, 47, 26,
+		26, 47, 99, 66, 56, 66, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
 		99, 99, 99, 99, 99, 99, 99, 99,
@@ -208,8 +210,8 @@ func init() {
 // writer is a buffered writer.
 type writer interface {
 	Flush() error
-	Write([]byte) (int, error)
-	WriteByte(byte) error
+	io.Writer
+	io.ByteWriter
 }
 
 // encoder encodes an image to the JPEG format.
@@ -222,7 +224,7 @@ type encoder struct {
 	buf [16]byte
 	// bits and nBits are accumulated bits to write to w.
 	bits, nBits uint32
-	// quant is the scaled quantization tables.
+	// quant is the scaled quantization tables, in zig-zag order.
 	quant [nQuantIndex][blockSize]byte
 }
 
@@ -266,14 +268,14 @@ func (e *encoder) emit(bits, nBits uint32) {
 }
 
 // emitHuff emits the given value with the given Huffman encoder.
-func (e *encoder) emitHuff(h huffIndex, value int) {
+func (e *encoder) emitHuff(h huffIndex, value int32) {
 	x := theHuffmanLUT[h][value]
 	e.emit(x&(1<<24-1), x>>24)
 }
 
 // emitHuffRLE emits a run of runLength copies of value encoded with the given
 // Huffman encoder.
-func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int) {
+func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int32) {
 	a, b := value, value
 	if a < 0 {
 		a, b = -value, value-1
@@ -284,7 +286,7 @@ func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int) {
 	} else {
 		nBits = 8 + uint32(bitCount[a>>8])
 	}
-	e.emitHuff(h, runLength<<4|int(nBits))
+	e.emitHuff(h, runLength<<4|int32(nBits))
 	if nBits > 0 {
 		e.emit(uint32(b)&(1<<nBits-1), nBits)
 	}
@@ -301,7 +303,7 @@ func (e *encoder) writeMarkerHeader(marker uint8, markerlen int) {
 
 // writeDQT writes the Define Quantization Table marker.
 func (e *encoder) writeDQT() {
-	markerlen := 2 + int(nQuantIndex)*(1+blockSize)
+	const markerlen = 2 + int(nQuantIndex)*(1+blockSize)
 	e.writeMarkerHeader(dqtMarker, markerlen)
 	for i := range e.quant {
 		e.writeByte(uint8(i))
@@ -311,7 +313,7 @@ func (e *encoder) writeDQT() {
 
 // writeSOF0 writes the Start Of Frame (Baseline) marker.
 func (e *encoder) writeSOF0(size image.Point) {
-	markerlen := 8 + 3*nColorComponent
+	const markerlen = 8 + 3*nColorComponent
 	e.writeMarkerHeader(sof0Marker, markerlen)
 	e.buf[0] = 8 // 8-bit color.
 	e.buf[1] = uint8(size.Y >> 8)
@@ -344,15 +346,16 @@ func (e *encoder) writeDHT() {
 
 // writeBlock writes a block of pixel data using the given quantization table,
 // returning the post-quantized DC value of the DCT-transformed block.
-func (e *encoder) writeBlock(b *block, q quantIndex, prevDC int) int {
+// b is in natural (not zig-zag) order.
+func (e *encoder) writeBlock(b *block, q quantIndex, prevDC int32) int32 {
 	fdct(b)
 	// Emit the DC delta.
-	dc := div(b[0], (8 * int(e.quant[q][0])))
+	dc := div(b[0], 8*int32(e.quant[q][0]))
 	e.emitHuffRLE(huffIndex(2*q+0), 0, dc-prevDC)
 	// Emit the AC components.
-	h, runLength := huffIndex(2*q+1), 0
-	for k := 1; k < blockSize; k++ {
-		ac := div(b[unzig[k]], (8 * int(e.quant[q][k])))
+	h, runLength := huffIndex(2*q+1), int32(0)
+	for zig := 1; zig < blockSize; zig++ {
+		ac := div(b[unzig[zig]], 8*int32(e.quant[q][zig]))
 		if ac == 0 {
 			runLength++
 		} else {
@@ -380,9 +383,9 @@ func toYCbCr(m image.Image, p image.Point, yBlock, cbBlock, crBlock *block) {
 		for i := 0; i < 8; i++ {
 			r, g, b, _ := m.At(min(p.X+i, xmax), min(p.Y+j, ymax)).RGBA()
 			yy, cb, cr := color.RGBToYCbCr(uint8(r>>8), uint8(g>>8), uint8(b>>8))
-			yBlock[8*j+i] = int(yy)
-			cbBlock[8*j+i] = int(cb)
-			crBlock[8*j+i] = int(cr)
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
 		}
 	}
 }
@@ -405,9 +408,9 @@ func rgbaToYCbCr(m *image.RGBA, p image.Point, yBlock, cbBlock, crBlock *block)
 			}
 			pix := m.Pix[offset+sx*4:]
 			yy, cb, cr := color.RGBToYCbCr(pix[0], pix[1], pix[2])
-			yBlock[8*j+i] = int(yy)
-			cbBlock[8*j+i] = int(cb)
-			crBlock[8*j+i] = int(cr)
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
 		}
 	}
 }
@@ -433,10 +436,12 @@ func scale(dst *block, src *[4]block) {
 //	- component 1 uses DC table 0 and AC table 0 "\x01\x00",
 //	- component 2 uses DC table 1 and AC table 1 "\x02\x11",
 //	- component 3 uses DC table 1 and AC table 1 "\x03\x11",
-//	- padding "\x00\x00\x00".
+//	- the bytes "\x00\x3f\x00". Section B.2.3 of the spec says that for
+//	  sequential DCTs, those bytes (8-bit Ss, 8-bit Se, 4-bit Ah, 4-bit Al)
+//	  should be 0x00, 0x3f, 0x00<<4 | 0x00.
 var sosHeader = []byte{
 	0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
-	0x11, 0x03, 0x11, 0x00, 0x00, 0x00,
+	0x11, 0x03, 0x11, 0x00, 0x3f, 0x00,
 }
 
 // writeSOS writes the StartOfScan marker.
@@ -444,12 +449,11 @@ func (e *encoder) writeSOS(m image.Image) {
 	e.write(sosHeader)
 	var (
 		// Scratch buffers to hold the YCbCr values.
-		yBlock  block
-		cbBlock [4]block
-		crBlock [4]block
-		cBlock  block
+		// The blocks are in natural (not zig-zag) order.
+		b      block
+		cb, cr [4]block
 		// DC components are delta-encoded.
-		prevDCY, prevDCCb, prevDCCr int
+		prevDCY, prevDCCb, prevDCCr int32
 	)
 	bounds := m.Bounds()
 	rgba, _ := m.(*image.RGBA)
@@ -460,16 +464,16 @@ func (e *encoder) writeSOS(m image.Image) {
 				yOff := (i & 2) * 4
 				p := image.Pt(x+xOff, y+yOff)
 				if rgba != nil {
-					rgbaToYCbCr(rgba, p, &yBlock, &cbBlock[i], &crBlock[i])
+					rgbaToYCbCr(rgba, p, &b, &cb[i], &cr[i])
 				} else {
-					toYCbCr(m, p, &yBlock, &cbBlock[i], &crBlock[i])
+					toYCbCr(m, p, &b, &cb[i], &cr[i])
 				}
-				prevDCY = e.writeBlock(&yBlock, 0, prevDCY)
+				prevDCY = e.writeBlock(&b, 0, prevDCY)
 			}
-			scale(&cBlock, &cbBlock)
-			prevDCCb = e.writeBlock(&cBlock, 1, prevDCCb)
-			scale(&cBlock, &crBlock)
-			prevDCCr = e.writeBlock(&cBlock, 1, prevDCCr)
+			scale(&b, &cb)
+			prevDCCb = e.writeBlock(&b, 1, prevDCCb)
+			scale(&b, &cr)
+			prevDCCr = e.writeBlock(&b, 1, prevDCCr)
 		}
 	}
 	// Pad the last byte with 1's.
diff --git a/src/pkg/image/jpeg/writer_test.go b/src/pkg/image/jpeg/writer_test.go
index b8e8fa34e..0b2143f5b 100644
--- a/src/pkg/image/jpeg/writer_test.go
+++ b/src/pkg/image/jpeg/writer_test.go
@@ -6,6 +6,7 @@ package jpeg
 
 import (
 	"bytes"
+	"fmt"
 	"image"
 	"image/color"
 	"image/png"
@@ -15,6 +16,87 @@ import (
 	"testing"
 )
 
+// zigzag maps from the natural ordering to the zig-zag ordering. For example,
+// zigzag[0*8 + 3] is the zig-zag sequence number of the element in the fourth
+// column and first row.
+var zigzag = [blockSize]int{
+	0, 1, 5, 6, 14, 15, 27, 28,
+	2, 4, 7, 13, 16, 26, 29, 42,
+	3, 8, 12, 17, 25, 30, 41, 43,
+	9, 11, 18, 24, 31, 40, 44, 53,
+	10, 19, 23, 32, 39, 45, 52, 54,
+	20, 22, 33, 38, 46, 51, 55, 60,
+	21, 34, 37, 47, 50, 56, 59, 61,
+	35, 36, 48, 49, 57, 58, 62, 63,
+}
+
+func TestZigUnzig(t *testing.T) {
+	for i := 0; i < blockSize; i++ {
+		if unzig[zigzag[i]] != i {
+			t.Errorf("unzig[zigzag[%d]] == %d", i, unzig[zigzag[i]])
+		}
+		if zigzag[unzig[i]] != i {
+			t.Errorf("zigzag[unzig[%d]] == %d", i, zigzag[unzig[i]])
+		}
+	}
+}
+
+// unscaledQuantInNaturalOrder are the unscaled quantization tables in
+// natural (not zig-zag) order, as specified in section K.1.
+var unscaledQuantInNaturalOrder = [nQuantIndex][blockSize]byte{
+	// Luminance.
+	{
+		16, 11, 10, 16, 24, 40, 51, 61,
+		12, 12, 14, 19, 26, 58, 60, 55,
+		14, 13, 16, 24, 40, 57, 69, 56,
+		14, 17, 22, 29, 51, 87, 80, 62,
+		18, 22, 37, 56, 68, 109, 103, 77,
+		24, 35, 55, 64, 81, 104, 113, 92,
+		49, 64, 78, 87, 103, 121, 120, 101,
+		72, 92, 95, 98, 112, 100, 103, 99,
+	},
+	// Chrominance.
+	{
+		17, 18, 24, 47, 99, 99, 99, 99,
+		18, 21, 26, 66, 99, 99, 99, 99,
+		24, 26, 56, 99, 99, 99, 99, 99,
+		47, 66, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+	},
+}
+
+func TestUnscaledQuant(t *testing.T) {
+	bad := false
+	for i := quantIndex(0); i < nQuantIndex; i++ {
+		for zig := 0; zig < blockSize; zig++ {
+			got := unscaledQuant[i][zig]
+			want := unscaledQuantInNaturalOrder[i][unzig[zig]]
+			if got != want {
+				t.Errorf("i=%d, zig=%d: got %d, want %d", i, zig, got, want)
+				bad = true
+			}
+		}
+	}
+	if bad {
+		names := [nQuantIndex]string{"Luminance", "Chrominance"}
+		buf := &bytes.Buffer{}
+		for i, name := range names {
+			fmt.Fprintf(buf, "// %s.\n{\n", name)
+			for zig := 0; zig < blockSize; zig++ {
+				fmt.Fprintf(buf, "%d, ", unscaledQuantInNaturalOrder[i][unzig[zig]])
+				if zig%8 == 7 {
+					buf.WriteString("\n")
+				}
+			}
+			buf.WriteString("},\n")
+		}
+		t.Logf("expected unscaledQuant values:\n%s", buf.String())
+	}
+}
+
 var testCase = []struct {
 	filename  string
 	quality   int
@@ -89,24 +171,21 @@ func TestWriter(t *testing.T) {
 	}
 }
 
-func BenchmarkEncodeRGBOpaque(b *testing.B) {
+func BenchmarkEncode(b *testing.B) {
 	b.StopTimer()
 	img := image.NewRGBA(image.Rect(0, 0, 640, 480))
-	// Set all pixels to 0xFF alpha to force opaque mode.
 	bo := img.Bounds()
 	rnd := rand.New(rand.NewSource(123))
 	for y := bo.Min.Y; y < bo.Max.Y; y++ {
 		for x := bo.Min.X; x < bo.Max.X; x++ {
-			img.Set(x, y, color.RGBA{
+			img.SetRGBA(x, y, color.RGBA{
 				uint8(rnd.Intn(256)),
 				uint8(rnd.Intn(256)),
 				uint8(rnd.Intn(256)),
-				255})
+				255,
+			})
 		}
 	}
-	if !img.Opaque() {
-		b.Fatal("expected image to be opaque")
-	}
 	b.SetBytes(640 * 480 * 4)
 	b.StartTimer()
 	options := &Options{Quality: 90}
diff --git a/src/pkg/image/names.go b/src/pkg/image/names.go
index 55f634c17..04ee2cfb4 100644
--- a/src/pkg/image/names.go
+++ b/src/pkg/image/names.go
@@ -20,7 +20,7 @@ var (
 )
 
 // Uniform is an infinite-sized Image of uniform color.
-// It implements the color.Color, color.ColorModel, and Image interfaces.
+// It implements the color.Color, color.Model, and Image interfaces.
 type Uniform struct {
 	C color.Color
 }
diff --git a/src/pkg/image/png/paeth.go b/src/pkg/image/png/paeth.go
new file mode 100644
index 000000000..37978aa66
--- /dev/null
+++ b/src/pkg/image/png/paeth.go
@@ -0,0 +1,70 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package png
+
+// paeth implements the Paeth filter function, as per the PNG specification.
+func paeth(a, b, c uint8) uint8 {
+	// This is an optimized version of the sample code in the PNG spec.
+	// For example, the sample code starts with:
+	//	p := int(a) + int(b) - int(c)
+	//	pa := abs(p - int(a))
+	// but the optimized form uses fewer arithmetic operations:
+	//	pa := int(b) - int(c)
+	//	pa = abs(pa)
+	pc := int(c)
+	pa := int(b) - pc
+	pb := int(a) - pc
+	pc = pa + pb
+	if pa < 0 {
+		pa = -pa
+	}
+	if pb < 0 {
+		pb = -pb
+	}
+	if pc < 0 {
+		pc = -pc
+	}
+	if pa <= pb && pa <= pc {
+		return a
+	} else if pb <= pc {
+		return b
+	}
+	return c
+}
+
+// filterPaeth applies the Paeth filter to the cdat slice.
+// cdat is the current row's data, pdat is the previous row's data.
+func filterPaeth(cdat, pdat []byte, bytesPerPixel int) {
+	var a, b, c, pa, pb, pc int
+	for i := 0; i < bytesPerPixel; i++ {
+		a, c = 0, 0
+		for j := i; j < len(cdat); j += bytesPerPixel {
+			b = int(pdat[j])
+			pa = b - c
+			pb = a - c
+			pc = pa + pb
+			if pa < 0 {
+				pa = -pa
+			}
+			if pb < 0 {
+				pb = -pb
+			}
+			if pc < 0 {
+				pc = -pc
+			}
+			if pa <= pb && pa <= pc {
+				// No-op.
+			} else if pb <= pc {
+				a = b
+			} else {
+				a = c
+			}
+			a += int(cdat[j])
+			a &= 0xff
+			cdat[j] = uint8(a)
+			c = b
+		}
+	}
+}
diff --git a/src/pkg/image/png/paeth_test.go b/src/pkg/image/png/paeth_test.go
new file mode 100644
index 000000000..bb084861a
--- /dev/null
+++ b/src/pkg/image/png/paeth_test.go
@@ -0,0 +1,91 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package png
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+)
+
+func abs(x int) int {
+	if x >= 0 {
+		return x
+	}
+	return -x // x is negative here.
+}
+
+// slowPaeth is the reference Paeth predictor, transcribed directly from the
+// sample code in section 9.4 of the PNG spec; it is the oracle for paeth.
+func slowPaeth(a, b, c uint8) uint8 {
+	ia, ib, ic := int(a), int(b), int(c)
+	p := ia + ib - ic
+	pa, pb, pc := abs(p-ia), abs(p-ib), abs(p-ic)
+	switch {
+	case pa <= pb && pa <= pc:
+		return a
+	case pb <= pc:
+		return b
+	}
+	return c
+}
+
+// slowFilterPaeth is the straightforward byte-at-a-time form of filterPaeth.
+func slowFilterPaeth(cdat, pdat []byte, bytesPerPixel int) {
+	for k := 0; k < bytesPerPixel; k++ { // First pixel: nothing to its left.
+		cdat[k] += paeth(0, pdat[k], 0)
+	}
+	for k, prev := bytesPerPixel, 0; k < len(cdat); k, prev = k+1, prev+1 {
+		cdat[k] += paeth(cdat[prev], pdat[k], pdat[prev])
+	}
+}
+
+func TestPaeth(t *testing.T) {
+	const step = 15 // Coarse grid that still reaches both 0 and 255.
+	for a := 0; a < 256; a += step {
+		for b := 0; b < 256; b += step {
+			for c := 0; c < 256; c += step {
+				x, y, z := uint8(a), uint8(b), uint8(c)
+				if got, want := paeth(x, y, z), slowPaeth(x, y, z); got != want {
+					t.Errorf("a, b, c = %d, %d, %d: got %d, want %d", a, b, c, got, want)
+				}
+			}
+		}
+	}
+}
+
+func BenchmarkPaeth(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		paeth(uint8(i>>16), uint8(i>>8), uint8(i)) // Derive all three inputs from i so they vary.
+	}
+}
+
+func TestPaethDecode(t *testing.T) {
+	pdat0 := make([]byte, 32) // Pristine previous row, kept for the failure report.
+	pdat1 := make([]byte, 32) // Previous row handed to filterPaeth.
+	pdat2 := make([]byte, 32) // Previous row handed to slowFilterPaeth.
+	cdat0 := make([]byte, 32) // Pristine current row, kept for the failure report.
+	cdat1 := make([]byte, 32) // Current row rewritten by filterPaeth.
+	cdat2 := make([]byte, 32) // Current row rewritten by slowFilterPaeth.
+	r := rand.New(rand.NewSource(1))
+	for bytesPerPixel := 1; bytesPerPixel <= 8; bytesPerPixel++ {
+		for i := 0; i < 100; i++ {
+			for j := range pdat0 {
+				pdat0[j] = uint8(r.Uint32())
+				cdat0[j] = uint8(r.Uint32())
+			}
+			copy(pdat1, pdat0)
+			copy(pdat2, pdat0)
+			copy(cdat1, cdat0)
+			copy(cdat2, cdat0)
+			filterPaeth(cdat1, pdat1, bytesPerPixel)
+			slowFilterPaeth(cdat2, pdat2, bytesPerPixel)
+			if !bytes.Equal(cdat1, cdat2) {
+				t.Errorf("bytesPerPixel: %d\npdat0: % x\ncdat0: % x\ngot:   % x\nwant:  % x", bytesPerPixel, pdat0, cdat0, cdat1, cdat2)
+				break // One report per bytesPerPixel is enough.
+			}
+		}
+	}
+}
diff --git a/src/pkg/image/png/reader.go b/src/pkg/image/png/reader.go
index fe07d60a9..a6bf86ede 100644
--- a/src/pkg/image/png/reader.go
+++ b/src/pkg/image/png/reader.go
@@ -98,13 +98,6 @@ type UnsupportedError string
 
 func (e UnsupportedError) Error() string { return "png: unsupported feature: " + string(e) }
 
-func abs(x int) int {
-	if x < 0 {
-		return -x
-	}
-	return x
-}
-
 func min(a, b int) int {
 	if a < b {
 		return a
@@ -200,10 +193,19 @@ func (d *decoder) parsePLTE(length uint32) error {
 	d.crc.Write(d.tmp[:n])
 	switch d.cb {
 	case cbP1, cbP2, cbP4, cbP8:
-		d.palette = color.Palette(make([]color.Color, np))
+		d.palette = make(color.Palette, 256)
 		for i := 0; i < np; i++ {
 			d.palette[i] = color.RGBA{d.tmp[3*i+0], d.tmp[3*i+1], d.tmp[3*i+2], 0xff}
 		}
+		for i := np; i < 256; i++ {
+			// Initialize the rest of the palette to opaque black. The spec (section
+			// 11.2.3) says that "any out-of-range pixel value found in the image data
+			// is an error", but some real-world PNG files have out-of-range pixel
+			// values. We fall back to opaque black, the same as libpng 1.5.13;
+			// ImageMagick 6.5.7 returns an error.
+			d.palette[i] = color.RGBA{0x00, 0x00, 0x00, 0xff}
+		}
+		d.palette = d.palette[:np]
 	case cbTC8, cbTCA8, cbTC16, cbTCA16:
 		// As per the PNG spec, a PLTE chunk is optional (and for practical purposes,
 		// ignorable) for the ctTrueColor and ctTrueColorAlpha color types (section 4.1.2).
@@ -228,12 +230,12 @@ func (d *decoder) parsetRNS(length uint32) error {
 	case cbTC8, cbTC16:
 		return UnsupportedError("truecolor transparency")
 	case cbP1, cbP2, cbP4, cbP8:
-		if n > len(d.palette) {
-			return FormatError("bad tRNS length")
+		if len(d.palette) < n {
+			d.palette = d.palette[:n]
 		}
 		for i := 0; i < n; i++ {
 			rgba := d.palette[i].(color.RGBA)
-			d.palette[i] = color.RGBA{rgba.R, rgba.G, rgba.B, d.tmp[i]}
+			d.palette[i] = color.NRGBA{rgba.R, rgba.G, rgba.B, d.tmp[i]}
 		}
 	case cbGA8, cbGA16, cbTCA8, cbTCA16:
 		return FormatError("tRNS, color type mismatch")
@@ -241,20 +243,6 @@ func (d *decoder) parsetRNS(length uint32) error {
 	return d.verifyChecksum()
 }
 
-// The Paeth filter function, as per the PNG specification.
-func paeth(a, b, c uint8) uint8 {
-	p := int(a) + int(b) - int(c)
-	pa := abs(p - int(a))
-	pb := abs(p - int(b))
-	pc := abs(p - int(c))
-	if pa <= pb && pa <= pc {
-		return a
-	} else if pb <= pc {
-		return b
-	}
-	return c
-}
-
 // Read presents one or more IDAT chunks as one continuous stream (minus the
 // intermediate chunk headers and footers). If the PNG data looked like:
 //   ... len0 IDAT xxx crc0 len1 IDAT yy crc1 len2 IEND crc2
@@ -300,7 +288,7 @@ func (d *decoder) decode() (image.Image, error) {
 	}
 	defer r.Close()
 	bitsPerPixel := 0
-	maxPalette := uint8(0)
+	pixOffset := 0
 	var (
 		gray     *image.Gray
 		rgba     *image.RGBA
@@ -328,7 +316,6 @@ func (d *decoder) decode() (image.Image, error) {
 		bitsPerPixel = d.depth
 		paletted = image.NewPaletted(image.Rect(0, 0, d.width, d.height), d.palette)
 		img = paletted
-		maxPalette = uint8(len(d.palette) - 1)
 	case cbTCA8:
 		bitsPerPixel = 32
 		nrgba = image.NewNRGBA(image.Rect(0, 0, d.width, d.height))
@@ -375,8 +362,8 @@ func (d *decoder) decode() (image.Image, error) {
 				cdat[i] += cdat[i-bytesPerPixel]
 			}
 		case ftUp:
-			for i := 0; i < len(cdat); i++ {
-				cdat[i] += pdat[i]
+			for i, p := range pdat {
+				cdat[i] += p
 			}
 		case ftAverage:
 			for i := 0; i < bytesPerPixel; i++ {
@@ -386,12 +373,7 @@ func (d *decoder) decode() (image.Image, error) {
 				cdat[i] += uint8((int(cdat[i-bytesPerPixel]) + int(pdat[i])) / 2)
 			}
 		case ftPaeth:
-			for i := 0; i < bytesPerPixel; i++ {
-				cdat[i] += paeth(0, pdat[i], 0)
-			}
-			for i := bytesPerPixel; i < len(cdat); i++ {
-				cdat[i] += paeth(cdat[i-bytesPerPixel], pdat[i], pdat[i-bytesPerPixel])
-			}
+			filterPaeth(cdat, pdat, bytesPerPixel)
 		default:
 			return nil, FormatError("bad filter type")
 		}
@@ -423,25 +405,31 @@ func (d *decoder) decode() (image.Image, error) {
 				}
 			}
 		case cbG8:
-			for x := 0; x < d.width; x++ {
-				gray.SetGray(x, y, color.Gray{cdat[x]})
-			}
+			copy(gray.Pix[pixOffset:], cdat)
+			pixOffset += gray.Stride
 		case cbGA8:
 			for x := 0; x < d.width; x++ {
 				ycol := cdat[2*x+0]
 				nrgba.SetNRGBA(x, y, color.NRGBA{ycol, ycol, ycol, cdat[2*x+1]})
 			}
 		case cbTC8:
+			pix, i, j := rgba.Pix, pixOffset, 0
 			for x := 0; x < d.width; x++ {
-				rgba.SetRGBA(x, y, color.RGBA{cdat[3*x+0], cdat[3*x+1], cdat[3*x+2], 0xff})
+				pix[i+0] = cdat[j+0]
+				pix[i+1] = cdat[j+1]
+				pix[i+2] = cdat[j+2]
+				pix[i+3] = 0xff
+				i += 4
+				j += 3
 			}
+			pixOffset += rgba.Stride
 		case cbP1:
 			for x := 0; x < d.width; x += 8 {
 				b := cdat[x/8]
 				for x2 := 0; x2 < 8 && x+x2 < d.width; x2++ {
 					idx := b >> 7
-					if idx > maxPalette {
-						return nil, FormatError("palette index out of range")
+					if len(paletted.Palette) <= int(idx) {
+						paletted.Palette = paletted.Palette[:int(idx)+1]
 					}
 					paletted.SetColorIndex(x+x2, y, idx)
 					b <<= 1
@@ -452,8 +440,8 @@ func (d *decoder) decode() (image.Image, error) {
 				b := cdat[x/4]
 				for x2 := 0; x2 < 4 && x+x2 < d.width; x2++ {
 					idx := b >> 6
-					if idx > maxPalette {
-						return nil, FormatError("palette index out of range")
+					if len(paletted.Palette) <= int(idx) {
+						paletted.Palette = paletted.Palette[:int(idx)+1]
 					}
 					paletted.SetColorIndex(x+x2, y, idx)
 					b <<= 2
@@ -464,24 +452,26 @@ func (d *decoder) decode() (image.Image, error) {
 				b := cdat[x/2]
 				for x2 := 0; x2 < 2 && x+x2 < d.width; x2++ {
 					idx := b >> 4
-					if idx > maxPalette {
-						return nil, FormatError("palette index out of range")
+					if len(paletted.Palette) <= int(idx) {
+						paletted.Palette = paletted.Palette[:int(idx)+1]
 					}
 					paletted.SetColorIndex(x+x2, y, idx)
 					b <<= 4
 				}
 			}
 		case cbP8:
-			for x := 0; x < d.width; x++ {
-				if cdat[x] > maxPalette {
-					return nil, FormatError("palette index out of range")
+			if len(paletted.Palette) != 256 { // Only a full 256-entry palette makes every uint8 index valid.
+				for x := 0; x < d.width; x++ {
+					if len(paletted.Palette) <= int(cdat[x]) {
+						paletted.Palette = paletted.Palette[:int(cdat[x])+1]
+					}
 				}
-				paletted.SetColorIndex(x, y, cdat[x])
 			}
+			copy(paletted.Pix[pixOffset:], cdat)
+			pixOffset += paletted.Stride
 		case cbTCA8:
-			for x := 0; x < d.width; x++ {
-				nrgba.SetNRGBA(x, y, color.NRGBA{cdat[4*x+0], cdat[4*x+1], cdat[4*x+2], cdat[4*x+3]})
-			}
+			copy(nrgba.Pix[pixOffset:], cdat)
+			pixOffset += nrgba.Stride
 		case cbG16:
 			for x := 0; x < d.width; x++ {
 				ycol := uint16(cdat[2*x+0])<<8 | uint16(cdat[2*x+1])
@@ -662,10 +652,11 @@ func DecodeConfig(r io.Reader) (image.Config, error) {
 			}
 			return image.Config{}, err
 		}
-		if d.stage == dsSeenIHDR && d.cb != cbP8 {
+		paletted := d.cb == cbP8 || d.cb == cbP4 || d.cb == cbP2 || d.cb == cbP1
+		if d.stage == dsSeenIHDR && !paletted {
 			break
 		}
-		if d.stage == dsSeenPLTE && d.cb == cbP8 {
+		if d.stage == dsSeenPLTE && paletted {
 			break
 		}
 	}
diff --git a/src/pkg/image/png/reader_test.go b/src/pkg/image/png/reader_test.go
index 24c4ea448..ac0d949a9 100644
--- a/src/pkg/image/png/reader_test.go
+++ b/src/pkg/image/png/reader_test.go
@@ -10,6 +10,7 @@ import (
 	"image"
 	"image/color"
 	"io"
+	"io/ioutil"
 	"os"
 	"strings"
 	"testing"
@@ -37,6 +38,14 @@ var filenames = []string{
 	"basn6a16",
 }
 
+var filenamesPaletted = []string{ // Base names iterated by TestPalettedDecodeConfig.
+	"basn3p01",
+	"basn3p02",
+	"basn3p04",
+	"basn3p08",
+	"basn3p08-trns",
+}
+
 var filenamesShort = []string{
 	"basn0g01",
 	"basn0g04-31",
@@ -106,13 +115,18 @@ func sng(w io.WriteCloser, filename string, png image.Image) {
 		lastAlpha := -1
 		io.WriteString(w, "PLTE {\n")
 		for i, c := range cpm {
-			r, g, b, a := c.RGBA()
-			if a != 0xffff {
+			var r, g, b, a uint8
+			switch c := c.(type) {
+			case color.RGBA:
+				r, g, b, a = c.R, c.G, c.B, 0xff
+			case color.NRGBA:
+				r, g, b, a = c.R, c.G, c.B, c.A
+			default:
+				panic("unknown palette color type")
+			}
+			if a != 0xff {
 				lastAlpha = i
 			}
-			r >>= 8
-			g >>= 8
-			b >>= 8
 			fmt.Fprintf(w, "    (%3d,%3d,%3d)     # rgb = (0x%02x,0x%02x,0x%02x)\n", r, g, b, r, g, b)
 		}
 		io.WriteString(w, "}\n")
@@ -202,7 +216,7 @@ func TestReader(t *testing.T) {
 		}
 
 		piper, pipew := io.Pipe()
-		pb := bufio.NewReader(piper)
+		pb := bufio.NewScanner(piper)
 		go sng(pipew, fn, img)
 		defer piper.Close()
 
@@ -213,7 +227,7 @@ func TestReader(t *testing.T) {
 			continue
 		}
 		defer sf.Close()
-		sb := bufio.NewReader(sf)
+		sb := bufio.NewScanner(sf)
 		if err != nil {
 			t.Error(fn, err)
 			continue
@@ -221,24 +235,28 @@ func TestReader(t *testing.T) {
 
 		// Compare the two, in SNG format, line by line.
 		for {
-			ps, perr := pb.ReadString('\n')
-			ss, serr := sb.ReadString('\n')
-			if perr == io.EOF && serr == io.EOF {
+			pdone := !pb.Scan() // Scan reports true while a line is available, so negate it.
+			sdone := !sb.Scan()
+			if pdone && sdone {
 				break
 			}
-			if perr != nil {
-				t.Error(fn, perr)
-				break
-			}
-			if serr != nil {
-				t.Error(fn, serr)
+			if pdone || sdone {
+				t.Errorf("%s: Different sizes", fn)
 				break
 			}
+			ps := pb.Text()
+			ss := sb.Text()
 			if ps != ss {
 				t.Errorf("%s: Mismatch\n%sversus\n%s\n", fn, ps, ss)
 				break
 			}
 		}
+		if pb.Err() != nil {
+			t.Error(fn, pb.Err())
+		}
+		if sb.Err() != nil {
+			t.Error(fn, sb.Err())
+		}
 	}
 }
 
@@ -267,3 +285,66 @@ func TestReaderError(t *testing.T) {
 		}
 	}
 }
+
+func TestPalettedDecodeConfig(t *testing.T) {
+	for _, fn := range filenamesPaletted {
+		f, err := os.Open("testdata/pngsuite/" + fn + ".png")
+		if err != nil {
+			t.Errorf("%s: open failed: %v", fn, err)
+			continue
+		}
+		cfg, err := DecodeConfig(f)
+		f.Close() // Close per file; a defer in this loop would hold every file open until return.
+		if err != nil {
+			t.Errorf("%s: %v", fn, err)
+			continue
+		}
+		pal, ok := cfg.ColorModel.(color.Palette)
+		if !ok {
+			t.Errorf("%s: expected paletted color model", fn)
+			continue
+		}
+		if pal == nil {
+			t.Errorf("%s: palette not initialized", fn)
+			continue
+		}
+	}
+}
+
+func benchmarkDecode(b *testing.B, filename string, bytesPerPixel int) {
+	b.StopTimer() // Keep file loading and header parsing out of the timed region.
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+	s := string(data)
+	cfg, err := DecodeConfig(strings.NewReader(s))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(cfg.Width * cfg.Height * bytesPerPixel)) // Bytes per iteration: width * height * bytesPerPixel.
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Decode(strings.NewReader(s)) // The decoded image and any error are discarded.
+	}
+}
+
+func BenchmarkDecodeGray(b *testing.B) {
+	benchmarkDecode(b, "testdata/benchGray.png", 1) // Counted as 1 byte per pixel.
+}
+
+func BenchmarkDecodeNRGBAGradient(b *testing.B) {
+	benchmarkDecode(b, "testdata/benchNRGBA-gradient.png", 4) // Counted as 4 bytes per pixel.
+}
+
+func BenchmarkDecodeNRGBAOpaque(b *testing.B) {
+	benchmarkDecode(b, "testdata/benchNRGBA-opaque.png", 4) // Counted as 4 bytes per pixel.
+}
+
+func BenchmarkDecodePaletted(b *testing.B) {
+	benchmarkDecode(b, "testdata/benchPaletted.png", 1) // Counted as 1 byte per pixel.
+}
+
+func BenchmarkDecodeRGB(b *testing.B) {
+	benchmarkDecode(b, "testdata/benchRGB.png", 4) // Counted as 4 bytes per pixel.
+}
diff --git a/src/pkg/image/png/testdata/benchGray.png b/src/pkg/image/png/testdata/benchGray.png
new file mode 100644
index 000000000..42bc6c3a0
--- /dev/null
+++ b/src/pkg/image/png/testdata/benchGray.png
diff --git a/src/pkg/image/png/testdata/benchNRGBA-gradient.png b/src/pkg/image/png/testdata/benchNRGBA-gradient.png
new file mode 100644
index 000000000..961934cca
--- /dev/null
+++ b/src/pkg/image/png/testdata/benchNRGBA-gradient.png
diff --git a/src/pkg/image/png/testdata/benchNRGBA-opaque.png b/src/pkg/image/png/testdata/benchNRGBA-opaque.png
new file mode 100644
index 000000000..ca4f4a037
--- /dev/null
+++ b/src/pkg/image/png/testdata/benchNRGBA-opaque.png
diff --git a/src/pkg/image/png/testdata/benchPaletted.png b/src/pkg/image/png/testdata/benchPaletted.png
new file mode 100644
index 000000000..4b4d5b992
--- /dev/null
+++ b/src/pkg/image/png/testdata/benchPaletted.png
diff --git a/src/pkg/image/png/testdata/benchRGB.png b/src/pkg/image/png/testdata/benchRGB.png
new file mode 100644
index 000000000..31ac65a3f
--- /dev/null
+++ b/src/pkg/image/png/testdata/benchRGB.png
diff --git a/src/pkg/image/png/writer.go b/src/pkg/image/png/writer.go
index 57c03792b..093d47193 100644
--- a/src/pkg/image/png/writer.go
+++ b/src/pkg/image/png/writer.go
@@ -21,7 +21,7 @@ type encoder struct {
 	err    error
 	header [8]byte
 	footer [4]byte
-	tmp    [3 * 256]byte
+	tmp    [4 * 256]byte
 }
 
 // Big-endian.
@@ -70,7 +70,7 @@ func (e *encoder) writeChunk(b []byte, name string) {
 		e.err = UnsupportedError(name + " chunk is too large: " + strconv.Itoa(len(b)))
 		return
 	}
-	writeUint32(e.header[0:4], n)
+	writeUint32(e.header[:4], n)
 	e.header[4] = name[0]
 	e.header[5] = name[1]
 	e.header[6] = name[2]
@@ -78,9 +78,9 @@ func (e *encoder) writeChunk(b []byte, name string) {
 	crc := crc32.NewIEEE()
 	crc.Write(e.header[4:8])
 	crc.Write(b)
-	writeUint32(e.footer[0:4], crc.Sum32())
+	writeUint32(e.footer[:4], crc.Sum32())
 
-	_, e.err = e.w.Write(e.header[0:8])
+	_, e.err = e.w.Write(e.header[:8])
 	if e.err != nil {
 		return
 	}
@@ -88,7 +88,7 @@ func (e *encoder) writeChunk(b []byte, name string) {
 	if e.err != nil {
 		return
 	}
-	_, e.err = e.w.Write(e.footer[0:4])
+	_, e.err = e.w.Write(e.footer[:4])
 }
 
 func (e *encoder) writeIHDR() {
@@ -122,36 +122,29 @@ func (e *encoder) writeIHDR() {
 	e.tmp[10] = 0 // default compression method
 	e.tmp[11] = 0 // default filter method
 	e.tmp[12] = 0 // non-interlaced
-	e.writeChunk(e.tmp[0:13], "IHDR")
+	e.writeChunk(e.tmp[:13], "IHDR")
 }
 
-func (e *encoder) writePLTE(p color.Palette) {
+func (e *encoder) writePLTEAndTRNS(p color.Palette) {
 	if len(p) < 1 || len(p) > 256 {
 		e.err = FormatError("bad palette length: " + strconv.Itoa(len(p)))
 		return
 	}
-	for i, c := range p {
-		r, g, b, _ := c.RGBA()
-		e.tmp[3*i+0] = uint8(r >> 8)
-		e.tmp[3*i+1] = uint8(g >> 8)
-		e.tmp[3*i+2] = uint8(b >> 8)
-	}
-	e.writeChunk(e.tmp[0:3*len(p)], "PLTE")
-}
-
-func (e *encoder) maybeWritetRNS(p color.Palette) {
 	last := -1
 	for i, c := range p {
-		_, _, _, a := c.RGBA()
-		if a != 0xffff {
+		c1 := color.NRGBAModel.Convert(c).(color.NRGBA)
+		e.tmp[3*i+0] = c1.R
+		e.tmp[3*i+1] = c1.G
+		e.tmp[3*i+2] = c1.B
+		if c1.A != 0xff {
 			last = i
 		}
-		e.tmp[i] = uint8(a >> 8)
+		e.tmp[3*256+i] = c1.A // Alpha bytes are staged past the 3*256 bytes reserved for RGB.
 	}
-	if last == -1 {
-		return
+	e.writeChunk(e.tmp[:3*len(p)], "PLTE")
+	if last != -1 {
+		e.writeChunk(e.tmp[3*256:3*256+1+last], "tRNS") // Alphas up to and including the last non-opaque entry.
 	}
-	e.writeChunk(e.tmp[:last+1], "tRNS")
 }
 
 // An encoder is an io.Writer that satisfies writes by writing PNG IDAT chunks,
@@ -297,26 +290,42 @@ func writeImage(w io.Writer, m image.Image, cb int) error {
 	}
 	pr := make([]uint8, 1+bpp*b.Dx())
 
+	gray, _ := m.(*image.Gray)
+	rgba, _ := m.(*image.RGBA)
+	paletted, _ := m.(*image.Paletted)
+	nrgba, _ := m.(*image.NRGBA)
+
 	for y := b.Min.Y; y < b.Max.Y; y++ {
 		// Convert from colors to bytes.
 		i := 1
 		switch cb {
 		case cbG8:
-			for x := b.Min.X; x < b.Max.X; x++ {
-				c := color.GrayModel.Convert(m.At(x, y)).(color.Gray)
-				cr[0][i] = c.Y
-				i++
+			if gray != nil {
+				offset := (y - b.Min.Y) * gray.Stride
+				copy(cr[0][1:], gray.Pix[offset:offset+b.Dx()])
+			} else {
+				for x := b.Min.X; x < b.Max.X; x++ {
+					c := color.GrayModel.Convert(m.At(x, y)).(color.Gray)
+					cr[0][i] = c.Y
+					i++
+				}
 			}
 		case cbTC8:
 			// We have previously verified that the alpha value is fully opaque.
 			cr0 := cr[0]
-			if rgba, _ := m.(*image.RGBA); rgba != nil {
-				j0 := (y - b.Min.Y) * rgba.Stride
+			stride, pix := 0, []byte(nil)
+			if rgba != nil {
+				stride, pix = rgba.Stride, rgba.Pix
+			} else if nrgba != nil {
+				stride, pix = nrgba.Stride, nrgba.Pix
+			}
+			if stride != 0 {
+				j0 := (y - b.Min.Y) * stride
 				j1 := j0 + b.Dx()*4
 				for j := j0; j < j1; j += 4 {
-					cr0[i+0] = rgba.Pix[j+0]
-					cr0[i+1] = rgba.Pix[j+1]
-					cr0[i+2] = rgba.Pix[j+2]
+					cr0[i+0] = pix[j+0]
+					cr0[i+1] = pix[j+1]
+					cr0[i+2] = pix[j+2]
 					i += 3
 				}
 			} else {
@@ -329,9 +338,9 @@ func writeImage(w io.Writer, m image.Image, cb int) error {
 				}
 			}
 		case cbP8:
-			if p, _ := m.(*image.Paletted); p != nil {
-				offset := (y - b.Min.Y) * p.Stride
-				copy(cr[0][1:], p.Pix[offset:offset+b.Dx()])
+			if paletted != nil {
+				offset := (y - b.Min.Y) * paletted.Stride
+				copy(cr[0][1:], paletted.Pix[offset:offset+b.Dx()])
 			} else {
 				pi := m.(image.PalettedImage)
 				for x := b.Min.X; x < b.Max.X; x++ {
@@ -340,14 +349,19 @@ func writeImage(w io.Writer, m image.Image, cb int) error {
 				}
 			}
 		case cbTCA8:
-			// Convert from image.Image (which is alpha-premultiplied) to PNG's non-alpha-premultiplied.
-			for x := b.Min.X; x < b.Max.X; x++ {
-				c := color.NRGBAModel.Convert(m.At(x, y)).(color.NRGBA)
-				cr[0][i+0] = c.R
-				cr[0][i+1] = c.G
-				cr[0][i+2] = c.B
-				cr[0][i+3] = c.A
-				i += 4
+			if nrgba != nil {
+				offset := (y - b.Min.Y) * nrgba.Stride
+				copy(cr[0][1:], nrgba.Pix[offset:offset+b.Dx()*4])
+			} else {
+				// Convert from image.Image (which is alpha-premultiplied) to PNG's non-alpha-premultiplied.
+				for x := b.Min.X; x < b.Max.X; x++ {
+					c := color.NRGBAModel.Convert(m.At(x, y)).(color.NRGBA)
+					cr[0][i+0] = c.R
+					cr[0][i+1] = c.G
+					cr[0][i+2] = c.B
+					cr[0][i+3] = c.A
+					i += 4
+				}
 			}
 		case cbG16:
 			for x := b.Min.X; x < b.Max.X; x++ {
@@ -412,7 +426,7 @@ func (e *encoder) writeIDATs() {
 	e.err = bw.Flush()
 }
 
-func (e *encoder) writeIEND() { e.writeChunk(e.tmp[0:0], "IEND") }
+func (e *encoder) writeIEND() { e.writeChunk(nil, "IEND") }
 
 // Encode writes the Image m to w in PNG format. Any Image may be encoded, but
 // images that are not image.NRGBA might be encoded lossily.
@@ -460,8 +474,7 @@ func Encode(w io.Writer, m image.Image) error {
 	_, e.err = io.WriteString(w, pngHeader)
 	e.writeIHDR()
 	if pal != nil {
-		e.writePLTE(pal)
-		e.maybeWritetRNS(pal)
+		e.writePLTEAndTRNS(pal)
 	}
 	e.writeIDATs()
 	e.writeIEND()
diff --git a/src/pkg/image/png/writer_test.go b/src/pkg/image/png/writer_test.go
index 644c4fb44..3116fc9ff 100644
--- a/src/pkg/image/png/writer_test.go
+++ b/src/pkg/image/png/writer_test.go
@@ -101,6 +101,49 @@ func TestSubImage(t *testing.T) {
 	}
 }
 
+func BenchmarkEncodeGray(b *testing.B) {
+	b.StopTimer() // Exclude image allocation from the timed region.
+	img := image.NewGray(image.Rect(0, 0, 640, 480))
+	b.SetBytes(640 * 480 * 1) // One byte per pixel.
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img) // Output and error are discarded.
+	}
+}
+
+func BenchmarkEncodeNRGBOpaque(b *testing.B) {
+	b.StopTimer() // Exclude image setup from the timed region.
+	img := image.NewNRGBA(image.Rect(0, 0, 640, 480))
+	// Give every pixel an alpha of 0xFF so that the image reports as opaque.
+	bo := img.Bounds()
+	for y := bo.Min.Y; y < bo.Max.Y; y++ {
+		for x := bo.Min.X; x < bo.Max.X; x++ {
+			img.Set(x, y, color.NRGBA{0, 0, 0, 255})
+		}
+	}
+	if !img.Opaque() {
+		b.Fatal("expected image to be opaque")
+	}
+	b.SetBytes(640 * 480 * 4) // Four bytes per pixel.
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img) // Output and error are discarded.
+	}
+}
+
+func BenchmarkEncodeNRGBA(b *testing.B) {
+	b.StopTimer() // Exclude image setup from the timed region.
+	img := image.NewNRGBA(image.Rect(0, 0, 640, 480))
+	if img.Opaque() { // Guard: the untouched image must report as non-opaque.
+		b.Fatal("expected image not to be opaque")
+	}
+	b.SetBytes(640 * 480 * 4) // Four bytes per pixel.
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img) // Output and error are discarded.
+	}
+}
+
 func BenchmarkEncodePaletted(b *testing.B) {
 	b.StopTimer()
 	img := image.NewPaletted(image.Rect(0, 0, 640, 480), color.Palette{
@@ -138,7 +181,7 @@ func BenchmarkEncodeRGBA(b *testing.B) {
 	b.StopTimer()
 	img := image.NewRGBA(image.Rect(0, 0, 640, 480))
 	if img.Opaque() {
-		b.Fatal("expected image to not be opaque")
+		b.Fatal("expected image not to be opaque")
 	}
 	b.SetBytes(640 * 480 * 4)
 	b.StartTimer()
diff --git a/src/pkg/image/testdata/video-001.progressive.jpeg b/src/pkg/image/testdata/video-001.progressive.jpeg
new file mode 100644
index 000000000..b8cae2359
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.420.jpeg b/src/pkg/image/testdata/video-001.q50.420.jpeg
new file mode 100644
index 000000000..83fb0f8ab
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.420.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.420.progressive.jpeg b/src/pkg/image/testdata/video-001.q50.420.progressive.jpeg
new file mode 100644
index 000000000..b048eb205
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.420.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.422.jpeg b/src/pkg/image/testdata/video-001.q50.422.jpeg
new file mode 100644
index 000000000..60fff4ff9
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.422.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.422.progressive.jpeg b/src/pkg/image/testdata/video-001.q50.422.progressive.jpeg
new file mode 100644
index 000000000..926d005de
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.422.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.440.jpeg b/src/pkg/image/testdata/video-001.q50.440.jpeg
new file mode 100644
index 000000000..32eeeaef6
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.440.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.440.progressive.jpeg b/src/pkg/image/testdata/video-001.q50.440.progressive.jpeg
new file mode 100644
index 000000000..e641a3bbb
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.440.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.444.jpeg b/src/pkg/image/testdata/video-001.q50.444.jpeg
new file mode 100644
index 000000000..7d5743382
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.444.jpeg
diff --git a/src/pkg/image/testdata/video-001.q50.444.progressive.jpeg b/src/pkg/image/testdata/video-001.q50.444.progressive.jpeg
new file mode 100644
index 000000000..ff7d5f9ff
--- /dev/null
+++ b/src/pkg/image/testdata/video-001.q50.444.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-005.gray.q50.2x2.jpeg b/src/pkg/image/testdata/video-005.gray.q50.2x2.jpeg
new file mode 100644
index 000000000..630b615f7
--- /dev/null
+++ b/src/pkg/image/testdata/video-005.gray.q50.2x2.jpeg
diff --git a/src/pkg/image/testdata/video-005.gray.q50.2x2.progressive.jpeg b/src/pkg/image/testdata/video-005.gray.q50.2x2.progressive.jpeg
new file mode 100644
index 000000000..c6b93608c
--- /dev/null
+++ b/src/pkg/image/testdata/video-005.gray.q50.2x2.progressive.jpeg
diff --git a/src/pkg/image/testdata/video-005.gray.q50.jpeg b/src/pkg/image/testdata/video-005.gray.q50.jpeg
new file mode 100644
index 000000000..c65b5a794
--- /dev/null
+++ b/src/pkg/image/testdata/video-005.gray.q50.jpeg
diff --git a/src/pkg/image/testdata/video-005.gray.q50.progressive.jpeg b/src/pkg/image/testdata/video-005.gray.q50.progressive.jpeg
new file mode 100644
index 000000000..24b70e8bf
--- /dev/null
+++ b/src/pkg/image/testdata/video-005.gray.q50.progressive.jpeg
diff --git a/src/pkg/image/ycbcr.go b/src/pkg/image/ycbcr.go
index c1a0b666f..5b73bef78 100644
--- a/src/pkg/image/ycbcr.go
+++ b/src/pkg/image/ycbcr.go
@@ -15,6 +15,7 @@ const (
 	YCbCrSubsampleRatio444 YCbCrSubsampleRatio = iota
 	YCbCrSubsampleRatio422
 	YCbCrSubsampleRatio420
+	YCbCrSubsampleRatio440
 )
 
 func (s YCbCrSubsampleRatio) String() string {
@@ -25,6 +26,8 @@ func (s YCbCrSubsampleRatio) String() string {
 		return "YCbCrSubsampleRatio422"
 	case YCbCrSubsampleRatio420:
 		return "YCbCrSubsampleRatio420"
+	case YCbCrSubsampleRatio440:
+		return "YCbCrSubsampleRatio440"
 	}
 	return "YCbCrSubsampleRatioUnknown"
 }
@@ -39,6 +42,7 @@ func (s YCbCrSubsampleRatio) String() string {
 //	For 4:4:4, CStride == YStride/1 && len(Cb) == len(Cr) == len(Y)/1.
 //	For 4:2:2, CStride == YStride/2 && len(Cb) == len(Cr) == len(Y)/2.
 //	For 4:2:0, CStride == YStride/2 && len(Cb) == len(Cr) == len(Y)/4.
+//	For 4:4:0, CStride == YStride/1 && len(Cb) == len(Cr) == len(Y)/2.
 type YCbCr struct {
 	Y, Cb, Cr      []uint8
 	YStride        int
@@ -82,6 +86,8 @@ func (p *YCbCr) COffset(x, y int) int {
 		return (y-p.Rect.Min.Y)*p.CStride + (x/2 - p.Rect.Min.X/2)
 	case YCbCrSubsampleRatio420:
 		return (y/2-p.Rect.Min.Y/2)*p.CStride + (x/2 - p.Rect.Min.X/2)
+	case YCbCrSubsampleRatio440:
+		return (y/2-p.Rect.Min.Y/2)*p.CStride + (x - p.Rect.Min.X)
 	}
 	// Default to 4:4:4 subsampling.
 	return (y-p.Rect.Min.Y)*p.CStride + (x - p.Rect.Min.X)
@@ -126,6 +132,9 @@ func NewYCbCr(r Rectangle, subsampleRatio YCbCrSubsampleRatio) *YCbCr {
 	case YCbCrSubsampleRatio420:
 		cw = (r.Max.X+1)/2 - r.Min.X/2
 		ch = (r.Max.Y+1)/2 - r.Min.Y/2
+	case YCbCrSubsampleRatio440:
+		cw = w
+		ch = (r.Max.Y+1)/2 - r.Min.Y/2
 	default:
 		// Default to 4:4:4 subsampling.
 		cw = w
diff --git a/src/pkg/image/ycbcr_test.go b/src/pkg/image/ycbcr_test.go
index 5fa95be3e..a5f448265 100644
--- a/src/pkg/image/ycbcr_test.go
+++ b/src/pkg/image/ycbcr_test.go
@@ -36,6 +36,7 @@ func TestYCbCr(t *testing.T) {
 		YCbCrSubsampleRatio444,
 		YCbCrSubsampleRatio422,
 		YCbCrSubsampleRatio420,
+		YCbCrSubsampleRatio440,
 	}
 	deltas := []Point{
 		Pt(0, 0),