Diffstat (limited to 'src/cmd/internal/rsc.io/x86/x86asm/decode.go')
-rw-r--r--  src/cmd/internal/rsc.io/x86/x86asm/decode.go  1616
1 file changed, 1616 insertions, 0 deletions
diff --git a/src/cmd/internal/rsc.io/x86/x86asm/decode.go b/src/cmd/internal/rsc.io/x86/x86asm/decode.go
new file mode 100644
index 000000000..91e8876c8
--- /dev/null
+++ b/src/cmd/internal/rsc.io/x86/x86asm/decode.go
@@ -0,0 +1,1616 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Table-driven decoding of x86 instructions.
+
+package x86asm
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "runtime"
+)
+
+// Set trace to true to cause the decoder to print the PC sequence
+// of the executed decoder bytecodes. This is typically only useful
+// when you are running a test of a single input case.
+const trace = false
+
+// A decodeOp is a single instruction in the decoder bytecode program.
+//
+// The decodeOps correspond to consuming and conditionally branching
+// on input bytes, consuming additional fields, and then interpreting
+// consumed data as instruction arguments. The names of the xRead and xArg
+// operations are taken from the Intel manual conventions, for example
+// Volume 2, Section 3.1.1, page 487 of
+// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
+//
+// The actual decoding program is generated by ../x86map.
+//
+// TODO(rsc): We may be able to merge several of the memory operand forms,
+// since we don't care about, say, the distinction between m80dec and m80bcd.
+// Similarly, mm and mm1 have identical meaning, as do xmm and xmm1.
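+//
+// For illustration only (the real program is machine-generated and much
+// larger), a table that decoded just the one-byte NOP (0x90) might be
+// laid out, starting at pc 1, as:
+//
+//	/*1*/ uint16(xCondByte), 1, 0x90, 6,
+//	/*5*/ uint16(xFail),
+//	/*6*/ uint16(xSetOp), uint16(NOP),
+//	/*8*/ uint16(xMatch),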
+
+type decodeOp uint16
+
+const (
+ xFail decodeOp = iota // invalid instruction (return)
+ xMatch // completed match
+ xJump // jump to pc
+
+ xCondByte // switch on instruction byte value
+ xCondSlashR // read and switch on instruction /r value
+ xCondPrefix // switch on presence of instruction prefix
+ xCondIs64 // switch on 64-bit processor mode
+ xCondDataSize // switch on operand size
+ xCondAddrSize // switch on address size
+ xCondIsMem // switch on memory vs register argument
+
+ xSetOp // set instruction opcode
+
+ xReadSlashR // read /r
+ xReadIb // read ib
+ xReadIw // read iw
+ xReadId // read id
+ xReadIo // read io
+ xReadCb // read cb
+ xReadCw // read cw
+ xReadCd // read cd
+ xReadCp // read cp
+ xReadCm // read cm
+
+ xArg1 // arg 1
+ xArg3 // arg 3
+ xArgAL // arg AL
+ xArgAX // arg AX
+ xArgCL // arg CL
+ xArgCR0dashCR7 // arg CR0-CR7
+ xArgCS // arg CS
+ xArgDR0dashDR7 // arg DR0-DR7
+ xArgDS // arg DS
+ xArgDX // arg DX
+ xArgEAX // arg EAX
+ xArgEDX // arg EDX
+ xArgES // arg ES
+ xArgFS // arg FS
+ xArgGS // arg GS
+ xArgImm16 // arg imm16
+ xArgImm32 // arg imm32
+ xArgImm64 // arg imm64
+ xArgImm8 // arg imm8
+ xArgImm8u // arg imm8 but record as unsigned
+ xArgImm16u // arg imm16 but record as unsigned
+ xArgM // arg m
+ xArgM128 // arg m128
+ xArgM1428byte // arg m14/28byte
+ xArgM16 // arg m16
+ xArgM16and16 // arg m16&16
+ xArgM16and32 // arg m16&32
+ xArgM16and64 // arg m16&64
+ xArgM16colon16 // arg m16:16
+ xArgM16colon32 // arg m16:32
+ xArgM16colon64 // arg m16:64
+ xArgM16int // arg m16int
+ xArgM2byte // arg m2byte
+ xArgM32 // arg m32
+ xArgM32and32 // arg m32&32
+ xArgM32fp // arg m32fp
+ xArgM32int // arg m32int
+ xArgM512byte // arg m512byte
+ xArgM64 // arg m64
+ xArgM64fp // arg m64fp
+ xArgM64int // arg m64int
+ xArgM8 // arg m8
+ xArgM80bcd // arg m80bcd
+ xArgM80dec // arg m80dec
+ xArgM80fp // arg m80fp
+ xArgM94108byte // arg m94/108byte
+ xArgMm // arg mm
+ xArgMm1 // arg mm1
+ xArgMm2 // arg mm2
+ xArgMm2M64 // arg mm2/m64
+ xArgMmM32 // arg mm/m32
+ xArgMmM64 // arg mm/m64
+ xArgMem // arg mem
+ xArgMoffs16 // arg moffs16
+ xArgMoffs32 // arg moffs32
+ xArgMoffs64 // arg moffs64
+ xArgMoffs8 // arg moffs8
+ xArgPtr16colon16 // arg ptr16:16
+ xArgPtr16colon32 // arg ptr16:32
+ xArgR16 // arg r16
+ xArgR16op // arg r16 with +rw in opcode
+ xArgR32 // arg r32
+ xArgR32M16 // arg r32/m16
+ xArgR32M8 // arg r32/m8
+ xArgR32op // arg r32 with +rd in opcode
+ xArgR64 // arg r64
+ xArgR64M16 // arg r64/m16
+ xArgR64op // arg r64 with +rd in opcode
+ xArgR8 // arg r8
+ xArgR8op // arg r8 with +rb in opcode
+ xArgRAX // arg RAX
+ xArgRDX // arg RDX
+ xArgRM // arg r/m
+ xArgRM16 // arg r/m16
+ xArgRM32 // arg r/m32
+ xArgRM64 // arg r/m64
+ xArgRM8 // arg r/m8
+ xArgReg // arg reg
+ xArgRegM16 // arg reg/m16
+ xArgRegM32 // arg reg/m32
+ xArgRegM8 // arg reg/m8
+ xArgRel16 // arg rel16
+ xArgRel32 // arg rel32
+ xArgRel8 // arg rel8
+ xArgSS // arg SS
+ xArgST // arg ST, aka ST(0)
+ xArgSTi // arg ST(i) with +i in opcode
+ xArgSreg // arg Sreg
+ xArgTR0dashTR7 // arg TR0-TR7
+ xArgXmm // arg xmm
+ xArgXMM0 // arg <XMM0>
+ xArgXmm1 // arg xmm1
+ xArgXmm2 // arg xmm2
+ xArgXmm2M128 // arg xmm2/m128
+ xArgXmm2M16 // arg xmm2/m16
+ xArgXmm2M32 // arg xmm2/m32
+ xArgXmm2M64 // arg xmm2/m64
+ xArgXmmM128 // arg xmm/m128
+ xArgXmmM32 // arg xmm/m32
+ xArgXmmM64 // arg xmm/m64
+ xArgRmf16 // arg r/m16 but force mod=3
+ xArgRmf32 // arg r/m32 but force mod=3
+ xArgRmf64 // arg r/m64 but force mod=3
+)
+
+// instPrefix returns an Inst describing just one prefix byte.
+// It is only used if there is a prefix followed by an unintelligible
+// or invalid instruction byte sequence.
+func instPrefix(b byte, mode int) (Inst, error) {
+ // When tracing it is useful to see what called instPrefix to report an error.
+ if trace {
+ _, file, line, _ := runtime.Caller(1)
+ fmt.Printf("%s:%d\n", file, line)
+ }
+ p := Prefix(b)
+ switch p {
+ case PrefixDataSize:
+ if mode == 16 {
+ p = PrefixData32
+ } else {
+ p = PrefixData16
+ }
+ case PrefixAddrSize:
+ if mode == 32 {
+ p = PrefixAddr16
+ } else {
+ p = PrefixAddr32
+ }
+ }
+ // Note: using composite literal with Prefix key confuses 'bundle' tool.
+ inst := Inst{Len: 1}
+ inst.Prefix = Prefixes{p}
+ return inst, nil
+}
+
+// truncated reports a truncated instruction.
+// For now we use instPrefix but perhaps later we will return
+// a specific error here.
+func truncated(src []byte, mode int) (Inst, error) {
+ // return Inst{}, len(src), ErrTruncated
+ return instPrefix(src[0], mode) // too short
+}
+
+// These are the errors returned by Decode.
+var (
+ ErrInvalidMode = errors.New("invalid x86 mode in Decode")
+ ErrTruncated = errors.New("truncated instruction")
+ ErrUnrecognized = errors.New("unrecognized instruction")
+)
+
+// decoderCover records coverage information for which parts
+// of the byte code have been executed.
+// TODO(rsc): This is for testing. Only use this if a flag is given.
+var decoderCover []bool
+
+// Decode decodes the leading bytes in src as a single instruction.
+// The mode argument specifies the assumed processor mode:
+// 16, 32, or 64 for 16-, 32-, and 64-bit execution modes.
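+//
+// A minimal usage sketch (error handling elided):
+//
+//	inst, err := Decode(code, 64)
+//	if err == nil {
+//		fmt.Println(inst.Len, inst.Op, inst.Args)
+//	}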
+func Decode(src []byte, mode int) (inst Inst, err error) {
+ return decode1(src, mode, false)
+}
+
+// decode1 is the implementation of Decode but takes an extra
+// gnuCompat flag to cause it to change its behavior to mimic
+// bugs (or at least unique features) of GNU libopcodes as used
+// by objdump. We don't believe that logic is the right thing to do
+// in general, but when testing against libopcodes it simplifies the
+// comparison if we adjust a few small pieces of logic.
+// The affected logic is in the conditional branch for "mandatory" prefixes,
+// case xCondPrefix.
+func decode1(src []byte, mode int, gnuCompat bool) (Inst, error) {
+ switch mode {
+ case 16, 32, 64:
+ // ok
+ // TODO(rsc): 64-bit mode not tested, probably not working.
+ default:
+ return Inst{}, ErrInvalidMode
+ }
+
+ // Maximum instruction size is 15 bytes.
+ // If we need to read more, return 'truncated instruction'.
+ if len(src) > 15 {
+ src = src[:15]
+ }
+
+ var (
+ // prefix decoding information
+ pos = 0 // position reading src
+ nprefix = 0 // number of prefixes
+ lockIndex = -1 // index of LOCK prefix in src and inst.Prefix
+ repIndex = -1 // index of REP/REPN prefix in src and inst.Prefix
+ segIndex = -1 // index of Group 2 prefix in src and inst.Prefix
+ dataSizeIndex = -1 // index of Group 3 prefix in src and inst.Prefix
+ addrSizeIndex = -1 // index of Group 4 prefix in src and inst.Prefix
+ rex Prefix // rex byte if present (or 0)
+ rexUsed Prefix // bits used in rex byte
+ rexIndex = -1 // index of rex byte
+
+ addrMode = mode // address mode (width in bits)
+ dataMode = mode // operand mode (width in bits)
+
+ // decoded ModR/M fields
+ haveModrm bool
+ modrm int
+ mod int
+ regop int
+ rm int
+
+ // if ModR/M is memory reference, Mem form
+ mem Mem
+ haveMem bool
+
+ // decoded SIB fields
+ haveSIB bool
+ sib int
+ scale int
+ index int
+ base int
+
+ // decoded immediate values
+ imm int64
+ imm8 int8
+ immc int64
+
+ // output
+ opshift int
+ inst Inst
+ narg int // number of arguments written to inst
+ )
+
+ if mode == 64 {
+ dataMode = 32
+ }
+
+ // Prefixes are certainly the most complex and underspecified part of
+ // decoding x86 instructions. Although the manuals say things like
+ // up to four prefixes, one from each group, nearly everyone seems to
+ // agree that in practice as many prefixes as possible, including multiple
+ // from a particular group or repetitions of a given prefix, can be used on
+ // an instruction, provided the total instruction length including prefixes
+ // does not exceed the agreed-upon maximum of 15 bytes.
+ // Everyone also agrees that if one of these prefixes is the LOCK prefix
+ // and the instruction is not one of the instructions that can be used with
+ // the LOCK prefix or if the destination is not a memory operand,
+ // then the instruction is invalid and produces the #UD exception.
+ // However, that is the end of any semblance of agreement.
+ //
+ // What happens if prefixes are given that conflict with other prefixes?
+ // For example, the memory segment overrides CS, DS, ES, FS, GS, SS
+ // conflict with each other: only one segment can be in effect.
+ // Disassemblers seem to agree that later prefixes take priority over
+ // earlier ones. I have not taken the time to write assembly programs
+ // to check to see if the hardware agrees.
+ //
+ // What happens if prefixes are given that have no meaning for the
+ // specific instruction to which they are attached? It depends.
+ // If they really have no meaning, they are ignored. However, a future
+ // processor may assign a different meaning. As a disassembler, we
+ // don't really know whether we're seeing a meaningless prefix or one
+ // whose meaning we simply haven't been told yet.
+ //
+ // Combining the two questions, what happens when conflicting
+ // extension prefixes are given? No one seems to know for sure.
+ // For example, MOVQ is 66 0F D6 /r, MOVDQ2Q is F2 0F D6 /r,
+ // and MOVQ2DQ is F3 0F D6 /r. What is '66 F2 F3 0F D6 /r'?
+ // Which prefix wins? See the xCondPrefix case below for more.
+ //
+ // Writing assembly test cases to divine which interpretation the
+ // CPU uses might clarify the situation, but more likely it would
+ // make the situation even less clear.
+
+ // Read non-REX prefixes.
+ReadPrefixes:
+ for ; pos < len(src); pos++ {
+ p := Prefix(src[pos])
+ switch p {
+ default:
+ nprefix = pos
+ break ReadPrefixes
+
+ // Group 1 - lock and repeat prefixes
+ // According to Intel, there should only be one from this set,
+ // but according to AMD both can be present.
+ case 0xF0:
+ if lockIndex >= 0 {
+ inst.Prefix[lockIndex] |= PrefixIgnored
+ }
+ lockIndex = pos
+ case 0xF2, 0xF3:
+ if repIndex >= 0 {
+ inst.Prefix[repIndex] |= PrefixIgnored
+ }
+ repIndex = pos
+
+ // Group 2 - segment override / branch hints
+ case 0x26, 0x2E, 0x36, 0x3E:
+ if mode == 64 {
+ p |= PrefixIgnored
+ break
+ }
+ fallthrough
+ case 0x64, 0x65:
+ if segIndex >= 0 {
+ inst.Prefix[segIndex] |= PrefixIgnored
+ }
+ segIndex = pos
+
+ // Group 3 - operand size override
+ case 0x66:
+ if mode == 16 {
+ dataMode = 32
+ p = PrefixData32
+ } else {
+ dataMode = 16
+ p = PrefixData16
+ }
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] |= PrefixIgnored
+ }
+ dataSizeIndex = pos
+
+ // Group 4 - address size override
+ case 0x67:
+ if mode == 32 {
+ addrMode = 16
+ p = PrefixAddr16
+ } else {
+ addrMode = 32
+ p = PrefixAddr32
+ }
+ if addrSizeIndex >= 0 {
+ inst.Prefix[addrSizeIndex] |= PrefixIgnored
+ }
+ addrSizeIndex = pos
+ }
+
+ if pos >= len(inst.Prefix) {
+ return instPrefix(src[0], mode) // too long
+ }
+
+ inst.Prefix[pos] = p
+ }
+
+ // Read REX prefix.
+ if pos < len(src) && mode == 64 && Prefix(src[pos]).IsREX() {
+ rex = Prefix(src[pos])
+ rexIndex = pos
+ if pos >= len(inst.Prefix) {
+ return instPrefix(src[0], mode) // too long
+ }
+ inst.Prefix[pos] = rex
+ pos++
+ if rex&PrefixREXW != 0 {
+ dataMode = 64
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] |= PrefixIgnored
+ }
+ }
+ }
+
+ // Decode instruction stream, interpreting decoding instructions.
+ // opshift gives the shift to use when saving the next
+ // opcode byte into inst.Opcode.
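+ // Each matched byte is packed in from the top: the first opcode
+ // byte lands in bits 24-31, the next in bits 16-23, and so on,
+ // so a two-byte opcode such as 0F AE is stored as 0x0FAE0000.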
+ opshift = 24
+ if decoderCover == nil {
+ decoderCover = make([]bool, len(decoder))
+ }
+
+ // Decode loop, executing decoder program.
+ var oldPC, prevPC int
+Decode:
+ for pc := 1; ; { // TODO uint
+ oldPC = prevPC
+ prevPC = pc
+ if trace {
+ println("run", pc)
+ }
+ x := decoder[pc]
+ decoderCover[pc] = true
+ pc++
+
+ // Read and decode ModR/M if needed by opcode.
+ switch decodeOp(x) {
+ case xCondSlashR, xReadSlashR:
+ if haveModrm {
+ return Inst{Len: pos}, errInternal
+ }
+ haveModrm = true
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ modrm = int(src[pos])
+ pos++
+ if opshift >= 0 {
+ inst.Opcode |= uint32(modrm) << uint(opshift)
+ opshift -= 8
+ }
+ mod = modrm >> 6
+ regop = (modrm >> 3) & 07
+ rm = modrm & 07
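+ // For example, the ModR/M byte 0x5D (binary 01 011 101)
+ // splits into mod=1, regop=3, rm=5.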
+ if rex&PrefixREXR != 0 {
+ rexUsed |= PrefixREXR
+ regop |= 8
+ }
+ if addrMode == 16 {
+ // 16-bit modrm form
+ if mod != 3 {
+ haveMem = true
+ mem = addr16[rm]
+ if rm == 6 && mod == 0 {
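+ // mod=0, rm=6 is a bare disp16 with no base register,
+ // not [BP] (Intel Vol 2A, Table 2-1).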
+ mem.Base = 0
+ }
+
+ // Consume disp16 if present.
+ if mod == 0 && rm == 6 || mod == 2 {
+ if pos+2 > len(src) {
+ return truncated(src, mode)
+ }
+ mem.Disp = int64(binary.LittleEndian.Uint16(src[pos:]))
+ pos += 2
+ }
+
+ // Consume disp8 if present.
+ if mod == 1 {
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ mem.Disp = int64(int8(src[pos]))
+ pos++
+ }
+ }
+ } else {
+ haveMem = mod != 3
+
+ // 32-bit or 64-bit form
+ // Consume SIB encoding if present.
+ if rm == 4 && mod != 3 {
+ haveSIB = true
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ sib = int(src[pos])
+ pos++
+ if opshift >= 0 {
+ inst.Opcode |= uint32(sib) << uint(opshift)
+ opshift -= 8
+ }
+ scale = sib >> 6
+ index = (sib >> 3) & 07
+ base = sib & 07
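+ // For example, SIB byte 0x98 (binary 10 011 000) gives
+ // scale=2 (factor 4), index=3, base=0: [EAX+EBX*4]
+ // in 32-bit mode, absent any REX bits.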
+ if rex&PrefixREXB != 0 {
+ rexUsed |= PrefixREXB
+ base |= 8
+ }
+ if rex&PrefixREXX != 0 {
+ rexUsed |= PrefixREXX
+ index |= 8
+ }
+
+ mem.Scale = 1 << uint(scale)
+ if index == 4 {
+ // no mem.Index
+ } else {
+ mem.Index = baseRegForBits(addrMode) + Reg(index)
+ }
+ if base&7 == 5 && mod == 0 {
+ // no mem.Base
+ } else {
+ mem.Base = baseRegForBits(addrMode) + Reg(base)
+ }
+ } else {
+ if rex&PrefixREXB != 0 {
+ rexUsed |= PrefixREXB
+ rm |= 8
+ }
+ if mod == 0 && rm&7 == 5 || rm&7 == 4 {
+ // base omitted
+ } else if mod != 3 {
+ mem.Base = baseRegForBits(addrMode) + Reg(rm)
+ }
+ }
+
+ // Consume disp32 if present.
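+ // A disp32 follows for mod=2, for mod=0 with rm=5 (absolute,
+ // or RIP-relative in 64-bit mode), and for mod=0 with a SIB
+ // byte whose base field is 5.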
+ if mod == 0 && (rm&7 == 5 || haveSIB && base&7 == 5) || mod == 2 {
+ if pos+4 > len(src) {
+ return truncated(src, mode)
+ }
+ mem.Disp = int64(binary.LittleEndian.Uint32(src[pos:]))
+ pos += 4
+ }
+
+ // Consume disp8 if present.
+ if mod == 1 {
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ mem.Disp = int64(int8(src[pos]))
+ pos++
+ }
+
+ // In 64-bit, mod=0 rm=5 is PC-relative instead of just disp.
+ // See Vol 2A. Table 2-7.
+ if mode == 64 && mod == 0 && rm&7 == 5 {
+ if addrMode == 32 {
+ mem.Base = EIP
+ } else {
+ mem.Base = RIP
+ }
+ }
+ }
+
+ if segIndex >= 0 {
+ mem.Segment = prefixToSegment(inst.Prefix[segIndex])
+ }
+ }
+
+ // Execute single opcode.
+ switch decodeOp(x) {
+ default:
+ println("bad op", x, "at", pc-1, "from", oldPC)
+ return Inst{Len: pos}, errInternal
+
+ case xFail:
+ inst.Op = 0
+ break Decode
+
+ case xMatch:
+ break Decode
+
+ case xJump:
+ pc = int(decoder[pc])
+
+ // Conditional branches.
+
+ case xCondByte:
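+ // Bytecode layout: xCondByte, n, then n (byte, target-pc)
+ // pairs, then the fallthrough instruction.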
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ b := src[pos]
+ n := int(decoder[pc])
+ pc++
+ for i := 0; i < n; i++ {
+ xb, xpc := decoder[pc], int(decoder[pc+1])
+ pc += 2
+ if b == byte(xb) {
+ pc = xpc
+ pos++
+ if opshift >= 0 {
+ inst.Opcode |= uint32(b) << uint(opshift)
+ opshift -= 8
+ }
+ continue Decode
+ }
+ }
+ // xCondByte is the only conditional with a fall through,
+ // so that it can be used to pick off special cases before
+ // an xCondSlashR. If the fallthrough instruction is xFail,
+ // advance the position so that the decoded instruction
+ // size includes the byte we just compared against.
+ if decodeOp(decoder[pc]) == xJump {
+ pc = int(decoder[pc+1])
+ }
+ if decodeOp(decoder[pc]) == xFail {
+ pos++
+ }
+
+ case xCondIs64:
+ if mode == 64 {
+ pc = int(decoder[pc+1])
+ } else {
+ pc = int(decoder[pc])
+ }
+
+ case xCondIsMem:
+ mem := haveMem
+ if !haveModrm {
+ if pos >= len(src) {
+ return instPrefix(src[0], mode) // too long
+ }
+ mem = src[pos]>>6 != 3
+ }
+ if mem {
+ pc = int(decoder[pc+1])
+ } else {
+ pc = int(decoder[pc])
+ }
+
+ case xCondDataSize:
+ switch dataMode {
+ case 16:
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] |= PrefixImplicit
+ }
+ pc = int(decoder[pc])
+ case 32:
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] |= PrefixImplicit
+ }
+ pc = int(decoder[pc+1])
+ case 64:
+ rexUsed |= PrefixREXW
+ pc = int(decoder[pc+2])
+ }
+
+ case xCondAddrSize:
+ switch addrMode {
+ case 16:
+ if addrSizeIndex >= 0 {
+ inst.Prefix[addrSizeIndex] |= PrefixImplicit
+ }
+ pc = int(decoder[pc])
+ case 32:
+ if addrSizeIndex >= 0 {
+ inst.Prefix[addrSizeIndex] |= PrefixImplicit
+ }
+ pc = int(decoder[pc+1])
+ case 64:
+ pc = int(decoder[pc+2])
+ }
+
+ case xCondPrefix:
+ // Conditional branch based on presence or absence of prefixes.
+ // The conflict cases here are completely undocumented and
+ // differ significantly between GNU libopcodes and Intel xed.
+ // I have not written assembly code to divine what various CPUs
+ // do, but it wouldn't surprise me if they are not consistent either.
+ //
+ // The basic idea is to switch on the presence of a prefix, so that
+ // for example:
+ //
+ // xCondPrefix, 4
+ // 0xF3, 123,
+ // 0xF2, 234,
+ // 0x66, 345,
+ // 0, 456
+ //
+ // branch to 123 if the F3 prefix is present, to 234 if the F2 prefix
+ // is present, to 345 if the 66 prefix is present, and to 456 otherwise.
+ // The prefixes are given in descending order so that the 0 will be last.
+ //
+ // It is unclear what should happen if multiple conditions are
+ // satisfied: what if F2 and F3 are both present, or if 66 and F2
+ // are present, or if all three are present? The one chosen becomes
+ // part of the opcode and the others do not. Perhaps the answer
+ // depends on the specific opcodes in question.
+ //
+ // The only clear example is that CRC32 is F2 0F 38 F1 /r, and
+ // it comes in 16-bit and 32-bit forms based on the 66 prefix,
+ // so 66 F2 0F 38 F1 /r should be treated as F2 taking priority,
+ // with the 66 being only an operand size override, and probably
+ // F2 66 0F 38 F1 /r should be treated the same.
+ // Perhaps that rule is specific to the case of CRC32, since no
+ // 66 0F 38 F1 instruction is defined (today) (that we know of).
+ // However, both libopcodes and xed seem to generalize this
+ // example and choose F2/F3 in preference to 66, and we
+ // do the same.
+ //
+ // Next, what if both F2 and F3 are present? Which wins?
+ // The Intel xed rule, and ours, is that the one that occurs last wins.
+ // The GNU libopcodes rule, which we implement only in gnuCompat mode,
+ // is that F3 beats F2 unless F3 has no special meaning, in which
+ // case F3 can be a modifier on an F2 special meaning.
+ //
+ // Concretely,
+ // 66 0F D6 /r is MOVQ
+ // F2 0F D6 /r is MOVDQ2Q
+ // F3 0F D6 /r is MOVQ2DQ.
+ //
+ // F2 66 0F D6 /r is 66 + MOVDQ2Q always.
+ // 66 F2 0F D6 /r is 66 + MOVDQ2Q always.
+ // F3 66 0F D6 /r is 66 + MOVQ2DQ always.
+ // 66 F3 0F D6 /r is 66 + MOVQ2DQ always.
+ // F2 F3 0F D6 /r is F2 + MOVQ2DQ always.
+ // F3 F2 0F D6 /r is F3 + MOVQ2DQ in Intel xed, but F2 + MOVQ2DQ in GNU libopcodes.
+ // Adding 66 anywhere in the prefix section of the
+ // last two cases does not change the outcome.
+ //
+ // Finally, what if there is a variant in which 66 is a mandatory
+ // prefix rather than an operand size override, but we know of
+ // no corresponding F2/F3 form, and we see both F2/F3 and 66.
+ // Does F2/F3 still take priority, so that the result is an unknown
+ // instruction, or does the 66 take priority, so that the extended
+ // 66 instruction should be interpreted as having a REP/REPN prefix?
+ // Intel xed does the former and GNU libopcodes does the latter.
+ // We side with Intel xed, unless we are trying to match libopcodes
+ // more closely during the comparison-based test suite.
+ //
+ // In 64-bit mode REX.W is another valid prefix to test for, but
+ // there is less ambiguity about that. When present, REX.W is
+ // always the first entry in the table.
+ n := int(decoder[pc])
+ pc++
+ sawF3 := false
+ for j := 0; j < n; j++ {
+ prefix := Prefix(decoder[pc+2*j])
+ if prefix.IsREX() {
+ rexUsed |= prefix
+ if rex&prefix == prefix {
+ pc = int(decoder[pc+2*j+1])
+ continue Decode
+ }
+ continue
+ }
+ ok := false
+ if prefix == 0 {
+ ok = true
+ } else if prefix.IsREX() {
+ rexUsed |= prefix
+ if rex&prefix == prefix {
+ ok = true
+ }
+ } else {
+ if prefix == 0xF3 {
+ sawF3 = true
+ }
+ switch prefix {
+ case PrefixLOCK:
+ if lockIndex >= 0 {
+ inst.Prefix[lockIndex] |= PrefixImplicit
+ ok = true
+ }
+ case PrefixREP, PrefixREPN:
+ if repIndex >= 0 && inst.Prefix[repIndex]&0xFF == prefix {
+ inst.Prefix[repIndex] |= PrefixImplicit
+ ok = true
+ }
+ if gnuCompat && !ok && prefix == 0xF3 && repIndex >= 0 && (j+1 >= n || decoder[pc+2*(j+1)] != 0xF2) {
+ // Check to see if earlier prefix F3 is present.
+ for i := repIndex - 1; i >= 0; i-- {
+ if inst.Prefix[i]&0xFF == prefix {
+ inst.Prefix[i] |= PrefixImplicit
+ ok = true
+ }
+ }
+ }
+ if gnuCompat && !ok && prefix == 0xF2 && repIndex >= 0 && !sawF3 && inst.Prefix[repIndex]&0xFF == 0xF3 {
+ // Check to see if earlier prefix F2 is present.
+ for i := repIndex - 1; i >= 0; i-- {
+ if inst.Prefix[i]&0xFF == prefix {
+ inst.Prefix[i] |= PrefixImplicit
+ ok = true
+ }
+ }
+ }
+ case PrefixCS, PrefixDS, PrefixES, PrefixFS, PrefixGS, PrefixSS:
+ if segIndex >= 0 && inst.Prefix[segIndex]&0xFF == prefix {
+ inst.Prefix[segIndex] |= PrefixImplicit
+ ok = true
+ }
+ case PrefixDataSize:
+ // Looking for 66 mandatory prefix.
+ // The F2/F3 mandatory prefixes take priority when both are present.
+ // If we got this far in the xCondPrefix table and an F2/F3 is present,
+ // it means the table didn't have any entry for that prefix. But if 66 has
+ // special meaning, perhaps F2/F3 have special meaning that we don't know.
+ // Intel xed works this way, treating the F2/F3 as inhibiting the 66.
+ // GNU libopcodes allows the 66 to match. We do what Intel xed does
+ // except in gnuCompat mode.
+ if repIndex >= 0 && !gnuCompat {
+ inst.Op = 0
+ break Decode
+ }
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] |= PrefixImplicit
+ ok = true
+ }
+ case PrefixAddrSize:
+ if addrSizeIndex >= 0 {
+ inst.Prefix[addrSizeIndex] |= PrefixImplicit
+ ok = true
+ }
+ }
+ }
+ if ok {
+ pc = int(decoder[pc+2*j+1])
+ continue Decode
+ }
+ }
+ inst.Op = 0
+ break Decode
+
+ case xCondSlashR:
+ pc = int(decoder[pc+regop&7])
+
+ // Input.
+
+ case xReadSlashR:
+ // done above
+
+ case xReadIb:
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ imm8 = int8(src[pos])
+ pos++
+
+ case xReadIw:
+ if pos+2 > len(src) {
+ return truncated(src, mode)
+ }
+ imm = int64(binary.LittleEndian.Uint16(src[pos:]))
+ pos += 2
+
+ case xReadId:
+ if pos+4 > len(src) {
+ return truncated(src, mode)
+ }
+ imm = int64(binary.LittleEndian.Uint32(src[pos:]))
+ pos += 4
+
+ case xReadIo:
+ if pos+8 > len(src) {
+ return truncated(src, mode)
+ }
+ imm = int64(binary.LittleEndian.Uint64(src[pos:]))
+ pos += 8
+
+ case xReadCb:
+ if pos >= len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(src[pos])
+ pos++
+
+ case xReadCw:
+ if pos+2 > len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(binary.LittleEndian.Uint16(src[pos:]))
+ pos += 2
+
+ case xReadCm:
+ if addrMode == 16 {
+ if pos+2 > len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(binary.LittleEndian.Uint16(src[pos:]))
+ pos += 2
+ } else if addrMode == 32 {
+ if pos+4 > len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(binary.LittleEndian.Uint32(src[pos:]))
+ pos += 4
+ } else {
+ if pos+8 > len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(binary.LittleEndian.Uint64(src[pos:]))
+ pos += 8
+ }
+ case xReadCd:
+ if pos+4 > len(src) {
+ return truncated(src, mode)
+ }
+ immc = int64(binary.LittleEndian.Uint32(src[pos:]))
+ pos += 4
+
+ case xReadCp:
+ if pos+6 > len(src) {
+ return truncated(src, mode)
+ }
+ w := binary.LittleEndian.Uint32(src[pos:])
+ w2 := binary.LittleEndian.Uint16(src[pos+4:])
+ immc = int64(w2)<<32 | int64(w)
+ pos += 6
+
+ // Output.
+
+ case xSetOp:
+ inst.Op = Op(decoder[pc])
+ pc++
+
+ case xArg1,
+ xArg3,
+ xArgAL,
+ xArgAX,
+ xArgCL,
+ xArgCS,
+ xArgDS,
+ xArgDX,
+ xArgEAX,
+ xArgEDX,
+ xArgES,
+ xArgFS,
+ xArgGS,
+ xArgRAX,
+ xArgRDX,
+ xArgSS,
+ xArgST,
+ xArgXMM0:
+ inst.Args[narg] = fixedArg[x]
+ narg++
+
+ case xArgImm8:
+ inst.Args[narg] = Imm(imm8)
+ narg++
+
+ case xArgImm8u:
+ inst.Args[narg] = Imm(uint8(imm8))
+ narg++
+
+ case xArgImm16:
+ inst.Args[narg] = Imm(int16(imm))
+ narg++
+
+ case xArgImm16u:
+ inst.Args[narg] = Imm(uint16(imm))
+ narg++
+
+ case xArgImm32:
+ inst.Args[narg] = Imm(int32(imm))
+ narg++
+
+ case xArgImm64:
+ inst.Args[narg] = Imm(imm)
+ narg++
+
+ case xArgM,
+ xArgM128,
+ xArgM1428byte,
+ xArgM16,
+ xArgM16and16,
+ xArgM16and32,
+ xArgM16and64,
+ xArgM16colon16,
+ xArgM16colon32,
+ xArgM16colon64,
+ xArgM16int,
+ xArgM2byte,
+ xArgM32,
+ xArgM32and32,
+ xArgM32fp,
+ xArgM32int,
+ xArgM512byte,
+ xArgM64,
+ xArgM64fp,
+ xArgM64int,
+ xArgM8,
+ xArgM80bcd,
+ xArgM80dec,
+ xArgM80fp,
+ xArgM94108byte,
+ xArgMem:
+ if !haveMem {
+ inst.Op = 0
+ break Decode
+ }
+ inst.Args[narg] = mem
+ inst.MemBytes = int(memBytes[decodeOp(x)])
+ narg++
+
+ case xArgPtr16colon16:
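+ // The immediate stores offset then segment (little-endian),
+ // so the high 16 bits hold the segment and the low 16 bits
+ // the offset.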
+ inst.Args[narg] = Imm(immc >> 16)
+ inst.Args[narg+1] = Imm(immc & (1<<16 - 1))
+ narg += 2
+
+ case xArgPtr16colon32:
+ inst.Args[narg] = Imm(immc >> 32)
+ inst.Args[narg+1] = Imm(immc & (1<<32 - 1))
+ narg += 2
+
+ case xArgMoffs8, xArgMoffs16, xArgMoffs32, xArgMoffs64:
+ // TODO(rsc): Can address be 64 bits?
+ mem = Mem{Disp: int64(immc)}
+ if segIndex >= 0 {
+ mem.Segment = prefixToSegment(inst.Prefix[segIndex])
+ inst.Prefix[segIndex] |= PrefixImplicit
+ }
+ inst.Args[narg] = mem
+ inst.MemBytes = int(memBytes[decodeOp(x)])
+ narg++
+
+ case xArgR8, xArgR16, xArgR32, xArgR64, xArgXmm, xArgXmm1, xArgDR0dashDR7:
+ base := baseReg[x]
+ index := Reg(regop)
+ if rex != 0 && base == AL && index >= 4 {
+ rexUsed |= PrefixREX
+ index -= 4
+ base = SPB
+ }
+ inst.Args[narg] = base + index
+ narg++
+
+ case xArgMm, xArgMm1, xArgTR0dashTR7:
+ inst.Args[narg] = baseReg[x] + Reg(regop&7)
+ narg++
+
+ case xArgCR0dashCR7:
+ // AMD documents an extension that the LOCK prefix
+ // can be used in place of a REX prefix in order to access
+ // CR8 from 32-bit mode. The LOCK prefix is allowed in
+ // all modes, provided the corresponding CPUID bit is set.
+ if lockIndex >= 0 {
+ inst.Prefix[lockIndex] |= PrefixImplicit
+ regop += 8
+ }
+ inst.Args[narg] = CR0 + Reg(regop)
+ narg++
+
+ case xArgSreg:
+ regop &= 7
+ if regop >= 6 {
+ inst.Op = 0
+ break Decode
+ }
+ inst.Args[narg] = ES + Reg(regop)
+ narg++
+
+ case xArgRmf16, xArgRmf32, xArgRmf64:
+ base := baseReg[x]
+ index := Reg(modrm & 07)
+ if rex&PrefixREXB != 0 {
+ rexUsed |= PrefixREXB
+ index += 8
+ }
+ inst.Args[narg] = base + index
+ narg++
+
+ case xArgR8op, xArgR16op, xArgR32op, xArgR64op, xArgSTi:
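+ // The register index is the low three bits of the most
+ // recently matched opcode byte (the +rb/+rw/+rd/+i
+ // convention), which sits 8 bits above the current opshift.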
+ n := inst.Opcode >> uint(opshift+8) & 07
+ base := baseReg[x]
+ index := Reg(n)
+ if rex&PrefixREXB != 0 && decodeOp(x) != xArgSTi {
+ rexUsed |= PrefixREXB
+ index += 8
+ }
+ if rex != 0 && base == AL && index >= 4 {
+ rexUsed |= PrefixREX
+ index -= 4
+ base = SPB
+ }
+ inst.Args[narg] = base + index
+ narg++
+
+ case xArgRM8, xArgRM16, xArgRM32, xArgRM64, xArgR32M16, xArgR32M8, xArgR64M16,
+ xArgMmM32, xArgMmM64, xArgMm2M64,
+ xArgXmm2M16, xArgXmm2M32, xArgXmm2M64, xArgXmmM64, xArgXmmM128, xArgXmmM32, xArgXmm2M128:
+ if haveMem {
+ inst.Args[narg] = mem
+ inst.MemBytes = int(memBytes[decodeOp(x)])
+ } else {
+ base := baseReg[x]
+ index := Reg(rm)
+ switch decodeOp(x) {
+ case xArgMmM32, xArgMmM64, xArgMm2M64:
+ // There are only 8 MMX registers, so these ignore the REX.B bit.
+ index &= 7
+ case xArgRM8:
+ if rex != 0 && index >= 4 {
+ rexUsed |= PrefixREX
+ index -= 4
+ base = SPB
+ }
+ }
+ inst.Args[narg] = base + index
+ }
+ narg++
+
+ case xArgMm2: // register only; TODO(rsc): Handle with modrm_regonly tag
+ if haveMem {
+ inst.Op = 0
+ break Decode
+ }
+ inst.Args[narg] = baseReg[x] + Reg(rm&7)
+ narg++
+
+ case xArgXmm2: // register only; TODO(rsc): Handle with modrm_regonly tag
+ if haveMem {
+ inst.Op = 0
+ break Decode
+ }
+ inst.Args[narg] = baseReg[x] + Reg(rm)
+ narg++
+
+ case xArgRel8:
+ inst.Args[narg] = Rel(int8(immc))
+ narg++
+
+ case xArgRel16:
+ inst.Args[narg] = Rel(int16(immc))
+ narg++
+
+ case xArgRel32:
+ inst.Args[narg] = Rel(int32(immc))
+ narg++
+ }
+ }
+
+ if inst.Op == 0 {
+ // Invalid instruction.
+ if nprefix > 0 {
+ return instPrefix(src[0], mode) // invalid instruction
+ }
+ return Inst{Len: pos}, ErrUnrecognized
+ }
+
+ // Matched! Hooray!
+
+ // 90 decodes as XCHG EAX, EAX but is NOP.
+ // 66 90 decodes as XCHG AX, AX and is NOP too.
+ // 48 90 decodes as XCHG RAX, RAX and is NOP too.
+ // 43 90 decodes as XCHG R8D, EAX and is *not* NOP.
+ // F3 90 decodes as REP XCHG EAX, EAX but is PAUSE.
+ // It's all too special to handle in the decoding tables, at least for now.
+ if inst.Op == XCHG && inst.Opcode>>24 == 0x90 {
+ if inst.Args[0] == RAX || inst.Args[0] == EAX || inst.Args[0] == AX {
+ inst.Op = NOP
+ if dataSizeIndex >= 0 {
+ inst.Prefix[dataSizeIndex] &^= PrefixImplicit
+ }
+ inst.Args[0] = nil
+ inst.Args[1] = nil
+ }
+ if repIndex >= 0 && inst.Prefix[repIndex] == 0xF3 {
+ inst.Prefix[repIndex] |= PrefixImplicit
+ inst.Op = PAUSE
+ inst.Args[0] = nil
+ inst.Args[1] = nil
+ } else if gnuCompat {
+ for i := nprefix - 1; i >= 0; i-- {
+ if inst.Prefix[i]&0xFF == 0xF3 {
+ inst.Prefix[i] |= PrefixImplicit
+ inst.Op = PAUSE
+ inst.Args[0] = nil
+ inst.Args[1] = nil
+ break
+ }
+ }
+ }
+ }
+
+ // defaultSeg returns the default segment for an implicit
+ // memory reference: the final override if present, or else DS.
+ defaultSeg := func() Reg {
+ if segIndex >= 0 {
+ inst.Prefix[segIndex] |= PrefixImplicit
+ return prefixToSegment(inst.Prefix[segIndex])
+ }
+ return DS
+ }
+
+ // Add implicit arguments not present in the tables.
+ // Normally we shy away from making implicit arguments explicit,
+ // following the Intel manuals, but adding the arguments seems
+ // the best way to express the effect of the segment override prefixes.
+ // TODO(rsc): Perhaps add these to the tables and
+ // create bytecode instructions for them.
+ usedAddrSize := false
+ switch inst.Op {
+ case INSB, INSW, INSD:
+ inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
+ inst.Args[1] = DX
+ usedAddrSize = true
+
+ case OUTSB, OUTSW, OUTSD:
+ inst.Args[0] = DX
+ inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
+ usedAddrSize = true
+
+ case MOVSB, MOVSW, MOVSD, MOVSQ:
+ inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
+ inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
+ usedAddrSize = true
+
+ case CMPSB, CMPSW, CMPSD, CMPSQ:
+ inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
+ inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
+ usedAddrSize = true
+
+ case LODSB, LODSW, LODSD, LODSQ:
+ switch inst.Op {
+ case LODSB:
+ inst.Args[0] = AL
+ case LODSW:
+ inst.Args[0] = AX
+ case LODSD:
+ inst.Args[0] = EAX
+ case LODSQ:
+ inst.Args[0] = RAX
+ }
+ inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
+ usedAddrSize = true
+
+ case STOSB, STOSW, STOSD, STOSQ:
+ inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
+ switch inst.Op {
+ case STOSB:
+ inst.Args[1] = AL
+ case STOSW:
+ inst.Args[1] = AX
+ case STOSD:
+ inst.Args[1] = EAX
+ case STOSQ:
+ inst.Args[1] = RAX
+ }
+ usedAddrSize = true
+
+ case SCASB, SCASW, SCASD, SCASQ:
+ inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
+ switch inst.Op {
+ case SCASB:
+ inst.Args[0] = AL
+ case SCASW:
+ inst.Args[0] = AX
+ case SCASD:
+ inst.Args[0] = EAX
+ case SCASQ:
+ inst.Args[0] = RAX
+ }
+ usedAddrSize = true
+
+ case XLATB:
+ inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + BX - AX}
+ usedAddrSize = true
+ }
+
+ // If we used the address size annotation to construct the
+ // argument list, mark that prefix as implicit: it doesn't need
+ // to be shown when printing the instruction.
+ if haveMem || usedAddrSize {
+ if addrSizeIndex >= 0 {
+ inst.Prefix[addrSizeIndex] |= PrefixImplicit
+ }
+ }
+
+ // Similarly, if there's some memory operand, the segment
+ // will be shown there and doesn't need to be shown as an
+ // explicit prefix.
+ if haveMem {
+ if segIndex >= 0 {
+ inst.Prefix[segIndex] |= PrefixImplicit
+ }
+ }
+
+ // Branch prediction prefixes are overloaded segment prefixes,
+ // since segment prefixes don't make sense on conditional jumps.
+ // Rewrite the final instance as a prediction prefix.
+ // The set of instructions to which the prefixes apply (other than the
+ // Jcc conditional jumps) is not 100% clear from the manuals, but
+ // the disassemblers seem to agree about the LOOP and JCXZ instructions,
+ // so we'll follow along.
+ // TODO(rsc): Perhaps this instruction class should be derived from the CSV.
+ if isCondJmp[inst.Op] || isLoop[inst.Op] || inst.Op == JCXZ || inst.Op == JECXZ || inst.Op == JRCXZ {
+ PredictLoop:
+ for i := nprefix - 1; i >= 0; i-- {
+ p := inst.Prefix[i]
+ switch p & 0xFF {
+ case PrefixCS:
+ inst.Prefix[i] = PrefixPN
+ break PredictLoop
+ case PrefixDS:
+ inst.Prefix[i] = PrefixPT
+ break PredictLoop
+ }
+ }
+ }
+
+ // The BND prefix is part of the Intel Memory Protection Extensions (MPX).
+ // A REPN applied to certain control transfers is a BND prefix to bound
+ // the range of possible destinations. There's surprisingly little documentation
+ // about this, so we just do what libopcodes and xed agree on.
+ // In particular, it's unclear why a REPN applied to LOOP or JCXZ instructions
+ // does not turn into a BND.
+ // TODO(rsc): Perhaps this instruction class should be derived from the CSV.
+ if isCondJmp[inst.Op] || inst.Op == JMP || inst.Op == CALL || inst.Op == RET {
+ for i := nprefix - 1; i >= 0; i-- {
+ p := inst.Prefix[i]
+ if p&^PrefixIgnored == PrefixREPN {
+ inst.Prefix[i] = PrefixBND
+ break
+ }
+ }
+ }
+
+ // The LOCK prefix only applies to certain instructions, and then only
+ // to instances of the instruction with a memory destination.
+ // Other uses of LOCK are invalid and cause a processor exception,
+ // in contrast to the "just ignore it" spirit applied to all other prefixes.
+ // Mark invalid lock prefixes.
+ hasLock := false
+ if lockIndex >= 0 && inst.Prefix[lockIndex]&PrefixImplicit == 0 {
+ switch inst.Op {
+ // TODO(rsc): Perhaps this instruction class should be derived from the CSV.
+ case ADD, ADC, AND, BTC, BTR, BTS, CMPXCHG, CMPXCHG8B, CMPXCHG16B, DEC, INC, NEG, NOT, OR, SBB, SUB, XOR, XADD, XCHG:
+ if isMem(inst.Args[0]) {
+ hasLock = true
+ break
+ }
+ fallthrough
+ default:
+ inst.Prefix[lockIndex] |= PrefixInvalid
+ }
+ }
+
+ // In certain cases, all of which require a memory destination,
+ // the REPN and REP prefixes are interpreted as XACQUIRE and XRELEASE
+ // from the Intel Transactional Synchronization Extensions (TSX).
+ //
+ // The specific rules are:
+ // (1) Any instruction with a valid LOCK prefix can have XACQUIRE or XRELEASE.
+ // (2) Any XCHG, which always has an implicit LOCK, can have XACQUIRE or XRELEASE.
+ // (3) Any 0x88-, 0x89-, 0xC6-, or 0xC7-opcode MOV can have XRELEASE.
+ if isMem(inst.Args[0]) {
+ if inst.Op == XCHG {
+ hasLock = true
+ }
+
+ for i := len(inst.Prefix) - 1; i >= 0; i-- {
+ p := inst.Prefix[i] &^ PrefixIgnored
+ switch p {
+ case PrefixREPN:
+ if hasLock {
+ inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXACQUIRE
+ }
+
+ case PrefixREP:
+ if hasLock {
+ inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
+ }
+
+ if inst.Op == MOV {
+ op := (inst.Opcode >> 24) &^ 1
+ if op == 0x88 || op == 0xC6 {
+ inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
+ }
+ }
+ }
+ }
+ }
+
+ // If REP is used on a non-REP-able instruction, mark the prefix as ignored.
+ if repIndex >= 0 {
+ switch inst.Prefix[repIndex] {
+ case PrefixREP, PrefixREPN:
+ switch inst.Op {
+ // According to the manuals, the REP/REPE prefix applies to all of these,
+ // while the REPN applies only to some of them. However, both libopcodes
+ // and xed show both prefixes explicitly for all instructions, so we do the same.
+ // TODO(rsc): Perhaps this instruction class should be derived from the CSV.
+ case INSB, INSW, INSD,
+ MOVSB, MOVSW, MOVSD, MOVSQ,
+ OUTSB, OUTSW, OUTSD,
+ LODSB, LODSW, LODSD, LODSQ,
+ CMPSB, CMPSW, CMPSD, CMPSQ,
+ SCASB, SCASW, SCASD, SCASQ,
+ STOSB, STOSW, STOSD, STOSQ:
+ // ok
+ default:
+ inst.Prefix[repIndex] |= PrefixIgnored
+ }
+ }
+ }
+
+ // If a REX prefix was present, mark it implicit if all of its 1 bits were consumed.
+ if rexIndex >= 0 {
+ if rexUsed != 0 {
+ rexUsed |= PrefixREX
+ }
+ if rex&^rexUsed == 0 {
+ inst.Prefix[rexIndex] |= PrefixImplicit
+ }
+ }
+
+ inst.DataSize = dataMode
+ inst.AddrSize = addrMode
+ inst.Mode = mode
+ inst.Len = pos
+ return inst, nil
+}
+
+var errInternal = errors.New("internal error")
+
+// addr16 records the eight 16-bit addressing modes.
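+// The ModR/M rm field indexes the table; rm=2, for example,
+// selects the [BP+SI] form.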
+var addr16 = [8]Mem{
+ {Base: BX, Scale: 1, Index: SI},
+ {Base: BX, Scale: 1, Index: DI},
+ {Base: BP, Scale: 1, Index: SI},
+ {Base: BP, Scale: 1, Index: DI},
+ {Base: SI},
+ {Base: DI},
+ {Base: BP},
+ {Base: BX},
+}
+
+// baseRegForBits returns the base register for a given register size in bits.
+func baseRegForBits(bits int) Reg {
+ switch bits {
+ case 8:
+ return AL
+ case 16:
+ return AX
+ case 32:
+ return EAX
+ case 64:
+ return RAX
+ }
+ return 0
+}
+
+// baseReg records the base register for argument types that specify
+// a range of registers indexed by op, regop, or rm.
+var baseReg = [...]Reg{
+ xArgDR0dashDR7: DR0,
+ xArgMm1: M0,
+ xArgMm2: M0,
+ xArgMm2M64: M0,
+ xArgMm: M0,
+ xArgMmM32: M0,
+ xArgMmM64: M0,
+ xArgR16: AX,
+ xArgR16op: AX,
+ xArgR32: EAX,
+ xArgR32M16: EAX,
+ xArgR32M8: EAX,
+ xArgR32op: EAX,
+ xArgR64: RAX,
+ xArgR64M16: RAX,
+ xArgR64op: RAX,
+ xArgR8: AL,
+ xArgR8op: AL,
+ xArgRM16: AX,
+ xArgRM32: EAX,
+ xArgRM64: RAX,
+ xArgRM8: AL,
+ xArgRmf16: AX,
+ xArgRmf32: EAX,
+ xArgRmf64: RAX,
+ xArgSTi: F0,
+ xArgTR0dashTR7: TR0,
+ xArgXmm1: X0,
+ xArgXmm2: X0,
+ xArgXmm2M128: X0,
+ xArgXmm2M16: X0,
+ xArgXmm2M32: X0,
+ xArgXmm2M64: X0,
+ xArgXmm: X0,
+ xArgXmmM128: X0,
+ xArgXmmM32: X0,
+ xArgXmmM64: X0,
+}
+
+// prefixToSegment returns the segment register
+// corresponding to a particular segment prefix.
+func prefixToSegment(p Prefix) Reg {
+ switch p &^ PrefixImplicit {
+ case PrefixCS:
+ return CS
+ case PrefixDS:
+ return DS
+ case PrefixES:
+ return ES
+ case PrefixFS:
+ return FS
+ case PrefixGS:
+ return GS
+ case PrefixSS:
+ return SS
+ }
+ return 0
+}
+
+// fixedArg records the fixed arguments corresponding to the given bytecodes.
+var fixedArg = [...]Arg{
+ xArg1: Imm(1),
+ xArg3: Imm(3),
+ xArgAL: AL,
+ xArgAX: AX,
+ xArgDX: DX,
+ xArgEAX: EAX,
+ xArgEDX: EDX,
+ xArgRAX: RAX,
+ xArgRDX: RDX,
+ xArgCL: CL,
+ xArgCS: CS,
+ xArgDS: DS,
+ xArgES: ES,
+ xArgFS: FS,
+ xArgGS: GS,
+ xArgSS: SS,
+ xArgST: F0,
+ xArgXMM0: X0,
+}
+
+// memBytes records the size of the memory pointed at
+// by a memory argument of the given form.
+var memBytes = [...]int8{
+ xArgM128: 128 / 8,
+ xArgM16: 16 / 8,
+ xArgM16and16: (16 + 16) / 8,
+ xArgM16colon16: (16 + 16) / 8,
+ xArgM16colon32: (16 + 32) / 8,
+ xArgM16int: 16 / 8,
+ xArgM2byte: 2,
+ xArgM32: 32 / 8,
+ xArgM32and32: (32 + 32) / 8,
+ xArgM32fp: 32 / 8,
+ xArgM32int: 32 / 8,
+ xArgM64: 64 / 8,
+ xArgM64fp: 64 / 8,
+ xArgM64int: 64 / 8,
+ xArgMm2M64: 64 / 8,
+ xArgMmM32: 32 / 8,
+ xArgMmM64: 64 / 8,
+ xArgMoffs16: 16 / 8,
+ xArgMoffs32: 32 / 8,
+ xArgMoffs64: 64 / 8,
+ xArgMoffs8: 8 / 8,
+ xArgR32M16: 16 / 8,
+ xArgR32M8: 8 / 8,
+ xArgR64M16: 16 / 8,
+ xArgRM16: 16 / 8,
+ xArgRM32: 32 / 8,
+ xArgRM64: 64 / 8,
+ xArgRM8: 8 / 8,
+ xArgXmm2M128: 128 / 8,
+ xArgXmm2M16: 16 / 8,
+ xArgXmm2M32: 32 / 8,
+ xArgXmm2M64: 64 / 8,
+ xArgXmm: 128 / 8,
+ xArgXmmM128: 128 / 8,
+ xArgXmmM32: 32 / 8,
+ xArgXmmM64: 64 / 8,
+}
+
+// isCondJmp records the conditional jumps.
+var isCondJmp = [maxOp + 1]bool{
+ JA: true,
+ JAE: true,
+ JB: true,
+ JBE: true,
+ JE: true,
+ JG: true,
+ JGE: true,
+ JL: true,
+ JLE: true,
+ JNE: true,
+ JNO: true,
+ JNP: true,
+ JNS: true,
+ JO: true,
+ JP: true,
+ JS: true,
+}
+
+// isLoop records the loop operators.
+var isLoop = [maxOp + 1]bool{
+ LOOP: true,
+ LOOPE: true,
+ LOOPNE: true,
+ JECXZ: true,
+ JRCXZ: true,
+}