// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package norm const MaxSegmentSize = maxByteBufferSize // An Iter iterates over a string or byte slice, while normalizing it // to a given Form. type Iter struct { rb reorderBuffer info runeInfo // first character saved from previous iteration next iterFunc // implementation of next depends on form p int // current position in input source outStart int // start of current segment in output buffer inStart int // start of current segment in input source maxp int // position in output buffer after which not to start a new segment maxseg int // for tracking an excess of combining characters tccc uint8 done bool } type iterFunc func(*Iter, []byte) int // SetInput initializes i to iterate over src after normalizing it to Form f. func (i *Iter) SetInput(f Form, src []byte) { i.rb.init(f, src) if i.rb.f.composing { i.next = nextComposed } else { i.next = nextDecomposed } i.p = 0 if i.done = len(src) == 0; !i.done { i.info = i.rb.f.info(i.rb.src, i.p) } } // SetInputString initializes i to iterate over src after normalizing it to Form f. func (i *Iter) SetInputString(f Form, src string) { i.rb.initString(f, src) if i.rb.f.composing { i.next = nextComposed } else { i.next = nextDecomposed } i.p = 0 if i.done = len(src) == 0; !i.done { i.info = i.rb.f.info(i.rb.src, i.p) } } // Pos returns the byte position at which the next call to Next will commence processing. func (i *Iter) Pos() int { return i.p } // Done returns true if there is no more input to process. func (i *Iter) Done() bool { return i.done } // Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the // largest boundary of i.input such that the result fits in buf. // It returns the number of bytes written to buf. // len(buf) should be at least MaxSegmentSize. // Done must be false before calling Next. func (i *Iter) Next(buf []byte) int { return i.next(i, buf) } func (i *Iter) initNext(outn, inStart int) { i.outStart = 0 i.inStart = inStart i.maxp = outn - MaxSegmentSize i.maxseg = MaxSegmentSize } // setStart resets the start of the new segment to the given position. // It returns true if there is not enough room for the new segment. func (i *Iter) setStart(outp, inp int) bool { if outp > i.maxp { return true } i.outStart = outp i.inStart = inp i.maxseg = outp + MaxSegmentSize return false } func min(a, b int) int { if a < b { return a } return b } // nextDecomposed is the implementation of Next for forms NFD and NFKD. func nextDecomposed(i *Iter, out []byte) int { var outp int i.initNext(len(out), i.p) doFast: inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart for { if sz := int(i.info.size); sz <= 1 { // ASCII or illegal byte. Either way, advance by 1. i.p++ outp++ max := min(i.rb.nsrc, len(out)-outp+i.p) if np := i.rb.src.skipASCII(i.p, max); np > i.p { outp += np - i.p i.p = np if i.p >= i.rb.nsrc { break } // ASCII may combine with consecutive runes. if i.setStart(outp-1, i.p-1) { i.p-- outp-- i.info.size = 1 break } } } else if d := i.info.decomposition(); d != nil { i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) p := outp + len(d) if p > i.maxseg && i.setStart(outp, i.p) { return outp } copy(out[outp:], d) outp = p i.p += sz inCopyStart, outCopyStart = i.p, outp } else if r := i.rb.src.hangul(i.p); r != 0 { i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) for { outp += decomposeHangul(out[outp:], r) i.p += hangulUTF8Size if r = i.rb.src.hangul(i.p); r == 0 { break } if i.setStart(outp, i.p) { return outp } } inCopyStart, outCopyStart = i.p, outp } else { p := outp + sz if p > i.maxseg && i.setStart(outp, i.p) { break } outp = p i.p += sz } if i.p >= i.rb.nsrc { break } prevCC := i.info.tccc i.info = i.rb.f.info(i.rb.src, i.p) if cc := i.info.ccc; cc == 0 { if i.setStart(outp, i.p) { break } } else if cc < prevCC { goto doNorm } } if inCopyStart != i.p { i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) } i.done = i.p >= i.rb.nsrc return outp doNorm: // Insert what we have decomposed so far in the reorderBuffer. // As we will only reorder, there will always be enough room. i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) if !i.rb.insertDecomposed(out[i.outStart:outp]) { // Start over to prevent decompositions from crossing segment boundaries. // This is a rare occurance. i.p = i.inStart i.info = i.rb.f.info(i.rb.src, i.p) } outp = i.outStart for { if !i.rb.insert(i.rb.src, i.p, i.info) { break } if i.p += int(i.info.size); i.p >= i.rb.nsrc { outp += i.rb.flushCopy(out[outp:]) i.done = true return outp } i.info = i.rb.f.info(i.rb.src, i.p) if i.info.ccc == 0 { break } } // new segment or too many combining characters: exit normalization if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { return outp } goto doFast } // nextComposed is the implementation of Next for forms NFC and NFKC. func nextComposed(i *Iter, out []byte) int { var outp int i.initNext(len(out), i.p) doFast: inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart var prevCC uint8 for { if !i.info.isYesC() { goto doNorm } if cc := i.info.ccc; cc == 0 { if i.setStart(outp, i.p) { break } } else if cc < prevCC { goto doNorm } prevCC = i.info.tccc sz := int(i.info.size) if sz == 0 { sz = 1 // illegal rune: copy byte-by-byte } p := outp + sz if p > i.maxseg && i.setStart(outp, i.p) { break } outp = p i.p += sz max := min(i.rb.nsrc, len(out)-outp+i.p) if np := i.rb.src.skipASCII(i.p, max); np > i.p { outp += np - i.p i.p = np if i.p >= i.rb.nsrc { break } // ASCII may combine with consecutive runes. if i.setStart(outp-1, i.p-1) { i.p-- outp-- i.info = runeInfo{size: 1} break } } if i.p >= i.rb.nsrc { break } i.info = i.rb.f.info(i.rb.src, i.p) } if inCopyStart != i.p { i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) } i.done = i.p >= i.rb.nsrc return outp doNorm: i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart) outp, i.p = i.outStart, i.inStart i.info = i.rb.f.info(i.rb.src, i.p) for { if !i.rb.insert(i.rb.src, i.p, i.info) { break } if i.p += int(i.info.size); i.p >= i.rb.nsrc { i.rb.compose() outp += i.rb.flushCopy(out[outp:]) i.done = true return outp } i.info = i.rb.f.info(i.rb.src, i.p) if i.info.boundaryBefore() { break } } i.rb.compose() if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { return outp } goto doFast }