package charset import ( "unicode/utf8" ) func init() { registerClass("utf8", toUTF8, toUTF8) } type translateToUTF8 struct { scratch []byte } var errorBytes = []byte(string(utf8.RuneError)) const errorRuneLen = len(string(utf8.RuneError)) func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) { p.scratch = ensureCap(p.scratch, (len(data))*errorRuneLen) buf := p.scratch[:0] for i := 0; i < len(data); { // fast path for ASCII if b := data[i]; b < utf8.RuneSelf { buf = append(buf, b) i++ continue } _, size := utf8.DecodeRune(data[i:]) if size == 1 { if !eof && !utf8.FullRune(data) { // When DecodeRune has converted only a single // byte, we know there must be some kind of error // because we know the byte's not ASCII. // If we aren't at EOF, and it's an incomplete // rune encoding, then we return to process // the final bytes in a subsequent call. return i, buf, nil } buf = append(buf, errorBytes...) } else { buf = append(buf, data[i:i+size]...) } i += size } return len(data), buf, nil } func toUTF8(arg string) (Translator, error) { return new(translateToUTF8), nil }