matterbridge/vendor/github.com/pelletier/go-toml/v2/internal/characters/utf8.go

package characters

import (
	"unicode/utf8"
)

// utf8Err describes where validation stopped: the byte offset of the offending
// character and how many bytes it spans. The zero value means "no error".
type utf8Err struct {
	Index int // byte offset of the invalid character
	Size  int // number of bytes covered by the invalid character; 0 when valid
}

// Zero reports whether e carries no error, i.e. its Size is 0.
func (e utf8Err) Zero() bool {
	return e.Size == 0
}

// Utf8TomlValidAlreadyEscaped verifies that p is only made of valid UTF-8
// characters allowed by the TOML spec:
//
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters other than tab (U+0000
// to U+0008, U+000A to U+001F, U+007F).
//
// The implementation is a copy of the Go 1.17 utf8.Valid implementation,
// tweaked to exit early when a character is not allowed.
//
// The returned utf8Err is Zero() if the input is valid; otherwise it holds the
// byte index (relative to the start of the original p) and the size of the
// invalid character.
//
// How each class of byte is treated:
//
//	quotation mark => already checked by the caller
//	backslash      => already checked by the caller
//	0x00 - 0x08    => invalid
//	0x09           => tab, ok
//	0x0A - 0x1F    => invalid
//	0x7F           => invalid
func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
	// consumed counts the bytes already sliced off the front of p, so that
	// reported indexes stay relative to the caller's slice.
	consumed := 0

	// Fast path: examine 8 bytes at a time for as long as they are all ASCII.
	for len(p) >= 8 {
		// Two 32-bit little-endian loads keep the same code usable on 32 and
		// 64 bit platforms; the compiler can turn each into a single load on
		// many of them. See test/codegen/memcombine.go.
		low := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		high := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (low|high)&0x80808080 != 0 {
			// At least one byte is >= RuneSelf; let the slow path handle it.
			break
		}

		for k, b := range p[:8] {
			if InvalidAscii(b) {
				return utf8Err{Index: consumed + k, Size: 1}
			}
		}

		consumed += 8
		p = p[8:]
	}

	// Slow path: decode one character at a time.
	total := len(p)
	pos := 0
	for pos < total {
		lead := p[pos]

		// Single-byte (ASCII) character.
		if lead < utf8.RuneSelf {
			if InvalidAscii(lead) {
				return utf8Err{Index: consumed + pos, Size: 1}
			}
			pos++
			continue
		}

		info := first[lead]
		if info == xx {
			// Illegal starter byte.
			return utf8Err{Index: consumed + pos, Size: 1}
		}

		width := int(info & 7)
		if pos+width > total {
			// The sequence is cut short by the end of the input: report
			// whatever bytes remain.
			return utf8Err{Index: consumed + pos, Size: total - pos}
		}

		// The second byte has a lead-specific accepted range.
		bounds := acceptRanges[info>>4]
		if b := p[pos+1]; b < bounds.lo || bounds.hi < b {
			return utf8Err{Index: consumed + pos, Size: 2}
		}
		// Any further bytes must be ordinary continuation bytes.
		if width != 2 {
			if b := p[pos+2]; b < locb || hicb < b {
				return utf8Err{Index: consumed + pos, Size: 3}
			}
			if width != 3 {
				if b := p[pos+3]; b < locb || hicb < b {
					return utf8Err{Index: consumed + pos, Size: 4}
				}
			}
		}

		pos += width
	}

	return utf8Err{}
}

// Utf8ValidNext returns the size in bytes of the character starting at p[0]
// if it is valid, and 0 otherwise.
//
// A single-byte character is valid unless InvalidAscii rejects it. A
// multi-byte character is valid when its lead byte is a legal starter, the
// whole sequence fits in p, and every following byte lies in the range
// accepted for its position.
//
// An empty p has no next character, so 0 is returned for it as well.
func Utf8ValidNext(p []byte) int {
	if len(p) == 0 {
		// Guard the p[0] access below: indexing an empty slice would panic.
		return 0
	}

	lead := p[0]
	if lead < utf8.RuneSelf {
		if InvalidAscii(lead) {
			return 0
		}
		return 1
	}

	info := first[lead]
	if info == xx {
		// Illegal starter byte.
		return 0
	}

	size := int(info & 7)
	if size > len(p) {
		// Short or invalid: the sequence runs past the end of p.
		return 0
	}

	// The second byte has a lead-specific accepted range.
	accept := acceptRanges[info>>4]
	if b := p[1]; b < accept.lo || accept.hi < b {
		return 0
	}
	// Any further bytes must be ordinary continuation bytes.
	if size != 2 {
		if b := p[2]; b < locb || hicb < b {
			return 0
		}
		if size != 3 {
			if b := p[3]; b < locb || hicb < b {
				return 0
			}
		}
	}

	return size
}

// acceptRange is the inclusive interval [lo, hi] of values permitted for the
// second byte of a UTF-8 sequence.
type acceptRange struct {
	lo, hi uint8 // lowest and highest accepted value for the second byte
}

// acceptRanges maps the high nibble of a first-table entry to the interval
// accepted for the second byte of the sequence. The array has 16 entries so
// that indexing with a 4-bit value needs no bounds check; entries 5-15 are
// left as the zero value.
var acceptRanges = [16]acceptRange{
	0: {lo: locb, hi: hicb}, // s1, s3, s6: any continuation byte
	1: {lo: 0xA0, hi: hicb}, // s2 (lead byte 0xE0)
	2: {lo: locb, hi: 0x9F}, // s4 (lead byte 0xED)
	3: {lo: 0x90, hi: hicb}, // s5 (lead byte 0xF0)
	4: {lo: locb, hi: 0x8F}, // s7 (lead byte 0xF4)
}

// first is information about the first byte in a UTF-8 sequence, indexed by
// the byte value.
//
// Each entry is one of the xx/as/s1..s7 constants: the low bits (entry & 7)
// give the length of the sequence in bytes and the high nibble (entry >> 4)
// is the index into acceptRanges used to check the second byte. The special
// entries are as (ASCII, 0x00-0x7F) and xx (a byte that cannot start a
// sequence: 0x80-0xBF, 0xC0, 0xC1 and 0xF5-0xFF).
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

// Bounds of an ordinary continuation byte, i.e. the bit pattern 10xxxxxx.
const (
	locb = 0x80 // lowest continuation byte  (0b10000000)
	hicb = 0xBF // highest continuation byte (0b10111111)
)

// Entry values for the first table. The names are kept short so the table
// lines up nicely. The first nibble is an index into acceptRanges, or F for
// the special one-byte cases. The second nibble is the rune length, or the
// status for the special one-byte cases.
const (
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`package characters`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00
			`import (`
			`"unicode/utf8"`
			`)`

			`type utf8Err struct {`
			`Index int`
			`Size int`
			`}`

			`func (u utf8Err) Zero() bool {`
			`return u.Size == 0`
			`}`

			`// Verified that a given string is only made of valid UTF-8 characters allowed`
			`// by the TOML spec:`
			`//`
			`// Any Unicode character may be used except those that must be escaped:`
			`// quotation mark, backslash, and the control characters other than tab (U+0000`
			`// to U+0008, U+000A to U+001F, U+007F).`
			`//`
			`// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early`
			`// when a character is not allowed.`
			`//`
			`// The returned utf8Err is Zero() if the string is valid, or contains the byte`
			`// index and size of the invalid character.`
			`//`
			`// quotation mark => already checked`
			`// backslash => already checked`
			`// 0-0x8 => invalid`
			`// 0x9 => tab, ok`
			`// 0xA - 0x1F => invalid`
			`// 0x7F => invalid`
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00			`// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.`
			`offset := 0`
			`for len(p) >= 8 {`
			`// Combining two 32 bit loads allows the same code to be used`
			`// for 32 and 64 bit platforms.`
			`// The compiler can generate a 32bit load for first32 and second32`
			`// on many platforms. See test/codegen/memcombine.go.`
			`first32 := uint32(p[0]) \| uint32(p[1])<<8 \| uint32(p[2])<<16 \| uint32(p[3])<<24`
			`second32 := uint32(p[4]) \| uint32(p[5])<<8 \| uint32(p[6])<<16 \| uint32(p[7])<<24`
			`if (first32\|second32)&0x80808080 != 0 {`
			`// Found a non ASCII byte (>= RuneSelf).`
			`break`
			`}`

			`for i, b := range p[:8] {`
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`if InvalidAscii(b) {`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00			`err.Index = offset + i`
			`err.Size = 1`
			`return`
			`}`
			`}`

			`p = p[8:]`
			`offset += 8`
			`}`
			`n := len(p)`
			`for i := 0; i < n; {`
			`pi := p[i]`
			`if pi < utf8.RuneSelf {`
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`if InvalidAscii(pi) {`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00			`err.Index = offset + i`
			`err.Size = 1`
			`return`
			`}`
			`i++`
			`continue`
			`}`
			`x := first[pi]`
			`if x == xx {`
			`// Illegal starter byte.`
			`err.Index = offset + i`
			`err.Size = 1`
			`return`
			`}`
			`size := int(x & 7)`
			`if i+size > n {`
			`// Short or invalid.`
			`err.Index = offset + i`
			`err.Size = n - i`
			`return`
			`}`
			`accept := acceptRanges[x>>4]`
			`if c := p[i+1]; c < accept.lo \|\| accept.hi < c {`
			`err.Index = offset + i`
			`err.Size = 2`
			`return`
			`} else if size == 2 {`
			`} else if c := p[i+2]; c < locb \|\| hicb < c {`
			`err.Index = offset + i`
			`err.Size = 3`
			`return`
			`} else if size == 3 {`
			`} else if c := p[i+3]; c < locb \|\| hicb < c {`
			`err.Index = offset + i`
			`err.Size = 4`
			`return`
			`}`
			`i += size`
			`}`
			`return`
			`}`

			`// Return the size of the next rune if valid, 0 otherwise.`
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`func Utf8ValidNext(p []byte) int {`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00			`c := p[0]`

			`if c < utf8.RuneSelf {`
Update dependencies (#1951) 2023-01-28 21:57:53 +00:00			`if InvalidAscii(c) {`
Update dependencies (#1813) 2022-04-25 21:50:10 +00:00			`return 0`
			`}`
			`return 1`
			`}`

			`x := first[c]`
			`if x == xx {`
			`// Illegal starter byte.`
			`return 0`
			`}`
			`size := int(x & 7)`
			`if size > len(p) {`
			`// Short or invalid.`
			`return 0`
			`}`
			`accept := acceptRanges[x>>4]`
			`if c := p[1]; c < accept.lo \|\| accept.hi < c {`
			`return 0`
			`} else if size == 2 {`
			`} else if c := p[2]; c < locb \|\| hicb < c {`
			`return 0`
			`} else if size == 3 {`
			`} else if c := p[3]; c < locb \|\| hicb < c {`
			`return 0`
			`}`

			`return size`
			`}`

			`// acceptRange gives the range of valid values for the second byte in a UTF-8`
			`// sequence.`
			`type acceptRange struct {`
			`lo uint8 // lowest value for second byte.`
			`hi uint8 // highest value for second byte.`
			`}`

			`// acceptRanges has size 16 to avoid bounds checks in the code that uses it.`
			`var acceptRanges = [16]acceptRange{`
			`0: {locb, hicb},`
			`1: {0xA0, hicb},`
			`2: {locb, 0x9F},`
			`3: {0x90, hicb},`
			`4: {locb, 0x8F},`
			`}`

			`// first is information about the first byte in a UTF-8 sequence.`
			`var first = [256]uint8{`
			`// 1 2 3 4 5 6 7 8 9 A B C D E F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F`
			`as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F`
			`// 1 2 3 4 5 6 7 8 9 A B C D E F`
			`xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F`
			`xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F`
			`xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF`
			`xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF`
			`xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF`
			`s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF`
			`s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF`
			`s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF`
			`}`

			`const (`
			`// The default lowest and highest continuation byte.`
			`locb = 0b10000000`
			`hicb = 0b10111111`

			`// These names of these constants are chosen to give nice alignment in the`
			`// table below. The first nibble is an index into acceptRanges or F for`
			`// special one-byte cases. The second nibble is the Rune length or the`
			`// Status for the special one-byte case.`
			`xx = 0xF1 // invalid: size 1`
			`as = 0xF0 // ASCII: size 1`
			`s1 = 0x02 // accept 0, size 2`
			`s2 = 0x13 // accept 1, size 3`
			`s3 = 0x03 // accept 0, size 3`
			`s4 = 0x23 // accept 2, size 3`
			`s5 = 0x34 // accept 3, size 4`
			`s6 = 0x04 // accept 0, size 4`
			`s7 = 0x44 // accept 4, size 4`
			`)`