4
0
mirror of https://github.com/cwinfo/matterbridge.git synced 2025-06-27 19:19:24 +00:00

Add go-charset and chardet to vendor

This commit is contained in:
Wim
2017-07-07 23:34:05 +02:00
parent 2338c69d40
commit a0938d9386
47 changed files with 3974 additions and 0 deletions

View File

@ -0,0 +1,65 @@
package charset
import (
"bytes"
"fmt"
"unicode/utf8"
)
// init registers the plain US-ASCII translator class under the name
// "ascii" with this package's local charset factory (registerClass).
func init() {
	registerClass("ascii", fromASCII, toASCII)
}
// errorByte is substituted for any byte that cannot be represented
// when translating to US-ASCII.
const errorByte = '?'

// translateFromASCII translates ASCII input to UTF-8. The boolean
// records strictness: strict mode rejects undefined code points,
// lax mode substitutes utf8.RuneError.
type translateFromASCII bool

// codePointError describes a code point that is undefined in a
// character set, along with where in the input it was found.
type codePointError struct {
	i       int    // byte index of the offending code point
	cp      rune   // the offending code point
	charset string // name of the character set it is undefined in
}

// Error implements the error interface.
// BUG FIX: the format string used %n, which is not a valid Go fmt
// verb (it would render as "%!n(int=...)"); %d is correct for
// integer values.
func (e *codePointError) Error() string {
	return fmt.Sprintf("Parse error at index %d: Code point %d is undefined in %s", e.i, e.cp, e.charset)
}
// Translate implements Translator for US-ASCII input. Bytes in
// (0, 128) are copied through unchanged; anything else (including
// NUL) is an error in strict mode and becomes utf8.RuneError
// otherwise. On success all of data is reported consumed.
func (strict translateFromASCII) Translate(data []byte, eof bool) (int, []byte, error) {
	buf := bytes.NewBuffer(make([]byte, 0, len(data)))
	for i, c := range data {
		if c > 0 && c < 128 {
			buf.WriteByte(c)
			if c < 32 && c != 10 && c != 13 && c != 9 {
				// badly formed
				// NOTE(review): control characters other than \n, \r, \t
				// are detected here but nothing is done — the byte has
				// already been written above. Presumably strict mode was
				// meant to reject these; confirm against upstream.
			}
		} else {
			if strict {
				return 0, nil, &codePointError{i, rune(c), "US-ASCII"}
			}
			buf.WriteRune(utf8.RuneError)
		}
	}
	return len(data), buf.Bytes(), nil
}
// translateToASCII converts an arbitrary byte stream to US-ASCII.
// Bytes outside the range (0, 128) are replaced with errorByte.
type translateToASCII bool

// Translate implements Translator. All input is consumed; each byte
// is either copied through (ASCII) or replaced by errorByte.
func (strict translateToASCII) Translate(data []byte, eof bool) (int, []byte, error) {
	out := bytes.NewBuffer(make([]byte, 0, len(data)))
	for _, b := range data {
		switch {
		case b > 0 && b < 128:
			out.WriteByte(b)
		default:
			out.WriteByte(errorByte)
		}
	}
	return len(data), out.Bytes(), nil
}
// fromASCII returns a Translator converting US-ASCII to UTF-8.
// The class argument is unused.
func fromASCII(arg string) (Translator, error) {
	var tr translateFromASCII
	return &tr, nil
}

// toASCII returns a Translator converting UTF-8 to US-ASCII.
// The class argument is unused.
func toASCII(arg string) (Translator, error) {
	var tr translateToASCII
	return &tr, nil
}

View File

@ -0,0 +1,88 @@
package charset
import (
"fmt"
"unicode/utf8"
)
// init registers the Big5 decoder; there is no encoder (to is nil).
func init() {
	registerClass("big5", fromBig5, nil)
}

// Big5 consists of 89 fonts of 157 chars each
const (
	big5Max  = 13973      // total number of entries in the mapping table
	big5Font = 157        // characters per font (row)
	big5Data = "big5.dat" // name of the external mapping data file
)

// translateFromBig5 holds the decoder state for Big5 input.
type translateFromBig5 struct {
	font    int    // pending lead byte, or -1 when not inside a double-byte char
	scratch []byte // reusable output buffer
	big5map []rune // big5 index -> rune; -1 marks an undefined entry
}
// Translate implements Translator for Big5 input. Lead bytes
// (>= 0xa1) are remembered across calls in p.font so a double-byte
// character split across two calls is decoded correctly; other idle
// bytes are emitted directly (0x1a is mapped to '\n'). Undefined or
// malformed sequences decode to utf8.RuneError.
func (p *translateFromBig5) Translate(data []byte, eof bool) (int, []byte, error) {
	p.scratch = p.scratch[:0]
	n := 0
	for len(data) > 0 {
		c := int(data[0])
		data = data[1:]
		n++
		if p.font == -1 {
			// idle state
			if c >= 0xa1 {
				p.font = c
				continue
			}
			if c == 26 {
				c = '\n'
			}
			// BUG FIX: single-byte characters were consumed but never
			// emitted, so all ASCII text in a Big5 stream was dropped.
			p.scratch = appendRune(p.scratch, rune(c))
			continue
		}
		f := p.font
		p.font = -1
		r := utf8.RuneError
		switch {
		case c >= 64 && c <= 126:
			c -= 64
		case c >= 161 && c <= 254:
			c = c - 161 + 63
		default:
			// bad big5 char: force the table lookup below to be skipped.
			f = 255
		}
		if f <= 254 {
			f -= 161
			ix := f*big5Font + c
			if ix < len(p.big5map) {
				r = p.big5map[ix]
			}
			if r == -1 {
				r = utf8.RuneError
			}
		}
		p.scratch = appendRune(p.scratch, r)
	}
	return n, p.scratch, nil
}
// big5Key is the cache key type for the shared Big5 mapping table.
type big5Key bool

// fromBig5 returns a Translator from Big5 to UTF-8. The mapping
// table is loaded from big5Data once and cached for reuse.
func fromBig5(arg string) (Translator, error) {
	big5map, err := cache(big5Key(false), func() (interface{}, error) {
		data, err := readFile(big5Data)
		if err != nil {
			return nil, fmt.Errorf("charset: cannot open big5 data file: %v", err)
		}
		big5map := []rune(string(data))
		if len(big5map) != big5Max {
			return nil, fmt.Errorf("charset: corrupt big5 data")
		}
		return big5map, nil
	})
	if err != nil {
		return nil, err
	}
	// font starts at -1: not inside a double-byte character.
	return &translateFromBig5{big5map: big5map.([]rune), font: -1}, nil
}

View File

@ -0,0 +1,301 @@
// The charset package implements translation between character sets.
// It uses Unicode as the intermediate representation.
// Because it can be large, the character set data is separated
// from the charset package. It can be embedded in the Go
// executable by importing the data package:
//
// import _ "github.com/paulrosania/go-charset/data"
//
// It can also be made available in a data directory (by setting CharsetDir).
package charset
import (
"io"
"strings"
"unicode/utf8"
)
// Charset holds information about a given character set.
type Charset struct {
	Name    string   // Canonical name of character set.
	Aliases []string // Known aliases.
	Desc    string   // Description.
	NoFrom  bool     // Not possible to translate from this charset.
	NoTo    bool     // Not possible to translate to this charset.
}

// Translator represents a character set converter.
// The Translate method translates the given data,
// and returns the number of bytes of data consumed,
// a slice containing the converted data (which may be
// overwritten on the next call to Translate), and any
// conversion error. If eof is true, the data represents
// the final bytes of the input.
type Translator interface {
	Translate(data []byte, eof bool) (n int, cdata []byte, err error)
}

// A Factory can be used to make character set translators.
type Factory interface {
	// TranslatorFrom creates a translator that will translate from the named character
	// set to UTF-8.
	TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to.

	// TranslatorTo creates a translator that will translate from UTF-8 to the named character set.
	TranslatorTo(name string) (Translator, error) // Create a Translator To this character set.

	// Names returns all the character set names accessible through the factory.
	Names() []string

	// Info returns information on the named character set. It returns nil if the
	// factory doesn't recognise the given name.
	Info(name string) *Charset
}
// factories holds every registered Factory; the built-in data-file
// based localFactory is always present.
var factories = []Factory{localFactory{}}

// Register registers a new Factory which will be consulted when NewReader
// or NewWriter needs a character set translator for a given name.
func Register(factory Factory) {
	factories = append(factories, factory)
}
// NewReader returns a new Reader that translates from the named
// character set to UTF-8 as it reads r.
func NewReader(charset string, r io.Reader) (io.Reader, error) {
	translator, err := TranslatorFrom(charset)
	if err != nil {
		return nil, err
	}
	reader := NewTranslatingReader(r, translator)
	return reader, nil
}

// NewWriter returns a new WriteCloser writing to w. It converts writes
// of UTF-8 text into writes on w of text in the named character set.
// The Close is necessary to flush any remaining partially translated
// characters to the output.
func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) {
	translator, err := TranslatorTo(charset)
	if err != nil {
		return nil, err
	}
	writer := NewTranslatingWriter(w, translator)
	return writer, nil
}
// Info returns information about a character set, or nil
// if the character set is not found. Factories are consulted in
// registration order; the first match wins.
func Info(name string) *Charset {
	for _, factory := range factories {
		info := factory.Info(name)
		if info != nil {
			return info
		}
	}
	return nil
}

// Names returns the canonical names of all supported character sets, in alphabetical order.
func Names() []string {
	// TODO eliminate duplicates
	var names []string
	for _, factory := range factories {
		for _, n := range factory.Names() {
			names = append(names, n)
		}
	}
	return names
}
// TranslatorFrom returns a translator that will translate from
// the named character set to UTF-8. Factories are tried in
// registration order until one succeeds.
func TranslatorFrom(charset string) (Translator, error) {
	var (
		tr  Translator
		err error
	)
	for _, factory := range factories {
		if tr, err = factory.TranslatorFrom(charset); err == nil {
			break
		}
	}
	if tr == nil {
		// No factory produced a translator; report the last error seen.
		return nil, err
	}
	return tr, nil
}

// TranslatorTo returns a translator that will translate from UTF-8
// to the named character set. Factories are tried in registration
// order until one succeeds.
func TranslatorTo(charset string) (Translator, error) {
	var (
		tr  Translator
		err error
	)
	for _, factory := range factories {
		if tr, err = factory.TranslatorTo(charset); err == nil {
			break
		}
	}
	if tr == nil {
		// No factory produced a translator; report the last error seen.
		return nil, err
	}
	return tr, nil
}
// normalizedChar lower-cases Roman capitals and maps '_' to '-',
// leaving every other character unchanged.
func normalizedChar(c rune) rune {
	if c >= 'A' && c <= 'Z' {
		return c - 'A' + 'a'
	}
	if c == '_' {
		return '-'
	}
	return c
}

// NormalizedName returns s with all Roman capitals
// mapped to lower case, and '_' mapped to '-'.
func NormalizedName(s string) string {
	return strings.Map(normalizedChar, s)
}
// translatingWriter applies a Translator to everything written
// through it before passing the result to w.
type translatingWriter struct {
	w   io.Writer
	tr  Translator
	buf []byte // unconsumed data from writer.
}

// NewTranslatingWriter returns a new WriteCloser writing to w.
// It passes the written bytes through the given Translator.
func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser {
	return &translatingWriter{w: w, tr: tr}
}
// Write implements io.Writer. Any bytes left unconsumed by a previous
// call are prepended to data before translation; whatever the
// Translator leaves unconsumed this time is buffered for the next
// Write or Close. On success the full len(data) is reported written.
func (w *translatingWriter) Write(data []byte) (rn int, rerr error) {
	wdata := data
	if len(w.buf) > 0 {
		w.buf = append(w.buf, data...)
		wdata = w.buf
	}
	n, cdata, err := w.tr.Translate(wdata, false)
	if err != nil {
		// TODO
		// NOTE(review): translation errors are silently dropped here;
		// the converted prefix is still written below.
	}
	if n > 0 {
		_, err = w.w.Write(cdata)
		if err != nil {
			return 0, err
		}
	}
	w.buf = w.buf[:0]
	if n < len(wdata) {
		// Keep the unconsumed tail for the next Write/Close.
		w.buf = append(w.buf, wdata[n:]...)
	}
	return len(data), nil
}
// Close flushes the remaining buffered input through the Translator
// with eof set, writing the results until either the buffer is empty
// or the Translator stops producing output. It must be called to emit
// partially translated characters.
func (p *translatingWriter) Close() error {
	for {
		n, data, err := p.tr.Translate(p.buf, true)
		p.buf = p.buf[n:]
		if err != nil {
			// TODO
			// NOTE(review): translation errors at EOF are ignored.
		}
		// If the Translator produces no data
		// at EOF, then assume that it never will.
		if len(data) == 0 {
			break
		}
		n, err = p.w.Write(data)
		if err != nil {
			return err
		}
		if n < len(data) {
			return io.ErrShortWrite
		}
		if len(p.buf) == 0 {
			break
		}
	}
	return nil
}
// translatingReader applies a Translator to everything read from r.
type translatingReader struct {
	r     io.Reader
	tr    Translator
	cdata []byte // unconsumed data from converter.
	rdata []byte // unconverted data from reader.
	err   error  // final error from reader.
}

// NewTranslatingReader returns a new Reader that
// translates data using the given Translator as it reads r.
func NewTranslatingReader(r io.Reader, tr Translator) io.Reader {
	return &translatingReader{r: r, tr: tr}
}
// Read implements io.Reader. It alternates between draining already
// converted bytes (r.cdata) and reading+converting more input.
// r.err records the underlying reader's terminal error (e.g. io.EOF),
// which is returned only after all converted data has been delivered.
func (r *translatingReader) Read(buf []byte) (int, error) {
	for {
		if len(r.cdata) > 0 {
			n := copy(buf, r.cdata)
			r.cdata = r.cdata[n:]
			return n, nil
		}
		if r.err == nil {
			// Grow rdata and read into its spare capacity.
			r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf))
			n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)])
			// Guard against non-compliant Readers.
			if n == 0 && err == nil {
				err = io.EOF
			}
			r.rdata = r.rdata[0 : len(r.rdata)+n]
			r.err = err
		} else if len(r.rdata) == 0 {
			break
		}
		nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil)
		if cvterr != nil {
			// TODO
			// NOTE(review): conversion errors are silently ignored.
		}
		r.cdata = cdata
		// Ensure that we consume all bytes at eof
		// if the converter refuses them.
		if nc == 0 && r.err != nil {
			nc = len(r.rdata)
		}
		// Copy unconsumed data to the start of the rdata buffer.
		r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])]
	}
	return 0, r.err
}
// ensureCap returns s with a capacity of at least n bytes.
// If cap(s) < n, a copy of s with the required capacity is returned;
// otherwise s itself is returned unchanged.
func ensureCap(s []byte, n int) []byte {
	if n <= cap(s) {
		return s
	}
	// Growth policy adapted from the runtime's append: double while
	// small, then grow by 25% until the target is reached.
	m := cap(s)
	if m == 0 {
		m = n
	}
	for m < n {
		if m < 1024 {
			m *= 2
		} else {
			m += m / 4
		}
	}
	t := make([]byte, len(s), m)
	copy(t, s)
	return t
}

// appendRune appends the UTF-8 encoding of r to buf and returns the
// extended slice.
func appendRune(buf []byte, r rune) []byte {
	n := len(buf)
	buf = ensureCap(buf, n+utf8.UTFMax)
	return buf[:n+utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)]
}

View File

@ -0,0 +1,133 @@
package charset
import (
"fmt"
"unicode/utf8"
)
// init registers the generic single-byte code-page class.
func init() {
	registerClass("cp", fromCodePage, toCodePage)
}

// translateFromCodePage converts single-byte code-page data to UTF-8
// via a 256-entry byte-to-rune table.
type translateFromCodePage struct {
	byte2rune *[256]rune // mapping for every possible input byte
	scratch   []byte     // reusable output buffer
}

// Cache key types distinguishing the from/to direction for a given
// code-page data file.
type cpKeyFrom string
type cpKeyTo string
// Translate implements Translator, mapping each input byte through
// byte2rune. All input is always consumed; output is UTF-8.
func (p *translateFromCodePage) Translate(data []byte, eof bool) (int, []byte, error) {
	// Worst case: every byte encodes to utf8.UTFMax output bytes.
	p.scratch = ensureCap(p.scratch, len(data)*utf8.UTFMax)[:0]
	buf := p.scratch
	for _, x := range data {
		r := p.byte2rune[x]
		if r < utf8.RuneSelf {
			// Fast path: single-byte (ASCII) rune.
			buf = append(buf, byte(r))
			continue
		}
		// Encode directly into the spare capacity reserved above.
		size := utf8.EncodeRune(buf[len(buf):cap(buf)], r)
		buf = buf[0 : len(buf)+size]
	}
	return len(data), buf, nil
}
// toCodePageInfo holds the inverse (rune -> byte) mapping for one
// code page.
type toCodePageInfo struct {
	rune2byte map[rune]byte
	// same gives the number of runes at start of code page that map exactly to
	// unicode.
	same rune
}

// translateToCodePage converts UTF-8 to a single-byte code page.
type translateToCodePage struct {
	toCodePageInfo
	scratch []byte // reusable output buffer
}

// Translate implements Translator. Runes below p.same are emitted as
// their own byte value; others are looked up in rune2byte with '?'
// substituted when unmappable. An incomplete trailing UTF-8 sequence
// is left unconsumed unless eof is set.
func (p *translateToCodePage) Translate(data []byte, eof bool) (int, []byte, error) {
	p.scratch = ensureCap(p.scratch, len(data))
	buf := p.scratch[:0]
	for i := 0; i < len(data); {
		r := rune(data[i])
		size := 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(data[i:])
			if size == 1 && !eof && !utf8.FullRune(data[i:]) {
				// Partial rune at end of input: wait for more data.
				return i, buf, nil
			}
		}
		var b byte
		if r < p.same {
			// Identity-mapped prefix: no table lookup needed.
			b = byte(r)
		} else {
			var ok bool
			b, ok = p.rune2byte[r]
			if !ok {
				b = '?'
			}
		}
		buf = append(buf, b)
		i += size
	}
	return len(data), buf, nil
}
// fromCodePage returns a Translator from the code page whose mapping
// file is named by arg. The 256-rune table is loaded once and cached.
func fromCodePage(arg string) (Translator, error) {
	runes, err := cache(cpKeyFrom(arg), func() (interface{}, error) {
		data, err := readFile(arg)
		if err != nil {
			return nil, err
		}
		runes := []rune(string(data))
		if len(runes) != 256 {
			return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, len(runes))
		}
		r := new([256]rune)
		copy(r[:], runes)
		return r, nil
	})
	if err != nil {
		return nil, err
	}
	return &translateFromCodePage{byte2rune: runes.(*[256]rune)}, nil
}
// toCodePage returns a Translator to the code page whose mapping file
// is named by arg. It builds the inverse mapping and records in
// `same` the length of the initial run where the code page and
// unicode agree, so those runes need no map lookup.
func toCodePage(arg string) (Translator, error) {
	m, err := cache(cpKeyTo(arg), func() (interface{}, error) {
		data, err := readFile(arg)
		if err != nil {
			return nil, err
		}
		info := toCodePageInfo{
			rune2byte: make(map[rune]byte),
			same:      256,
		}
		atStart := true
		i := rune(0)
		for _, r := range string(data) {
			if atStart {
				if r == i {
					// Still inside the identity-mapped prefix.
					i++
					continue
				}
				info.same = i
				atStart = false
			}
			info.rune2byte[r] = byte(i)
			i++
		}
		// TODO fix tables
		// fmt.Printf("%s, same = %d\n", arg, info.same)
		if i != 256 {
			return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, i)
		}
		return info, nil
	})
	if err != nil {
		return nil, err
	}
	return &translateToCodePage{toCodePageInfo: m.(toCodePageInfo)}, nil
}

View File

@ -0,0 +1,195 @@
package charset
import (
"fmt"
"unicode/utf8"
)
// init registers the cp932/shift-jis decoder; encoding is not
// supported (to is nil).
func init() {
	registerClass("cp932", fromCP932, nil)
}
// encoding details
// (Traditional) Shift-JIS
//
// 00..1f control characters
// 20 space
// 21..7f JIS X 0201:1976/1997 roman (see notes)
// 80 undefined
// 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
// a0 undefined
// a1..df JIS X 0201:1976/1997 katakana
// e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
// eb..ff undefined
//
// CP932 (windows-31J)
//
// this encoding scheme extends Shift-JIS in the following way
//
// eb..ec undefined (marked as lead bytes - see notes below)
// ed..ee lead byte of NEC-selected IBM extended characters
// ef undefined (marked as lead byte - see notes below)
// f0..f9 lead byte of User defined GAIJI (see note below)
// fa..fc lead byte of IBM extended characters
// fd..ff undefined
//
//
// Notes
//
// JISX 0201:1976/1997 roman
// this is the same as ASCII but with 0x5c (ASCII code for '\')
// representing the Yen currency symbol '¥' (U+00a5)
// This mapping is contentious; some conversion packages implement it,
// others do not.
// The mapping files from The Unicode Consortium show cp932 mapping
// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
// symbol (¥) and 0x7e ('~') to overline (¯)
//
// CP932 double-byte character codes:
//
// eb-ec, ef, f0-f9:
// Marked as DBCS LEAD BYTEs in the unicode mapping data
// obtained from:
// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
//
// but there are no defined mappings for codes in this range.
// It is not clear whether or not an implementation should
// consume one or two bytes before emitting an error char.
const (
	kanaPages    = 1    // jisx0201kana.dat holds a single page
	kanaPageSize = 63   // katakana entries per page
	kanaChar0    = 0xa1 // first byte value mapped by the kana table

	cp932Pages    = 45  // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
	cp932PageSize = 189 // 40..fc (including 7f)
	cp932Char0    = 0x40
)

// jisTables holds the decoding tables for cp932/shift-jis.
type jisTables struct {
	page0   [256]rune // single-byte mapping; -1 marks a DBCS lead byte
	dbcsoff [256]int  // lead byte -> page number in the cp932 table
	cp932   []rune    // double-byte table: cp932Pages pages of cp932PageSize runes
}

// translateFromCP932 decodes cp932 (or shift-jis) into UTF-8.
type translateFromCP932 struct {
	tables  *jisTables
	scratch []byte // reusable output buffer
}
// Translate implements Translator. Single bytes are mapped through
// page0; an entry of -1 marks a DBCS lead byte whose follower indexes
// the cp932 table. A trailing lead byte without its second byte is
// left unconsumed (n excludes it) for the next call.
func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
	tables := p.tables
	p.scratch = p.scratch[:0]
	n := 0
	for i := 0; i < len(data); i++ {
		b := data[i]
		r := tables.page0[b]
		if r != -1 {
			p.scratch = appendRune(p.scratch, r)
			n++
			continue
		}
		// DBCS
		i++
		if i >= len(data) {
			// Lead byte with no trailing byte yet: stop without
			// counting it as consumed.
			break
		}
		pnum := tables.dbcsoff[b]
		ix := int(data[i]) - cp932Char0
		if pnum == -1 || ix < 0 || ix >= cp932PageSize {
			r = utf8.RuneError
		} else {
			r = tables.cp932[pnum*cp932PageSize+ix]
		}
		p.scratch = appendRune(p.scratch, r)
		n += 2
	}
	return n, p.scratch, nil
}
// cp932Key is the cache key for the decoding tables; true selects
// plain shift-jis, false the cp932 extensions.
type cp932Key bool

// fromCP932 returns a Translator from cp932 (or, when arg is
// "shiftjis", traditional Shift-JIS) to UTF-8, building and caching
// the decoding tables on first use.
func fromCP932(arg string) (Translator, error) {
	shiftJIS := arg == "shiftjis"
	tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
		tables := new(jisTables)
		kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
		if err != nil {
			return nil, err
		}
		tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
		if err != nil {
			return nil, err
		}
		// jisx0201kana is mapped into 0xA1..0xDF
		for i := 0; i < kanaPageSize; i++ {
			tables.page0[i+kanaChar0] = kana[i]
		}
		// 00..7f same as ascii in cp932
		// NOTE(review): the bound excludes 0x7f (DEL), leaving
		// page0[0x7f] as zero (NUL) — confirm whether this is intended.
		for i := rune(0); i < 0x7f; i++ {
			tables.page0[i] = i
		}
		if shiftJIS {
			// shift-jis uses JIS X 0201 for the ASCII range
			// this is the same as ASCII apart from
			// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
			tables.page0['\\'] = '¥'
			tables.page0['~'] = '¯'
		}
		// pre-calculate DBCS page numbers to mapping file page numbers
		// and mark codes in page0 that are DBCS lead bytes
		pnum := 0
		for i := 0x81; i <= 0x84; i++ {
			tables.page0[i] = -1
			tables.dbcsoff[i] = pnum
			pnum++
		}
		for i := 0x87; i <= 0x9f; i++ {
			tables.page0[i] = -1
			tables.dbcsoff[i] = pnum
			pnum++
		}
		for i := 0xe0; i <= 0xea; i++ {
			tables.page0[i] = -1
			tables.dbcsoff[i] = pnum
			pnum++
		}
		if shiftJIS {
			// Plain shift-jis stops here: no extended lead bytes.
			return tables, nil
		}
		// add in cp932 extensions
		for i := 0xed; i <= 0xee; i++ {
			tables.page0[i] = -1
			tables.dbcsoff[i] = pnum
			pnum++
		}
		for i := 0xfa; i <= 0xfc; i++ {
			tables.page0[i] = -1
			tables.dbcsoff[i] = pnum
			pnum++
		}
		return tables, nil
	})
	if err != nil {
		return nil, err
	}
	return &translateFromCP932{tables: tables.(*jisTables)}, nil
}
// jisGetMap reads the named data file and decodes it as a sequence of
// runes, checking it contains exactly pgsize*npages entries.
func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
	data, err := readFile(name)
	if err != nil {
		return nil, err
	}
	runes := []rune(string(data))
	if want := pgsize * npages; len(runes) != want {
		return nil, fmt.Errorf("%q: incorrect length data", name)
	}
	return runes, nil
}

View File

@ -0,0 +1,40 @@
package charset
import (
"io"
"io/ioutil"
"os"
"path/filepath"
)
// files maps a registered data-file name to a function that opens it.
var files = make(map[string]func() (io.ReadCloser, error))

// RegisterDataFile registers the existence of a given data
// file with the given name that may be used by a character-set converter.
// It is intended to be used by packages that wish to embed
// data in the executable binary, and should not be
// used normally.
func RegisterDataFile(name string, open func() (io.ReadCloser, error)) {
	files[name] = open
}

// CharsetDir gives the location of the default data file directory.
// This directory will be used for files with names that have not
// been registered with RegisterDataFile.
var CharsetDir = "/usr/local/lib/go-charset/datafiles"
// readFile returns the contents of the named charset data file.
// Files registered with RegisterDataFile take precedence; otherwise
// the file is read from CharsetDir.
func readFile(name string) (data []byte, err error) {
	var r io.ReadCloser
	if open := files[name]; open != nil {
		r, err = open()
		if err != nil {
			return
		}
	} else {
		r, err = os.Open(filepath.Join(CharsetDir, name))
		if err != nil {
			return
		}
	}
	// BUG FIX: the reader was never closed, leaking a file descriptor
	// (or registered-file resource) on every call.
	defer r.Close()
	return ioutil.ReadAll(r)
}

View File

@ -0,0 +1,184 @@
// The iconv package provides an interface to the GNU iconv character set
// conversion library (see http://www.gnu.org/software/libiconv/).
// It automatically registers all the character sets with the charset package,
// so it is usually used simply for the side effects of importing it.
// Example:
// import (
// "go-charset.googlecode.com/hg/charset"
// _ "go-charset.googlecode.com/hg/charset/iconv"
// )
package iconv
//#cgo darwin LDFLAGS: -liconv
//#include <stdlib.h>
//#include <iconv.h>
//#include <errno.h>
//iconv_t iconv_open_error = (iconv_t)-1;
//size_t iconv_error = (size_t)-1;
import "C"
import (
"errors"
"fmt"
"github.com/paulrosania/go-charset/charset"
"runtime"
"strings"
"syscall"
"unicode/utf8"
"unsafe"
)
// iconvTranslator wraps an open iconv conversion descriptor.
type iconvTranslator struct {
	cd      C.iconv_t // conversion descriptor from iconv_open
	invalid rune      // rune substituted for invalid multibyte sequences
	scratch []byte    // reusable output buffer
}
// canonicalChar upper-cases ASCII lower-case letters and passes every
// other character through unchanged.
func canonicalChar(c rune) rune {
	if 'a' <= c && c <= 'z' {
		return c + 'A' - 'a'
	}
	return c
}

// canonicalName returns s with ASCII lower-case letters mapped to
// upper case — the form used for iconv encoding names.
func canonicalName(s string) string {
	return strings.Map(canonicalChar, s)
}
// init makes every iconv-supported character set available through
// the charset package.
func init() {
	charset.Register(iconvFactory{})
}

// iconvFactory implements charset.Factory on top of libiconv.
type iconvFactory struct {
}

// TranslatorFrom returns a translator from the named charset to UTF-8;
// invalid input sequences become utf8.RuneError.
func (iconvFactory) TranslatorFrom(name string) (charset.Translator, error) {
	return Translator("UTF-8", name, utf8.RuneError)
}

// TranslatorTo returns a translator from UTF-8 to the named charset;
// unrepresentable characters become '?'.
func (iconvFactory) TranslatorTo(name string) (charset.Translator, error) {
	// BUG This is wrong. The target character set may not be ASCII
	// compatible. There's no easy solution to this other than
	// removing the offending code point.
	return Translator(name, "UTF-8", '?')
}
// Translator returns a Translator that translates between
// the named character sets. When an invalid multibyte
// character is found, the bytes in invalid are substituted instead.
func Translator(toCharset, fromCharset string, invalid rune) (charset.Translator, error) {
	cto, cfrom := C.CString(toCharset), C.CString(fromCharset)
	cd, err := C.iconv_open(cto, cfrom)
	// The C strings are only needed for the iconv_open call.
	C.free(unsafe.Pointer(cfrom))
	C.free(unsafe.Pointer(cto))
	if cd == C.iconv_open_error {
		if err == syscall.EINVAL {
			return nil, errors.New("iconv: conversion not supported")
		}
		return nil, err
	}
	t := &iconvTranslator{cd: cd, invalid: invalid}
	// Close the conversion descriptor when the translator is
	// garbage collected.
	runtime.SetFinalizer(t, func(*iconvTranslator) {
		C.iconv_close(cd)
	})
	return t, nil
}
// Names returns the canonical name of every character set iconv
// knows. A name is canonical when it is the first entry of its own
// alias group.
func (iconvFactory) Names() []string {
	all := aliases()
	names := make([]string, 0, len(all))
	for name, group := range all {
		if group[0] == name {
			names = append(names, name)
		}
	}
	return names
}

// Info returns information about the named character set, or nil if
// iconv does not recognize it.
func (iconvFactory) Info(name string) *charset.Charset {
	lower := strings.ToLower(name)
	group, ok := aliases()[lower]
	if !ok {
		return nil
	}
	return &charset.Charset{
		Name:    lower,
		Aliases: group,
	}
}
// Translate implements charset.Translator via C.iconv. It loops,
// growing the output buffer on E2BIG, substituting p.invalid for
// invalid sequences (EILSEQ), and stopping at an incomplete trailing
// sequence (EINVAL) so the caller can supply the rest later.
func (p *iconvTranslator) Translate(data []byte, eof bool) (rn int, rd []byte, rerr error) {
	n := 0
	p.scratch = p.scratch[:0]
	for len(data) > 0 {
		// Reserve worst-case output space for the remaining input.
		p.scratch = ensureCap(p.scratch, len(p.scratch)+len(data)*utf8.UTFMax)
		cData := (*C.char)(unsafe.Pointer(&data[:1][0]))
		nData := C.size_t(len(data))
		ns := len(p.scratch)
		cScratch := (*C.char)(unsafe.Pointer(&p.scratch[ns : ns+1][0]))
		nScratch := C.size_t(cap(p.scratch) - ns)
		r, err := C.iconv(p.cd, &cData, &nData, &cScratch, &nScratch)
		// iconv advanced the pointers; recompute slice views from the
		// remaining counts.
		p.scratch = p.scratch[0 : cap(p.scratch)-int(nScratch)]
		n += len(data) - int(nData)
		data = data[len(data)-int(nData):]
		if r != C.iconv_error || err == nil {
			return n, p.scratch, nil
		}
		switch err := err.(syscall.Errno); err {
		case C.EILSEQ:
			// invalid multibyte sequence - skip one byte and continue
			p.scratch = appendRune(p.scratch, p.invalid)
			n++
			data = data[1:]
		case C.EINVAL:
			// incomplete multibyte sequence
			return n, p.scratch, nil
		case C.E2BIG:
			// output buffer not large enough; try again with larger buffer.
			p.scratch = ensureCap(p.scratch, cap(p.scratch)+utf8.UTFMax)
		default:
			panic(fmt.Sprintf("unexpected error code: %v", err))
		}
	}
	return n, p.scratch, nil
}
// ensureCap returns s with a capacity of at least n bytes.
// If cap(s) < n, a copy of s with the required capacity is returned;
// otherwise s itself is returned unchanged.
func ensureCap(s []byte, n int) []byte {
	if n <= cap(s) {
		return s
	}
	// Growth policy adapted from the runtime's append: double while
	// small, then grow by 25% until the target is reached.
	m := cap(s)
	if m == 0 {
		m = n
	}
	for m < n {
		if m < 1024 {
			m *= 2
		} else {
			m += m / 4
		}
	}
	t := make([]byte, len(s), m)
	copy(t, s)
	return t
}

// appendRune appends the UTF-8 encoding of r to buf and returns the
// extended slice.
func appendRune(buf []byte, r rune) []byte {
	n := len(buf)
	buf = ensureCap(buf, n+utf8.UTFMax)
	return buf[:n+utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)]
}

View File

@ -0,0 +1,80 @@
// +build !linux
// This file is systemdependent because not all versions
// of iconv have the iconvlist function.
package iconv
//#cgo darwin LDFLAGS: -liconv
//#cgo freebsd LDFLAGS: -liconv
//#cgo windows LDFLAGS: -liconv
//#include <stdlib.h>
//#include <string.h>
//#include <iconv.h>
//#include <errno.h>
//
//typedef struct nameList nameList;
//struct nameList {
// int n;
// char **names;
// nameList *next;
//};
//
//int
//addNames(unsigned int n, const char *const *names, void *data) {
// // we can't call back to Go because of the stack size issue,
// // so copy all the names.
// nameList *hd, *e;
// int i;
//
// hd = data;
// e = malloc(sizeof(nameList));
// e->n = n;
// e->names = malloc(sizeof(char*) * n);
// for(i = 0; i < n; i++){
// e->names[i] = strdup(names[i]);
// }
// e->next = hd->next;
// hd->next = e;
// return 0;
//}
//
//nameList *
//listNames(void) {
// nameList hd;
// hd.next = 0;
// iconvlist(addNames, &hd);
// return hd.next;
//}
import "C"
import (
"strings"
"sync"
"unsafe"
)
var getAliasesOnce sync.Once
var allAliases = map[string][]string{}

// aliases returns the alias table, querying iconvlist (via cgo) on
// first use.
func aliases() map[string][]string {
	getAliasesOnce.Do(getAliases)
	return allAliases
}

// getAliases walks the C name list built by listNames, lower-casing
// every name, freeing the C memory as it goes, and indexing each name
// to its full alias group in allAliases.
func getAliases() {
	var next *C.nameList
	for p := C.listNames(); p != nil; p = next {
		next = p.next
		aliases := make([]string, p.n)
		pnames := (*[1e9]*C.char)(unsafe.Pointer(p.names))
		for i := range aliases {
			aliases[i] = strings.ToLower(C.GoString(pnames[i]))
			C.free(unsafe.Pointer(pnames[i]))
		}
		C.free(unsafe.Pointer(p.names))
		C.free(unsafe.Pointer(p))
		for _, alias := range aliases {
			allAliases[alias] = aliases
		}
	}
}

View File

@ -0,0 +1,176 @@
// +build linux
// We just use a list of names obtained from iconv on a platform
// that allows iconvlist. We could invoke the iconv command,
// but that might fail too, and it gives no information about aliases.
package iconv
import (
"sync"
)
// aliases returns the alias table, building it from aliasData on
// first use.
func aliases() map[string][]string {
	initAliasesOnce.Do(initAliases)
	return allAliases
}

var initAliasesOnce sync.Once
var allAliases map[string][]string

// initAliases indexes every alias in aliasData so each name maps to
// its full alias group.
func initAliases() {
	allAliases = make(map[string][]string)
	for _, a := range aliasData {
		for _, alias := range a {
			allAliases[alias] = a
		}
	}
}
var aliasData = [][]string{
{"437", "cp437", "ibm437", "cspc8codepage437"},
{"850", "cp850", "ibm850", "cspc850multilingual"},
{"852", "cp852", "ibm852", "cspcp852"},
{"855", "cp855", "ibm855", "csibm855"},
{"857", "cp857", "ibm857", "csibm857"},
{"860", "cp860", "ibm860", "csibm860"},
{"861", "cp-is", "cp861", "ibm861", "csibm861"},
{"862", "cp862", "ibm862", "cspc862latinhebrew"},
{"863", "cp863", "ibm863", "csibm863"},
{"865", "cp865", "ibm865", "csibm865"},
{"866", "cp866", "ibm866", "csibm866"},
{"869", "cp-gr", "cp869", "ibm869", "csibm869"},
{"ansi-x3.4-1968", "ansi-x3.4-1986", "ascii", "cp367", "ibm367", "iso-ir-6", "iso646-us", "iso-646.irv:1991", "us", "us-ascii", "csascii"},
{"arabic", "asmo-708", "ecma-114", "iso-8859-6", "iso-ir-127", "iso8859-6", "iso-8859-6", "iso-8859-6:1987", "csisolatinarabic"},
{"armscii-8"},
{"atari", "atarist"},
{"big5-2003"},
{"big-5", "big-five", "big5", "bigfive", "cn-big5", "csbig5"},
{"big5-hkscs:1999"},
{"big5-hkscs:2001"},
{"big5-hkscs", "big5-hkscs:2004", "big5hkscs"},
{"c99"},
{"chinese", "gb-2312-80", "iso-ir-58", "csiso58gb231280"},
{"cn", "gb-1988-80", "iso-ir-57", "iso646-cn", "csiso57gb1988"},
{"cn-gb", "euc-cn", "euccn", "gb2312", "csgb2312"},
{"cn-gb-isoir165", "iso-ir-165"},
{"cp1046"},
{"cp1124"},
{"cp1125"},
{"cp1129"},
{"cp1131"},
{"cp1133", "ibm-cp1133"},
{"cp1161", "ibm-1161", "ibm1161", "csibm1161"},
{"cp1162", "ibm-1162", "ibm1162", "csibm1162"},
{"cp1163", "ibm-1163", "ibm1163", "csibm1163"},
{"cp1250", "ms-ee", "windows-1250"},
{"cp1251", "ms-cyrl", "windows-1251"},
{"cp1252", "ms-ansi", "windows-1252"},
{"cp1253", "ms-greek", "windows-1253"},
{"cp1254", "ms-turk", "windows-1254"},
{"cp1255", "ms-hebr", "windows-1255"},
{"cp1256", "ms-arab", "windows-1256"},
{"cp1257", "winbaltrim", "windows-1257"},
{"cp1258", "windows-1258"},
{"cp1361", "johab"},
{"cp154", "cyrillic-asian", "pt154", "ptcp154", "csptcp154"},
{"cp737"},
{"cp775", "ibm775", "cspc775baltic"},
{"cp819", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso-8859-1", "iso-8859-1:1987", "l1", "latin1", "csisolatin1"},
{"cp853"},
{"cp856"},
{"cp858"},
{"cp864", "ibm864", "csibm864"},
{"cp874", "windows-874"},
{"cp922"},
{"cp932"},
{"cp936", "ms936", "windows-936"},
{"cp943"},
{"cp949", "uhc"},
{"cp950"},
{"cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso-8859-5", "iso-8859-5:1988", "csisolatincyrillic"},
{"dec-hanyu"},
{"dec-kanji"},
{"ecma-118", "elot-928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso-8859-7", "iso-8859-7:1987", "iso-8859-7:2003", "csisolatingreek"},
{"euc-jis-2004", "euc-jisx0213"},
{"euc-jp", "eucjp", "extended-unix-code-packed-format-for-japanese", "cseucpkdfmtjapanese"},
{"euc-kr", "euckr", "cseuckr"},
{"euc-tw", "euctw", "cseuctw"},
{"gb18030"},
{"gbk"},
{"georgian-academy"},
{"georgian-ps"},
{"hebrew", "iso-8859-8", "iso-ir-138", "iso8859-8", "iso-8859-8", "iso-8859-8:1988", "csisolatinhebrew"},
{"hp-roman8", "r8", "roman8", "cshproman8"},
{"hz", "hz-gb-2312"},
{"iso-10646-ucs-2", "ucs-2", "csunicode"},
{"iso-10646-ucs-4", "ucs-4", "csucs4"},
{"iso-2022-cn", "csiso2022cn"},
{"iso-2022-cn-ext"},
{"iso-2022-jp-1"},
{"iso-2022-jp-2004", "iso-2022-jp-3"},
{"iso-2022-jp-2", "csiso2022jp2"},
{"iso-2022-jp", "csiso2022jp"},
{"iso-2022-kr", "csiso2022kr"},
{"iso-8859-10", "iso-ir-157", "iso8859-10", "iso-8859-10", "iso-8859-10:1992", "l6", "latin6", "csisolatin6"},
{"iso-8859-11", "iso8859-11", "iso-8859-11"},
{"iso-8859-13", "iso-ir-179", "iso8859-13", "iso-8859-13", "l7", "latin7"},
{"iso-8859-14", "iso-celtic", "iso-ir-199", "iso8859-14", "iso-8859-14", "iso-8859-14:1998", "l8", "latin8"},
{"iso-8859-15", "iso-ir-203", "iso8859-15", "iso-8859-15", "iso-8859-15:1998", "latin-9"},
{"iso-8859-16", "iso-ir-226", "iso8859-16", "iso-8859-16", "iso-8859-16:2001", "l10", "latin10"},
{"iso-8859-2", "iso-ir-101", "iso8859-2", "iso-8859-2", "iso-8859-2:1987", "l2", "latin2", "csisolatin2"},
{"iso-8859-3", "iso-ir-109", "iso8859-3", "iso-8859-3", "iso-8859-3:1988", "l3", "latin3", "csisolatin3"},
{"iso-8859-4", "iso-ir-110", "iso8859-4", "iso-8859-4", "iso-8859-4:1988", "l4", "latin4", "csisolatin4"},
{"iso-8859-9", "iso-ir-148", "iso8859-9", "iso-8859-9", "iso-8859-9:1989", "l5", "latin5", "csisolatin5"},
{"iso-ir-149", "korean", "ksc-5601", "ks-c-5601-1987", "ks-c-5601-1989", "csksc56011987"},
{"iso-ir-14", "iso646-jp", "jis-c6220-1969-ro", "jp", "csiso14jisc6220ro"},
{"iso-ir-159", "jis-x0212", "jis-x0212-1990", "jis-x0212.1990-0", "x0212", "csiso159jisx02121990"},
{"iso-ir-166", "tis-620", "tis620", "tis620-0", "tis620.2529-1", "tis620.2533-0", "tis620.2533-1"},
{"iso-ir-230", "tds565"},
{"iso-ir-87", "jis0208", "jis-c6226-1983", "jis-x0208", "jis-x0208-1983", "jis-x0208-1990", "x0208", "csiso87jisx0208"},
{"java"},
{"jisx0201-1976", "jis-x0201", "x0201", "cshalfwidthkatakana"},
{"koi8-r", "cskoi8r"},
{"koi8-ru"},
{"koi8-t"},
{"koi8-u"},
{"kz-1048", "rk1048", "strk1048-2002", "cskz1048"},
{"macarabic"},
{"maccentraleurope"},
{"maccroatian"},
{"maccyrillic"},
{"macgreek"},
{"machebrew"},
{"maciceland"},
{"mac", "macintosh", "macroman", "csmacintosh"},
{"macromania"},
{"macthai"},
{"macturkish"},
{"macukraine"},
{"ms-kanji", "shift-jis", "shift-jis", "sjis", "csshiftjis"},
{" MS-Windows", "Japanese", "(cp932)"},
{"mulelao-1"},
{"nextstep"},
{"riscos-latin1"},
{"shift-jis-2004", "shift-jisx0213"},
{"tcvn", "tcvn-5712", "tcvn5712-1", "tcvn5712-1:1993"},
{"ucs-2be", "unicode-1-1", "unicodebig", "csunicode11"},
{"ucs-2-internal"},
{"ucs-2le", "unicodelittle"},
{"ucs-2-swapped"},
{"ucs-4be"},
{"ucs-4-internal"},
{"ucs-4le"},
{"ucs-4-swapped"},
{"unicode-1-1-utf-7", "utf-7", "csunicode11utf7"},
{"utf-16"},
{"utf-16be"},
{"utf-16le"},
{"utf-32"},
{"utf-32be"},
{"utf-32le"},
{"utf-8"},
{"utf-8-mac", "utf8-mac"},
{"viscii", "viscii1.1-1", "csviscii"},
{"windows-31j", "cp932"},
}

View File

@ -0,0 +1,162 @@
package charset
import (
"encoding/json"
"fmt"
"os"
"sync"
)
var (
	readLocalCharsetsOnce sync.Once // guards readLocalCharsets
	localCharsets         = make(map[string]*localCharset)
)

// localCharset ties a Charset description to the class and argument
// used to instantiate its translators.
type localCharset struct {
	Charset
	arg string // class-specific argument from the config file
	*class
}

// A class of character sets.
// Each class can be instantiated with an argument specified in the config file.
// Many character sets can use a single class.
type class struct {
	from, to func(arg string) (Translator, error)
}

// The set of classes, indexed by class name.
var classes = make(map[string]*class)

// registerClass makes a translator class available under the given
// name. Either direction may be nil when unsupported.
func registerClass(charset string, from, to func(arg string) (Translator, error)) {
	classes[charset] = &class{from, to}
}
// localFactory implements Factory using the data-file based character
// sets configured in charsets.json.
type localFactory struct{}

// TranslatorFrom returns a translator from the named character set to
// UTF-8, or an error if the set is unknown or cannot be decoded.
func (f localFactory) TranslatorFrom(name string) (Translator, error) {
	f.init()
	name = NormalizedName(name)
	cs := localCharsets[name]
	switch {
	case cs == nil:
		return nil, fmt.Errorf("character set %q not found", name)
	case cs.from == nil:
		return nil, fmt.Errorf("cannot translate from %q", name)
	}
	return cs.from(cs.arg)
}
// TranslatorTo returns a translator from UTF-8 to the named character
// set, or an error if the set is unknown or cannot be encoded.
func (f localFactory) TranslatorTo(name string) (Translator, error) {
	f.init()
	name = NormalizedName(name)
	cs := localCharsets[name]
	switch {
	case cs == nil:
		return nil, fmt.Errorf("character set %q not found", name)
	case cs.to == nil:
		return nil, fmt.Errorf("cannot translate to %q", name)
	}
	return cs.to(cs.arg)
}
// Names returns the names of the registered character sets.
// NOTE(review): the condition keeps any name whose charset is
// canonically registered, which includes alias names as well.
func (f localFactory) Names() []string {
	f.init()
	var names []string
	for name, cs := range localCharsets {
		if localCharsets[cs.Name] != cs {
			continue
		}
		names = append(names, name)
	}
	return names
}
// Info returns a copy of the metadata for the named charset, or nil if
// it is not registered. A copy is returned so that callers cannot
// mutate the registry entry.
func (f localFactory) Info(name string) *Charset {
	f.init()
	if lcs := localCharsets[NormalizedName(name)]; lcs != nil {
		info := lcs.Charset
		return &info
	}
	return nil
}
// init lazily loads the charset configuration exactly once; every
// exported factory method calls it before touching localCharsets.
func (f localFactory) init() {
	readLocalCharsetsOnce.Do(readLocalCharsets)
}
// charsetEntry is the data structure for one entry in the JSON config
// file. Class names an entry in classes and Arg is the argument used
// to instantiate it; Aliases lists alternative names for the charset.
// NOTE(review): an earlier version of this comment described an Alias
// field that no longer exists in the struct.
type charsetEntry struct {
	Aliases []string
	Desc    string
	Class   string
	Arg     string
}
// readLocalCharsets reads the JSON config file ("charsets.json") and
// populates localCharsets. It runs once only, via readLocalCharsetsOnce,
// when the data is first needed. Failures are reported on stderr and
// leave the registry empty rather than aborting, so later lookups
// simply find nothing.
func readLocalCharsets() {
	csdata, err := readFile("charsets.json")
	if err != nil {
		fmt.Fprintf(os.Stderr, "charset: cannot open \"charsets.json\": %v\n", err)
		return
	}
	var entries map[string]charsetEntry
	err = json.Unmarshal(csdata, &entries)
	if err != nil {
		fmt.Fprintf(os.Stderr, "charset: cannot decode config file: %v\n", err)
		// Don't register anything from a half-decoded map.
		return
	}
	for name, e := range entries {
		// cls (not "class") avoids shadowing the class type.
		cls := classes[e.Class]
		if cls == nil {
			// Entry references an unregistered class; skip it.
			continue
		}
		name = NormalizedName(name)
		for i, a := range e.Aliases {
			e.Aliases[i] = NormalizedName(a)
		}
		cs := &localCharset{
			Charset: Charset{
				Name:    name,
				Aliases: e.Aliases,
				Desc:    e.Desc,
				NoFrom:  cls.from == nil,
				NoTo:    cls.to == nil,
			},
			arg:   e.Arg,
			class: cls,
		}
		// Register under the canonical name and every alias.
		localCharsets[cs.Name] = cs
		for _, a := range cs.Aliases {
			localCharsets[a] = cs
		}
	}
}
// A general cache store that local character set translators
// can use for persistent storage of data.
var (
	cacheMutex sync.Mutex
	cacheStore = make(map[interface{}]interface{})
)

// cache returns the value stored under key, computing and memoizing it
// with f on first use. It is safe for concurrent callers; note that f
// runs while the cache lock is held.
func cache(key interface{}, f func() (interface{}, error)) (interface{}, error) {
	cacheMutex.Lock()
	defer cacheMutex.Unlock()
	if cached, ok := cacheStore[key]; ok && cached != nil {
		return cached, nil
	}
	val, err := f()
	if err != nil {
		return nil, err
	}
	cacheStore[key] = val
	return val, nil
}

View File

@ -0,0 +1,110 @@
package charset
import (
"encoding/binary"
"errors"
"unicode/utf8"
)
// init registers the "utf16" class; the config file instantiates it
// with an argument selecting the byte order ("le", "be" or "").
func init() {
	registerClass("utf16", fromUTF16, toUTF16)
}
// translateFromUTF16 converts UTF-16 input to UTF-8 output.
type translateFromUTF16 struct {
	first   bool             // true until the leading BOM has been examined
	endian  binary.ByteOrder // nil until determined from BOM or guessed
	scratch []byte           // reusable output buffer
}

// Translate decodes whole 16-bit units from data into UTF-8, consuming
// a leading BOM (if the byte order is not already fixed) to pick the
// endianness. An odd trailing byte is left unconsumed.
func (p *translateFromUTF16) Translate(data []byte, eof bool) (int, []byte, error) {
	data = data[:len(data)&^1] // round to even number of bytes.
	if len(data) < 2 {
		return 0, nil, nil
	}
	n := 0
	if p.first && p.endian == nil {
		switch binary.BigEndian.Uint16(data) {
		case 0xfeff:
			p.endian = binary.BigEndian
			data = data[2:]
			n += 2
		case 0xfffe:
			p.endian = binary.LittleEndian
			data = data[2:]
			n += 2
		default:
			// No BOM present: fall back to a heuristic.
			p.endian = guessEndian(data)
		}
		p.first = false
	}
	out := p.scratch[:0]
	for len(data) >= 2 {
		out = appendRune(out, rune(p.endian.Uint16(data)))
		data = data[2:]
		n += 2
	}
	p.scratch = out
	return n, p.scratch, nil
}
// guessEndian picks a byte order for BOM-less UTF-16 data. It is
// currently a stub that always assumes little-endian — see the TODO.
func guessEndian(data []byte) binary.ByteOrder {
	// XXX TODO
	return binary.LittleEndian
}
// translateToUTF16 converts UTF-8 input to UTF-16 output.
type translateToUTF16 struct {
	first   bool             // write a BOM before the first output
	endian  binary.ByteOrder // byte order for the output units
	scratch []byte           // reusable output buffer
}

// Translate encodes complete runes from data as 16-bit units in
// p.endian order, leaving a trailing incomplete rune unconsumed unless
// eof is set. On the first call (when p.first is set) a BOM is emitted.
func (p *translateToUTF16) Translate(data []byte, eof bool) (int, []byte, error) {
	out := ensureCap(p.scratch[:0], (len(data)+1)*2)
	if p.first {
		out = out[:2]
		p.endian.PutUint16(out, 0xfeff)
		p.first = false
	}
	n := 0
	for len(data) > 0 {
		if !eof && !utf8.FullRune(data) {
			break
		}
		r, size := utf8.DecodeRune(data)
		// TODO if r > 65535?
		end := len(out)
		out = out[:end+2]
		p.endian.PutUint16(out[end:], uint16(r))
		data = data[size:]
		n += size
	}
	p.scratch = out
	return n, p.scratch, nil
}
func getEndian(arg string) (binary.ByteOrder, error) {
switch arg {
case "le":
return binary.LittleEndian, nil
case "be":
return binary.BigEndian, nil
case "":
return nil, nil
}
return nil, errors.New("charset: unknown utf16 endianness")
}
// fromUTF16 is the class constructor for translating UTF-16 into
// UTF-8; the argument selects the byte order ("le", "be" or "").
func fromUTF16(arg string) (Translator, error) {
	byteOrder, err := getEndian(arg)
	if err != nil {
		return nil, err
	}
	t := &translateFromUTF16{first: true, endian: byteOrder}
	return t, nil
}
// toUTF16 is the class constructor for translating UTF-8 into UTF-16.
// With an explicit endianness ("le"/"be") no BOM is written; with no
// argument the translator defaults to big-endian and emits a BOM so
// readers can detect the byte order. (Previously the nil byte order
// returned by getEndian("") was stored directly, so the translator's
// first PutUint16 call panicked, and the BOM branch was dead code
// because first was always false.)
func toUTF16(arg string) (Translator, error) {
	endian, err := getEndian(arg)
	if err != nil {
		return nil, err
	}
	if endian == nil {
		return &translateToUTF16{first: true, endian: binary.BigEndian}, nil
	}
	return &translateToUTF16{first: false, endian: endian}, nil
}

View File

@ -0,0 +1,51 @@
package charset
import (
"unicode/utf8"
)
// init registers the "utf8" class; the same constructor serves both
// directions, since both just sanitize input into valid UTF-8.
func init() {
	registerClass("utf8", toUTF8, toUTF8)
}
// translateToUTF8 sanitizes arbitrary input into valid UTF-8.
type translateToUTF8 struct {
	scratch []byte // reusable output buffer, grown on demand
}

// errorBytes is the UTF-8 encoding of utf8.RuneError, substituted for
// each byte of a malformed sequence.
var errorBytes = []byte(string(utf8.RuneError))

// errorRuneLen bounds the output produced per input byte: each input
// byte yields at most one replacement rune.
const errorRuneLen = len(string(utf8.RuneError))

// Translate copies data, replacing every malformed UTF-8 byte with
// utf8.RuneError. If the input ends with an incomplete rune and eof is
// false, the trailing bytes are left unconsumed so the caller can
// retry them with more input.
func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) {
	// Worst case: every input byte becomes a replacement rune.
	if need := len(data) * errorRuneLen; cap(p.scratch) < need {
		p.scratch = make([]byte, 0, need)
	}
	buf := p.scratch[:0]
	for i := 0; i < len(data); {
		// fast path for ASCII
		if b := data[i]; b < utf8.RuneSelf {
			buf = append(buf, b)
			i++
			continue
		}
		_, size := utf8.DecodeRune(data[i:])
		if size == 1 {
			// DecodeRune consumed a single non-ASCII byte, so the
			// sequence at data[i:] is malformed or incomplete.
			// BUGFIX: check data[i:], not data — the old code tested
			// the start of the buffer, so an incomplete rune at the
			// end was wrongly mangled whenever i > 0.
			if !eof && !utf8.FullRune(data[i:]) {
				// Incomplete rune at end of input: return it to be
				// processed in a subsequent call.
				return i, buf, nil
			}
			buf = append(buf, errorBytes...)
		} else {
			buf = append(buf, data[i:i+size]...)
		}
		i += size
	}
	return len(data), buf, nil
}
// toUTF8 is the class constructor for "utf8"; the argument is unused.
func toUTF8(arg string) (Translator, error) {
	return &translateToUTF8{}, nil
}