Remove big vendor
parent
ec4a28e0eb
commit
281f915f17
|
|
@ -1,335 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package encoding defines an interface for character encodings, such as Shift
|
||||
// JIS and Windows 1252, that can convert to and from UTF-8.
|
||||
//
|
||||
// Encoding implementations are provided in other packages, such as
|
||||
// golang.org/x/text/encoding/charmap and
|
||||
// golang.org/x/text/encoding/japanese.
|
||||
package encoding // import "golang.org/x/text/encoding"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"strconv"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// TODO:
|
||||
// - There seems to be some inconsistency in when decoders return errors
|
||||
// and when not. Also documentation seems to suggest they shouldn't return
|
||||
// errors at all (except for UTF-16).
|
||||
// - Encoders seem to rely on or at least benefit from the input being in NFC
|
||||
// normal form. Perhaps add an example how users could prepare their output.
|
||||
|
||||
// Encoding is a character set encoding that can be transformed to and from
|
||||
// UTF-8.
|
||||
type Encoding interface {
|
||||
// NewDecoder returns a Decoder.
|
||||
NewDecoder() *Decoder
|
||||
|
||||
// NewEncoder returns an Encoder.
|
||||
NewEncoder() *Encoder
|
||||
}
|
||||
|
||||
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Transforming source bytes that are not of that encoding will not result in an
|
||||
// error per se. Each byte that cannot be transcoded will be represented in the
|
||||
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
|
||||
type Decoder struct {
|
||||
transform.Transformer
|
||||
|
||||
// This forces external creators of Decoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{}
|
||||
}
|
||||
|
||||
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
|
||||
// bytes or nil, err if any error occurred.
|
||||
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
|
||||
b, _, err := transform.Bytes(d, b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// String converts the given encoded string to UTF-8. It returns the converted
|
||||
// string or "", err if any error occurred.
|
||||
func (d *Decoder) String(s string) (string, error) {
|
||||
s, _, err := transform.String(d, s)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Reader wraps another Reader to decode its bytes.
|
||||
//
|
||||
// The Decoder may not be used for any other operation as long as the returned
|
||||
// Reader is in use.
|
||||
func (d *Decoder) Reader(r io.Reader) io.Reader {
|
||||
return transform.NewReader(r, d)
|
||||
}
|
||||
|
||||
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Each rune that cannot be transcoded will result in an error. In this case,
|
||||
// the transform will consume all source byte up to, not including the offending
|
||||
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
|
||||
// `\uFFFD`. To return early with an error instead, use transform.Chain to
|
||||
// preprocess the data with a UTF8Validator.
|
||||
type Encoder struct {
|
||||
transform.Transformer
|
||||
|
||||
// This forces external creators of Encoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{}
|
||||
}
|
||||
|
||||
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
|
||||
// any error occurred.
|
||||
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
|
||||
b, _, err := transform.Bytes(e, b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// String converts a string from UTF-8. It returns the converted string or
|
||||
// "", err if any error occurred.
|
||||
func (e *Encoder) String(s string) (string, error) {
|
||||
s, _, err := transform.String(e, s)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Writer wraps another Writer to encode its UTF-8 output.
|
||||
//
|
||||
// The Encoder may not be used for any other operation as long as the returned
|
||||
// Writer is in use.
|
||||
func (e *Encoder) Writer(w io.Writer) io.Writer {
|
||||
return transform.NewWriter(w, e)
|
||||
}
|
||||
|
||||
// ASCIISub is the ASCII substitute character, as recommended by
|
||||
// https://unicode.org/reports/tr36/#Text_Comparison
|
||||
const ASCIISub = '\x1a'
|
||||
|
||||
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
||||
// bytes; it does not replace invalid UTF-8 sequences.
|
||||
var Nop Encoding = nop{}
|
||||
|
||||
type nop struct{}
|
||||
|
||||
func (nop) NewDecoder() *Decoder {
|
||||
return &Decoder{Transformer: transform.Nop}
|
||||
}
|
||||
func (nop) NewEncoder() *Encoder {
|
||||
return &Encoder{Transformer: transform.Nop}
|
||||
}
|
||||
|
||||
// Replacement is the replacement encoding. Decoding from the replacement
|
||||
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
||||
// the replacement encoding yields the same as the source bytes except that
|
||||
// invalid UTF-8 is converted to '\uFFFD'.
|
||||
//
|
||||
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
||||
var Replacement Encoding = replacement{}
|
||||
|
||||
type replacement struct{}
|
||||
|
||||
func (replacement) NewDecoder() *Decoder {
|
||||
return &Decoder{Transformer: replacementDecoder{}}
|
||||
}
|
||||
|
||||
func (replacement) NewEncoder() *Encoder {
|
||||
return &Encoder{Transformer: replacementEncoder{}}
|
||||
}
|
||||
|
||||
func (replacement) ID() (mib identifier.MIB, other string) {
|
||||
return identifier.Replacement, ""
|
||||
}
|
||||
|
||||
type replacementDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if len(dst) < 3 {
|
||||
return 0, 0, transform.ErrShortDst
|
||||
}
|
||||
if atEOF {
|
||||
const fffd = "\ufffd"
|
||||
dst[0] = fffd[0]
|
||||
dst[1] = fffd[1]
|
||||
dst[2] = fffd[2]
|
||||
nDst = 3
|
||||
}
|
||||
return nDst, len(src), nil
|
||||
}
|
||||
|
||||
type replacementEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with HTML escape sequences.
|
||||
//
|
||||
// This wrapper exists to comply to URL and HTML forms requiring a
|
||||
// non-terminating legacy encoder. The produced sequences may lead to data
|
||||
// loss as they are indistinguishable from legitimate input. To avoid this
|
||||
// issue, use UTF-8 encodings whenever possible.
|
||||
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
|
||||
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
|
||||
}
|
||||
|
||||
// ReplaceUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with an encoding-specific
|
||||
// replacement.
|
||||
//
|
||||
// This wrapper is only provided for backwards compatibility and legacy
|
||||
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
|
||||
func ReplaceUnsupported(e *Encoder) *Encoder {
|
||||
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
|
||||
}
|
||||
|
||||
type errorHandler struct {
|
||||
*Encoder
|
||||
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
|
||||
}
|
||||
|
||||
// TODO: consider making this error public in some form.
|
||||
type repertoireError interface {
|
||||
Replacement() byte
|
||||
}
|
||||
|
||||
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
|
||||
for err != nil {
|
||||
rerr, ok := err.(repertoireError)
|
||||
if !ok {
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
r, sz := utf8.DecodeRune(src[nSrc:])
|
||||
n, ok := h.handler(dst[nDst:], r, rerr)
|
||||
if !ok {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
err = nil
|
||||
nDst += n
|
||||
if nSrc += sz; nSrc < len(src) {
|
||||
var dn, sn int
|
||||
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
|
||||
nDst += dn
|
||||
nSrc += sn
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||
buf := [8]byte{}
|
||||
b := strconv.AppendUint(buf[:0], uint64(r), 10)
|
||||
if n = len(b) + len("&#;"); n >= len(dst) {
|
||||
return 0, false
|
||||
}
|
||||
dst[0] = '&'
|
||||
dst[1] = '#'
|
||||
dst[copy(dst[2:], b)+2] = ';'
|
||||
return n, true
|
||||
}
|
||||
|
||||
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||
if len(dst) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
dst[0] = err.Replacement()
|
||||
return 1, true
|
||||
}
|
||||
|
||||
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
|
||||
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
||||
|
||||
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
||||
// input byte that is not valid UTF-8.
|
||||
var UTF8Validator transform.Transformer = utf8Validator{}
|
||||
|
||||
type utf8Validator struct{ transform.NopResetter }
|
||||
|
||||
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
n := len(src)
|
||||
if n > len(dst) {
|
||||
n = len(dst)
|
||||
}
|
||||
for i := 0; i < n; {
|
||||
if c := src[i]; c < utf8.RuneSelf {
|
||||
dst[i] = c
|
||||
i++
|
||||
continue
|
||||
}
|
||||
_, size := utf8.DecodeRune(src[i:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
err = ErrInvalidUTF8
|
||||
if !atEOF && !utf8.FullRune(src[i:]) {
|
||||
err = transform.ErrShortSrc
|
||||
}
|
||||
return i, i, err
|
||||
}
|
||||
if i+size > len(dst) {
|
||||
return i, i, transform.ErrShortDst
|
||||
}
|
||||
for ; size > 0; size-- {
|
||||
dst[i] = src[i]
|
||||
i++
|
||||
}
|
||||
}
|
||||
if len(src) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}
|
||||
|
|
@ -1,225 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// EUCJP is the EUC-JP encoding.
|
||||
var EUCJP encoding.Encoding = &eucJP
|
||||
|
||||
var eucJP = internal.Encoding{
|
||||
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}},
|
||||
"EUC-JP",
|
||||
identifier.EUCPkdFmtJapanese,
|
||||
}
|
||||
|
||||
type eucJPDecoder struct{ transform.NopResetter }
|
||||
|
||||
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
|
||||
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case c0 == 0x8e:
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case c1 < 0xa1:
|
||||
r, size = utf8.RuneError, 1
|
||||
case c1 > 0xdf:
|
||||
r, size = utf8.RuneError, 2
|
||||
if c1 == 0xff {
|
||||
size = 1
|
||||
}
|
||||
default:
|
||||
r, size = rune(c1)+(0xff61-0xa1), 2
|
||||
}
|
||||
case c0 == 0x8f:
|
||||
if nSrc+2 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe {
|
||||
size = 2
|
||||
}
|
||||
break
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c1 < 0xa1 || 0xfe < c1 {
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
c2 := src[nSrc+2]
|
||||
if c2 < 0xa1 || 0xfe < c2 {
|
||||
r, size = utf8.RuneError, 2
|
||||
break
|
||||
}
|
||||
r, size = utf8.RuneError, 3
|
||||
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
|
||||
r = rune(jis0212Decode[i])
|
||||
if r == 0 {
|
||||
r = utf8.RuneError
|
||||
}
|
||||
}
|
||||
|
||||
case 0xa1 <= c0 && c0 <= 0xfe:
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c1 < 0xa1 || 0xfe < c1 {
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
r, size = utf8.RuneError, 2
|
||||
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
if r == 0 {
|
||||
r = utf8.RuneError
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
r, size = utf8.RuneError, 1
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type eucJPEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
goto write2
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2or3:
|
||||
if r>>tableShift == jis0208 {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = 0x8f
|
||||
nDst++
|
||||
}
|
||||
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = 0x8e
|
||||
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 6 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,299 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// ISO2022JP is the ISO-2022-JP encoding.
|
||||
var ISO2022JP encoding.Encoding = &iso2022JP
|
||||
|
||||
var iso2022JP = internal.Encoding{
|
||||
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
|
||||
"ISO-2022-JP",
|
||||
identifier.ISO2022JP,
|
||||
}
|
||||
|
||||
func iso2022JPNewDecoder() transform.Transformer {
|
||||
return new(iso2022JPDecoder)
|
||||
}
|
||||
|
||||
func iso2022JPNewEncoder() transform.Transformer {
|
||||
return new(iso2022JPEncoder)
|
||||
}
|
||||
|
||||
const (
|
||||
asciiState = iota
|
||||
katakanaState
|
||||
jis0208State
|
||||
jis0212State
|
||||
)
|
||||
|
||||
const asciiEsc = 0x1b
|
||||
|
||||
type iso2022JPDecoder int
|
||||
|
||||
func (d *iso2022JPDecoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
c0 := src[nSrc]
|
||||
if c0 >= utf8.RuneSelf {
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
|
||||
if c0 == asciiEsc {
|
||||
if nSrc+2 >= len(src) {
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
// TODO: is it correct to only skip 1??
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
size = 3
|
||||
c1 := src[nSrc+1]
|
||||
c2 := src[nSrc+2]
|
||||
switch {
|
||||
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
|
||||
*d = jis0208State
|
||||
continue
|
||||
case c1 == '$' && c2 == '(': // 0x24 0x28
|
||||
if nSrc+3 >= len(src) {
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
size = 4
|
||||
if src[nSrc+3] == 'D' {
|
||||
*d = jis0212State
|
||||
continue
|
||||
}
|
||||
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
|
||||
*d = asciiState
|
||||
continue
|
||||
case c1 == '(' && c2 == 'I': // 0x28 0x49
|
||||
*d = katakanaState
|
||||
continue
|
||||
}
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
|
||||
switch *d {
|
||||
case asciiState:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case katakanaState:
|
||||
if c0 < 0x21 || 0x60 <= c0 {
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
r, size = rune(c0)+(0xff61-0x21), 1
|
||||
|
||||
default:
|
||||
if c0 == 0x0a {
|
||||
*d = asciiState
|
||||
r, size = rune(c0), 1
|
||||
goto write
|
||||
}
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
size = 2
|
||||
c1 := src[nSrc+1]
|
||||
i := int(c0-0x21)*94 + int(c1-0x21)
|
||||
if *d == jis0208State && i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
} else if *d == jis0212State && i < len(jis0212Decode) {
|
||||
r = rune(jis0212Decode[i])
|
||||
} else {
|
||||
r = '\ufffd'
|
||||
goto write
|
||||
}
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type iso2022JPEncoder int
|
||||
|
||||
func (e *iso2022JPEncoder) Reset() {
|
||||
*e = asciiState
|
||||
}
|
||||
|
||||
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
//
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
|
||||
// is not used by the iso-2022-jp encoder due to lack of widespread support".
|
||||
//
|
||||
// TODO: do we have to special-case U+00A5 and U+203E, as per
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp
|
||||
// Doing so would mean that "\u00a5" would not be preserved
|
||||
// after an encode-decode round trip.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
goto writeKatakana
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
}
|
||||
|
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
if *e != asciiState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
writeJIS:
|
||||
if *e != jis0208State {
|
||||
if nDst+5 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = jis0208State
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '$'
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
} else if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0x21 + uint8(r)&codeMask
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
writeKatakana:
|
||||
if *e != katakanaState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = katakanaState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'I'
|
||||
nDst += 3
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r - (0xff61 - 0x21))
|
||||
nDst++
|
||||
continue
|
||||
}
|
||||
if atEOF && err == nil && *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
} else {
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
// TODO: Emoji extensions?
|
||||
// https://www.unicode.org/faq/emoji_dingbats.html
|
||||
// https://www.unicode.org/Public/UNIDATA/EmojiSources.txt
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type entry struct {
|
||||
jisCode, table int
|
||||
}
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
|
||||
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")
|
||||
|
||||
reverse := [65536]entry{}
|
||||
for i := range reverse {
|
||||
reverse[i].table = -1
|
||||
}
|
||||
|
||||
tables := []struct {
|
||||
url string
|
||||
name string
|
||||
}{
|
||||
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
|
||||
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
|
||||
}
|
||||
for i, table := range tables {
|
||||
res, err := http.Get(table.url)
|
||||
if err != nil {
|
||||
log.Fatalf("%q: Get: %v", table.url, err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := 0, uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("%q: could not parse %q", table.url, s)
|
||||
}
|
||||
if x < 0 || 120*94 <= x {
|
||||
log.Fatalf("%q: JIS code %d is out of range", table.url, x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y].table == -1 {
|
||||
reverse[y] = entry{jisCode: x, table: i}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("%q: scanner error: %v", table.url, err)
|
||||
}
|
||||
|
||||
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
|
||||
table.name, table.name, table.url)
|
||||
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
|
||||
for i, m := range mapping {
|
||||
if m != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, m)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v.table == -1 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const (\n")
|
||||
fmt.Printf("\tjis0208 = 1\n")
|
||||
fmt.Printf("\tjis0212 = 2\n")
|
||||
fmt.Printf("\tcodeMask = 0x7f\n")
|
||||
fmt.Printf("\tcodeShift = 7\n")
|
||||
fmt.Printf("\ttableShift = 14\n")
|
||||
fmt.Printf(")\n\n")
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("//\n")
|
||||
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
|
||||
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
|
||||
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
|
||||
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x.table == -1 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
|
||||
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
@ -1,189 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
|
||||
// Windows-31J.
|
||||
var ShiftJIS encoding.Encoding = &shiftJIS
|
||||
|
||||
var shiftJIS = internal.Encoding{
|
||||
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
|
||||
"Shift JIS",
|
||||
identifier.ShiftJIS,
|
||||
}
|
||||
|
||||
type shiftJISDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0xa1 <= c0 && c0 < 0xe0:
|
||||
r, size = rune(c0)+(0xff61-0xa1), 1
|
||||
|
||||
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
|
||||
if c0 <= 0x9f {
|
||||
c0 -= 0x70
|
||||
} else {
|
||||
c0 -= 0xb0
|
||||
}
|
||||
c0 = 2*c0 - 0x21
|
||||
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 1
|
||||
goto write
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case c1 < 0x40:
|
||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
|
||||
goto write
|
||||
case c1 < 0x7f:
|
||||
c0--
|
||||
c1 -= 0x40
|
||||
case c1 == 0x7f:
|
||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
|
||||
goto write
|
||||
case c1 < 0x9f:
|
||||
c0--
|
||||
c1 -= 0x41
|
||||
case c1 < 0xfd:
|
||||
c1 -= 0x9f
|
||||
default:
|
||||
r, size = '\ufffd', 2
|
||||
goto write
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
case c0 == 0x80:
|
||||
r, size = 0x80, 1
|
||||
|
||||
default:
|
||||
r, size = '\ufffd', 1
|
||||
}
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type shiftJISEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
r -= 0xff61 - 0xa1
|
||||
goto write1
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write1:
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
j1 := uint8(r>>codeShift) & codeMask
|
||||
j2 := uint8(r) & codeMask
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
if j1 <= 61 {
|
||||
dst[nDst+0] = 129 + j1/2
|
||||
} else {
|
||||
dst[nDst+0] = 193 + j1/2
|
||||
}
|
||||
if j1&1 == 0 {
|
||||
dst[nDst+1] = j2 + j2/63 + 64
|
||||
} else {
|
||||
dst[nDst+1] = j2 + 159
|
||||
}
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
@ -1,177 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package korean
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCKR}
|
||||
|
||||
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
|
||||
var EUCKR encoding.Encoding = &eucKR
|
||||
|
||||
var eucKR = internal.Encoding{
|
||||
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
|
||||
"EUC-KR",
|
||||
identifier.EUCKR,
|
||||
}
|
||||
|
||||
type eucKRDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0x81 <= c0 && c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
size = 2
|
||||
if c0 < 0xc7 {
|
||||
r = 178 * rune(c0-0x81)
|
||||
switch {
|
||||
case 0x41 <= c1 && c1 < 0x5b:
|
||||
r += rune(c1) - (0x41 - 0*26)
|
||||
case 0x61 <= c1 && c1 < 0x7b:
|
||||
r += rune(c1) - (0x61 - 1*26)
|
||||
case 0x81 <= c1 && c1 < 0xff:
|
||||
r += rune(c1) - (0x81 - 2*26)
|
||||
default:
|
||||
goto decError
|
||||
}
|
||||
} else if 0xa1 <= c1 && c1 < 0xff {
|
||||
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
|
||||
} else {
|
||||
goto decError
|
||||
}
|
||||
if int(r) < len(decode) {
|
||||
r = rune(decode[r])
|
||||
if r != 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
decError:
|
||||
r = utf8.RuneError
|
||||
if c1 < utf8.RuneSelf {
|
||||
size = 1
|
||||
}
|
||||
|
||||
default:
|
||||
r, size = utf8.RuneError, 1
|
||||
break
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type eucKREncoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 7 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
|
||||
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")
|
||||
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
reverse := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
|
||||
log.Fatalf("EUC-KR code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y] == 0 {
|
||||
c0, c1 := uint16(0), uint16(0)
|
||||
if x < 178*(0xc7-0x81) {
|
||||
c0 = uint16(x/178) + 0x81
|
||||
c1 = uint16(x % 178)
|
||||
switch {
|
||||
case c1 < 1*26:
|
||||
c1 += 0x41
|
||||
case c1 < 2*26:
|
||||
c1 += 0x47
|
||||
default:
|
||||
c1 += 0x4d
|
||||
}
|
||||
} else {
|
||||
x -= 178 * (0xc7 - 0x81)
|
||||
c0 = uint16(x/94) + 0xc7
|
||||
c1 = uint16(x%94) + 0xa1
|
||||
}
|
||||
reverse[y] = c0<<8 | c1
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}
|
||||
|
|
@ -1,269 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
var (
|
||||
// GB18030 is the GB18030 encoding.
|
||||
GB18030 encoding.Encoding = &gbk18030
|
||||
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
|
||||
// and is also known as Code Page 936.
|
||||
GBK encoding.Encoding = &gbk
|
||||
)
|
||||
|
||||
var gbk = internal.Encoding{
|
||||
&internal.SimpleEncoding{
|
||||
gbkDecoder{gb18030: false},
|
||||
gbkEncoder{gb18030: false},
|
||||
},
|
||||
"GBK",
|
||||
identifier.GBK,
|
||||
}
|
||||
|
||||
var gbk18030 = internal.Encoding{
|
||||
&internal.SimpleEncoding{
|
||||
gbkDecoder{gb18030: true},
|
||||
gbkEncoder{gb18030: true},
|
||||
},
|
||||
"GB18030",
|
||||
identifier.GB18030,
|
||||
}
|
||||
|
||||
type gbkDecoder struct {
|
||||
transform.NopResetter
|
||||
gb18030 bool
|
||||
}
|
||||
|
||||
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
case c0 == 0x80:
|
||||
r, size = '€', 1
|
||||
|
||||
case c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case 0x40 <= c1 && c1 < 0x7f:
|
||||
c1 -= 0x40
|
||||
case 0x80 <= c1 && c1 < 0xff:
|
||||
c1 -= 0x41
|
||||
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
|
||||
if nSrc+3 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
// The second byte here is always ASCII, so we can set size
|
||||
// to 1 in all cases.
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
c2 := src[nSrc+2]
|
||||
if c2 < 0x81 || 0xff <= c2 {
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
c3 := src[nSrc+3]
|
||||
if c3 < 0x30 || 0x3a <= c3 {
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
size = 4
|
||||
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
|
||||
if r < 39420 {
|
||||
i, j := 0, len(gb18030)
|
||||
for i < j {
|
||||
h := i + (j-i)/2
|
||||
if r >= rune(gb18030[h][0]) {
|
||||
i = h + 1
|
||||
} else {
|
||||
j = h
|
||||
}
|
||||
}
|
||||
dec := &gb18030[i-1]
|
||||
r += rune(dec[1]) - rune(dec[0])
|
||||
goto write
|
||||
}
|
||||
r -= 189000
|
||||
if 0 <= r && r < 0x100000 {
|
||||
r += 0x10000
|
||||
} else {
|
||||
r, size = utf8.RuneError, 1
|
||||
}
|
||||
goto write
|
||||
default:
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
r, size = utf8.RuneError, 1
|
||||
}
|
||||
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type gbkEncoder struct {
|
||||
transform.NopResetter
|
||||
gb18030 bool
|
||||
}
|
||||
|
||||
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, r2, size := rune(0), rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
if r == '€' {
|
||||
r = 0x80
|
||||
goto write1
|
||||
}
|
||||
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
|
||||
if e.gb18030 {
|
||||
if r < 0x10000 {
|
||||
i, j := 0, len(gb18030)
|
||||
for i < j {
|
||||
h := i + (j-i)/2
|
||||
if r >= rune(gb18030[h][1]) {
|
||||
i = h + 1
|
||||
} else {
|
||||
j = h
|
||||
}
|
||||
}
|
||||
dec := &gb18030[i-1]
|
||||
r += rune(dec[0]) - rune(dec[1])
|
||||
goto write4
|
||||
} else if r < 0x110000 {
|
||||
r += 189000 - 0x10000
|
||||
goto write4
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write1:
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r2 >> 8)
|
||||
dst[nDst+1] = uint8(r2)
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
write4:
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+3] = uint8(r%10 + 0x30)
|
||||
r /= 10
|
||||
dst[nDst+2] = uint8(r%126 + 0x81)
|
||||
r /= 126
|
||||
dst[nDst+1] = uint8(r%10 + 0x30)
|
||||
r /= 10
|
||||
dst[nDst+0] = uint8(r + 0x81)
|
||||
nDst += 4
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 5 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,245 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// HZGB2312 is the HZ-GB2312 encoding.
|
||||
var HZGB2312 encoding.Encoding = &hzGB2312
|
||||
|
||||
var hzGB2312 = internal.Encoding{
|
||||
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
|
||||
"HZ-GB2312",
|
||||
identifier.HZGB2312,
|
||||
}
|
||||
|
||||
func hzGB2312NewDecoder() transform.Transformer {
|
||||
return new(hzGB2312Decoder)
|
||||
}
|
||||
|
||||
func hzGB2312NewEncoder() transform.Transformer {
|
||||
return new(hzGB2312Encoder)
|
||||
}
|
||||
|
||||
const (
|
||||
asciiState = iota
|
||||
gbState
|
||||
)
|
||||
|
||||
type hzGB2312Decoder int
|
||||
|
||||
func (d *hzGB2312Decoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
c0 := src[nSrc]
|
||||
if c0 >= utf8.RuneSelf {
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
|
||||
if c0 == '~' {
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r = utf8.RuneError
|
||||
goto write
|
||||
}
|
||||
size = 2
|
||||
switch src[nSrc+1] {
|
||||
case '{':
|
||||
*d = gbState
|
||||
continue
|
||||
case '}':
|
||||
*d = asciiState
|
||||
continue
|
||||
case '~':
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
dst[nDst] = '~'
|
||||
nDst++
|
||||
continue
|
||||
case '\n':
|
||||
continue
|
||||
default:
|
||||
r = utf8.RuneError
|
||||
goto write
|
||||
}
|
||||
}
|
||||
|
||||
if *d == asciiState {
|
||||
r, size = rune(c0), 1
|
||||
} else {
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
size = 2
|
||||
c1 := src[nSrc+1]
|
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||
// error
|
||||
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
|
||||
r = rune(decode[i])
|
||||
if r != 0 {
|
||||
goto write
|
||||
}
|
||||
}
|
||||
if c1 > utf8.RuneSelf {
|
||||
// Be consistent and always treat non-ASCII as a single error.
|
||||
size = 1
|
||||
}
|
||||
r = utf8.RuneError
|
||||
}
|
||||
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type hzGB2312Encoder int
|
||||
|
||||
func (d *hzGB2312Encoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
if r == '~' {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '~'
|
||||
nDst += 2
|
||||
continue
|
||||
} else if *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '}'
|
||||
nDst += 2
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst += 1
|
||||
continue
|
||||
|
||||
}
|
||||
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
}
|
||||
|
||||
terminateInASCIIState:
|
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '}'
|
||||
nDst += 2
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
|
||||
writeGB:
|
||||
c0 := uint8(r>>8) - 0x80
|
||||
c1 := uint8(r) - 0x80
|
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||
goto terminateInASCIIState
|
||||
}
|
||||
if *e == asciiState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = gbState
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '{'
|
||||
nDst += 2
|
||||
} else if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = c0
|
||||
dst[nDst+1] = c1
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
// TODO: should one always terminate in ASCII state to make it safe to
|
||||
// concatenate two HZ-GB2312-encoded strings?
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n")
|
||||
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n")
|
||||
|
||||
printGB18030()
|
||||
printGBK()
|
||||
}
|
||||
|
||||
func printGB18030() {
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
|
||||
fmt.Printf("var gb18030 = [...][2]uint16{\n")
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint32(0), uint32(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0x10000 && y < 0x10000 {
|
||||
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
func printGBK() {
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
reverse := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 126*190 <= x {
|
||||
log.Fatalf("GBK code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y] == 0 {
|
||||
c0, c1 := x/190, x%190
|
||||
if c1 >= 0x3f {
|
||||
c1++
|
||||
}
|
||||
reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
@ -1,199 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package traditionalchinese
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{Big5}
|
||||
|
||||
// Big5 is the Big5 encoding, also known as Code Page 950.
|
||||
var Big5 encoding.Encoding = &big5
|
||||
|
||||
var big5 = internal.Encoding{
|
||||
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
|
||||
"Big5",
|
||||
identifier.Big5,
|
||||
}
|
||||
|
||||
type big5Decoder struct{ transform.NopResetter }
|
||||
|
||||
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size, s := rune(0), 0, ""
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0x81 <= c0 && c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case 0x40 <= c1 && c1 < 0x7f:
|
||||
c1 -= 0x40
|
||||
case 0xa1 <= c1 && c1 < 0xff:
|
||||
c1 -= 0x62
|
||||
case c1 < 0x40:
|
||||
r, size = utf8.RuneError, 1
|
||||
goto write
|
||||
default:
|
||||
r, size = utf8.RuneError, 2
|
||||
goto write
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
|
||||
if 1133 <= i && i < 1167 {
|
||||
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
|
||||
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
|
||||
switch i {
|
||||
case 1133:
|
||||
s = "\u00CA\u0304"
|
||||
goto writeStr
|
||||
case 1135:
|
||||
s = "\u00CA\u030C"
|
||||
goto writeStr
|
||||
case 1164:
|
||||
s = "\u00EA\u0304"
|
||||
goto writeStr
|
||||
case 1166:
|
||||
s = "\u00EA\u030C"
|
||||
goto writeStr
|
||||
}
|
||||
}
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
r, size = utf8.RuneError, 1
|
||||
}
|
||||
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
continue loop
|
||||
|
||||
writeStr:
|
||||
if nDst+len(s) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += copy(dst[nDst:], s)
|
||||
continue loop
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type big5Encoder struct{ transform.NopResetter }
|
||||
|
||||
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if r >= utf8.RuneSelf {
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode7Low <= r && r < encode7High:
|
||||
if r = rune(encode7[r-encode7Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 8 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,140 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
|
||||
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")
|
||||
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint32{}
|
||||
reverse := [65536 * 4]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint32(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 126*157 <= x {
|
||||
log.Fatalf("Big5 code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
|
||||
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
|
||||
// "The index pointer for code point in index is the first pointer
|
||||
// corresponding to code point in index", which would normally mean
|
||||
// that the code below should be guarded by "if reverse[y] == 0", but
|
||||
// last instead of first seems to match the behavior of
|
||||
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
|
||||
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
|
||||
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
|
||||
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
|
||||
c0, c1 := x/157, x%157
|
||||
if c1 < 0x3f {
|
||||
c1 += 0x40
|
||||
} else {
|
||||
c1 += 0x62
|
||||
}
|
||||
reverse[y] = (0x81+c0)<<8 | c1
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
|
||||
fmt.Printf("var decode = [...]uint32{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%08X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
@ -1161,12 +1161,6 @@
|
|||
"revision": "d0b11bdaac8adb652bff00e49bcacf992835621a",
|
||||
"revisionTime": "2019-02-15T13:44:14Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "tqqo7DEeFCclb58XbN44WwdpWww=",
|
||||
"path": "golang.org/x/text/encoding",
|
||||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "HgcUFTOQF5jOYtTIj5obR3GVN9A=",
|
||||
"path": "golang.org/x/text/encoding/charmap",
|
||||
|
|
@ -1191,30 +1185,6 @@
|
|||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "DhdZROnJq+cEcQ/sHY7GEq5wQ8U=",
|
||||
"path": "golang.org/x/text/encoding/japanese",
|
||||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "qHQ79q9peY8ZkCMC8kJAb52BAWg=",
|
||||
"path": "golang.org/x/text/encoding/korean",
|
||||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "55UdScb+EMOCPr7OW0hCwDsVxpg=",
|
||||
"path": "golang.org/x/text/encoding/simplifiedchinese",
|
||||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "9EZF1SHTpjVmaT9sARitvGKUXOY=",
|
||||
"path": "golang.org/x/text/encoding/traditionalchinese",
|
||||
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
|
||||
"revisionTime": "2018-12-15T17:52:45Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "bAJTZJ3IGJdNmN/PSlRMRxWtxec=",
|
||||
"path": "golang.org/x/text/encoding/unicode",
|
||||
|
|
|
|||
Loading…
Reference in New Issue