Skip to content

Commit

Permalink
loaddata, lightning: support ascii, latin1, utf8 charset of source file (#42699)
Browse files Browse the repository at this point in the history

ref #40499
  • Loading branch information
lance6716 authored Apr 3, 2023
1 parent edd4f26 commit 71fa840
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 5 deletions.
12 changes: 11 additions & 1 deletion br/pkg/lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,8 @@ const (
UTF8MB4
GB18030
GBK
Latin1
ASCII
)

// String return the string value of charset
Expand All @@ -779,6 +781,10 @@ func (c Charset) String() string {
return "gb18030"
case GBK:
return "gbk"
case Latin1:
return "latin1"
case ASCII:
return "ascii"
default:
return "unknown_charset"
}
Expand All @@ -789,12 +795,16 @@ func ParseCharset(dataCharacterSet string) (Charset, error) {
switch strings.ToLower(dataCharacterSet) {
case "", "binary":
return Binary, nil
case "utf8mb4":
case "utf8", "utf8mb4":
return UTF8MB4, nil
case "gb18030":
return GB18030, nil
case "gbk":
return GBK, nil
case "latin1":
return Latin1, nil
case "ascii":
return ASCII, nil
default:
return Binary, errors.Errorf("found unsupported data-character-set: %s", dataCharacterSet)
}
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/mydump/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ go_library(
"@com_github_xitongsys_parquet_go//source",
"@org_golang_x_exp//slices",
"@org_golang_x_text//encoding",
"@org_golang_x_text//encoding/charmap",
"@org_golang_x_text//encoding/simplifiedchinese",
"@org_uber_go_zap//:zap",
"@org_uber_go_zap//zapcore",
Expand Down
19 changes: 16 additions & 3 deletions br/pkg/lightning/mydump/charset_convertor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/pingcap/errors"
"github.com/pingcap/tidb/br/pkg/lightning/config"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
)

Expand Down Expand Up @@ -60,28 +61,38 @@ func NewCharsetConvertor(dataCharacterSet, dataInvalidCharReplace string) (*Char

// initDecoder installs the decoder that matches cc.sourceCharacterSet.
//
// Binary, UTF8MB4 and ASCII sources get no decoder: cc.decoder is left
// untouched and nil is returned. GB18030, GBK and Latin1 each get a fresh
// decoder from golang.org/x/text. Any other charset yields an error.
func (cc *CharsetConvertor) initDecoder() error {
	switch cc.sourceCharacterSet {
	case config.Binary, config.UTF8MB4, config.ASCII:
		// No decoder is set up for these charsets.
		return nil
	case config.GB18030:
		cc.decoder = simplifiedchinese.GB18030.NewDecoder()
		return nil
	case config.GBK:
		cc.decoder = simplifiedchinese.GBK.NewDecoder()
		return nil
	case config.Latin1:
		// use Windows1252 (not ISO 8859-1) to decode Latin1
		// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
		cc.decoder = charmap.Windows1252.NewDecoder()
		return nil
	}
	return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet)
}

// initEncoder installs the encoder that matches cc.sourceCharacterSet.
//
// Binary, UTF8MB4 and ASCII sources get no encoder: cc.encoder is left
// untouched and nil is returned. GB18030, GBK and Latin1 each get a fresh
// encoder from golang.org/x/text. Any other charset yields an error.
func (cc *CharsetConvertor) initEncoder() error {
	switch cc.sourceCharacterSet {
	case config.Binary, config.UTF8MB4, config.ASCII:
		// No encoder is set up for these charsets.
		return nil
	case config.GB18030:
		cc.encoder = simplifiedchinese.GB18030.NewEncoder()
		return nil
	case config.GBK:
		cc.encoder = simplifiedchinese.GBK.NewEncoder()
		return nil
	case config.Latin1:
		// use Windows1252 (not ISO 8859-1) to encode Latin1
		// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
		cc.encoder = charmap.Windows1252.NewEncoder()
		return nil
	}
	return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet)
}
Expand All @@ -105,7 +116,9 @@ func (cc *CharsetConvertor) Decode(src string) (string, error) {
func (cc *CharsetConvertor) precheck(src string) bool {
// No need to convert the charset encoding, just return the original data.
if len(src) == 0 || cc == nil ||
cc.sourceCharacterSet == config.Binary || cc.sourceCharacterSet == config.UTF8MB4 ||
cc.sourceCharacterSet == config.Binary ||
cc.sourceCharacterSet == config.UTF8MB4 ||
cc.sourceCharacterSet == config.ASCII ||
cc.decoder == nil || cc.encoder == nil {
return false
}
Expand Down
132 changes: 131 additions & 1 deletion executor/loadremotetest/one_csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,138 @@ func (s *mockGCSSuite) TestGBK() {
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))

s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "utf8mb4.tsv",
},
Content: []byte("1\t一丁丂七丄丅丆万丈三上下丌不与丏\n" +
"2\t丐丑丒专且丕世丗丘丙业丛东丝丞丢"),
})

s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8mb4.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.gbk;").Check(testkit.Rows(
"1 一丁丂七丄丅丆万丈三上下丌不与丏",
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))

s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "emoji.tsv",
},
Content: []byte("1\t一丁丂七😀😁😂😃\n" +
"2\t丐丑丒专😄😅😆😇"),
})

s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
err := s.tk.ExecToErr(sql)
checkClientErrorMessage(s.T(), err, `ERROR 1366 (HY000): Incorrect string value '\xF0\x9F\x98\x80' for column 'j'`)

sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s'
IGNORE INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
s.tk.MustExec(sql)
require.Equal(s.T(), "Records: 2 Deleted: 0 Skipped: 0 Warnings: 2", s.tk.Session().GetSessionVars().StmtCtx.GetMessage())
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.gbk;").Check(testkit.Rows(
"D2BBB6A18140C6DF3F3F3F3F",
"D8A4B3F38145D7A83F3F3F3F",
))

sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET unknown`, gcsEndpoint)
err := s.tk.ExecToErr(sql)
err = s.tk.ExecToErr(sql)
require.ErrorContains(s.T(), err, "Unknown character set: 'unknown'")
}

// TestOtherCharset checks LOAD DATA ... CHARACTER SET for utf8, latin1, ascii
// and binary source files. Each file is uploaded to the "test-load" bucket of
// the suite's fake storage server, loaded into one or more tables, and the
// stored rows are compared with the expected text (or hex dump).
func (s *mockGCSSuite) TestOtherCharset() {
// Recreate the database so the test starts from a known state.
s.tk.MustExec("DROP DATABASE IF EXISTS load_charset;")
s.tk.MustExec("CREATE DATABASE load_charset;")
s.tk.MustExec(`CREATE TABLE load_charset.utf8 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8;`)
s.tk.MustExec(`CREATE TABLE load_charset.utf8mb4 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8mb4;`)

// utf8 case: a two-row, tab-separated file with non-ASCII text.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "utf8.tsv",
},
Content: []byte("1\tကခဂဃ\n2\tငစဆဇ"),
})

// Load the utf8 file into the utf8 table ...
sql := fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s'
INTO TABLE load_charset.utf8 CHARACTER SET utf8`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8;").Check(testkit.Rows(
"1 ကခဂဃ",
"2 ငစဆဇ",
))
// ... and the same file into the utf8mb4 table.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET utf8`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 ကခဂဃ",
"2 ငစဆဇ",
))

// latin1 case: raw single-byte content. Bytes 0x91-0x94 are expected to load
// as the curly quotes shown in the comment below, and 0xa1-0xa4 as ¡¢£¤.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "latin1.tsv",
},
// "1\t‘’“”\n2\t¡¢£¤"
Content: []byte{0x31, 0x09, 0x91, 0x92, 0x93, 0x94, 0x0a, 0x32, 0x09, 0xa1, 0xa2, 0xa3, 0xa4},
})
s.tk.MustExec(`CREATE TABLE load_charset.latin1 (
i INT, j VARCHAR(255)
) CHARACTER SET latin1;`)
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s'
INTO TABLE load_charset.latin1 CHARACTER SET latin1`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.latin1;").Check(testkit.Rows(
"1 ‘’“”",
"2 ¡¢£¤",
))

// The utf8mb4 table still holds the rows from the utf8 load above; clear it
// before loading the latin1 file into it.
s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET latin1`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 ‘’“”",
"2 ¡¢£¤",
))

// ascii / binary case: eight control bytes (0x00-0x07) with no tab or
// newline, so the file is expected to load as a single one-column row.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "ascii.tsv",
},
Content: []byte{0, 1, 2, 3, 4, 5, 6, 7},
})
s.tk.MustExec(`CREATE TABLE load_charset.ascii (
j VARCHAR(255)
) CHARACTER SET ascii;`)
s.tk.MustExec(`CREATE TABLE load_charset.binary (
j VARCHAR(255)
) CHARACTER SET binary;`)
// The bytes must be stored unchanged; HEX() is used so the control
// characters can be compared as text.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s'
INTO TABLE load_charset.ascii CHARACTER SET ascii`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.ascii;").Check(testkit.Rows(
"0001020304050607",
))
// The same file loaded as binary into the binary table must give the same bytes.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s'
INTO TABLE load_charset.binary CHARACTER SET binary`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.binary;").Check(testkit.Rows(
"0001020304050607",
))
}

0 comments on commit 71fa840

Please sign in to comment.