From 71fa84089e9612fcd85ccda62173d6bcd9273480 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 3 Apr 2023 11:12:56 +0800 Subject: [PATCH] loaddata, lightning: support ascii, latin1, utf8 charset of source file (#42699) ref pingcap/tidb#40499 --- br/pkg/lightning/config/config.go | 12 +- br/pkg/lightning/mydump/BUILD.bazel | 1 + br/pkg/lightning/mydump/charset_convertor.go | 19 ++- executor/loadremotetest/one_csv_test.go | 132 ++++++++++++++++++- 4 files changed, 159 insertions(+), 5 deletions(-) diff --git a/br/pkg/lightning/config/config.go b/br/pkg/lightning/config/config.go index 4e85b4520fa87..44d6360ed8ba1 100644 --- a/br/pkg/lightning/config/config.go +++ b/br/pkg/lightning/config/config.go @@ -766,6 +766,8 @@ const ( UTF8MB4 GB18030 GBK + Latin1 + ASCII ) // String return the string value of charset @@ -779,6 +781,10 @@ func (c Charset) String() string { return "gb18030" case GBK: return "gbk" + case Latin1: + return "latin1" + case ASCII: + return "ascii" default: return "unknown_charset" } @@ -789,12 +795,16 @@ func ParseCharset(dataCharacterSet string) (Charset, error) { switch strings.ToLower(dataCharacterSet) { case "", "binary": return Binary, nil - case "utf8mb4": + case "utf8", "utf8mb4": return UTF8MB4, nil case "gb18030": return GB18030, nil case "gbk": return GBK, nil + case "latin1": + return Latin1, nil + case "ascii": + return ASCII, nil default: return Binary, errors.Errorf("found unsupported data-character-set: %s", dataCharacterSet) } diff --git a/br/pkg/lightning/mydump/BUILD.bazel b/br/pkg/lightning/mydump/BUILD.bazel index 95d61b14465e8..e746ab8d9f18d 100644 --- a/br/pkg/lightning/mydump/BUILD.bazel +++ b/br/pkg/lightning/mydump/BUILD.bazel @@ -38,6 +38,7 @@ go_library( "@com_github_xitongsys_parquet_go//source", "@org_golang_x_exp//slices", "@org_golang_x_text//encoding", + "@org_golang_x_text//encoding/charmap", "@org_golang_x_text//encoding/simplifiedchinese", "@org_uber_go_zap//:zap", "@org_uber_go_zap//zapcore", diff --git a/br/pkg/lightning/mydump/charset_convertor.go b/br/pkg/lightning/mydump/charset_convertor.go index fa5f1ee5ef540..0255725752659 100644 --- a/br/pkg/lightning/mydump/charset_convertor.go +++ b/br/pkg/lightning/mydump/charset_convertor.go @@ -21,6 +21,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/tidb/br/pkg/lightning/config" "golang.org/x/text/encoding" + "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/simplifiedchinese" ) @@ -60,7 +61,7 @@ func NewCharsetConvertor(dataCharacterSet, dataInvalidCharReplace string) (*Char func (cc *CharsetConvertor) initDecoder() error { switch cc.sourceCharacterSet { - case config.Binary, config.UTF8MB4: + case config.Binary, config.UTF8MB4, config.ASCII: return nil case config.GB18030: cc.decoder = simplifiedchinese.GB18030.NewDecoder() @@ -68,13 +69,18 @@ func (cc *CharsetConvertor) initDecoder() error { case config.GBK: cc.decoder = simplifiedchinese.GBK.NewDecoder() return nil + case config.Latin1: + // use Windows1252 (not ISO 8859-1) to decode Latin1 + // https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html + cc.decoder = charmap.Windows1252.NewDecoder() + return nil } return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet) } func (cc *CharsetConvertor) initEncoder() error { switch cc.sourceCharacterSet { - case config.Binary, config.UTF8MB4: + case config.Binary, config.UTF8MB4, config.ASCII: return nil case config.GB18030: cc.encoder = simplifiedchinese.GB18030.NewEncoder() @@ -82,6 +88,11 @@ func (cc *CharsetConvertor) initEncoder() error { case config.GBK: cc.encoder = simplifiedchinese.GBK.NewEncoder() return nil + case config.Latin1: + // use Windows1252 (not ISO 8859-1) to encode Latin1 + // https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html + cc.encoder = charmap.Windows1252.NewEncoder() + return nil } return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet) } @@ -105,7 +116,9 @@ func (cc *CharsetConvertor) Decode(src string) (string, error) { func (cc *CharsetConvertor) precheck(src string) bool { // No need to convert the charset encoding, just return the original data. if len(src) == 0 || cc == nil || - cc.sourceCharacterSet == config.Binary || cc.sourceCharacterSet == config.UTF8MB4 || + cc.sourceCharacterSet == config.Binary || + cc.sourceCharacterSet == config.UTF8MB4 || + cc.sourceCharacterSet == config.ASCII || cc.decoder == nil || cc.encoder == nil { return false } diff --git a/executor/loadremotetest/one_csv_test.go b/executor/loadremotetest/one_csv_test.go index f5e77d4ca3e56..4979f89f6072e 100644 --- a/executor/loadremotetest/one_csv_test.go +++ b/executor/loadremotetest/one_csv_test.go @@ -350,8 +350,138 @@ func (s *mockGCSSuite) TestGBK() { "2 丐丑丒专且丕世丗丘丙业丛东丝丞丢", )) + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "utf8mb4.tsv", + }, + Content: []byte("1\t一丁丂七丄丅丆万丈三上下丌不与丏\n" + + "2\t丐丑丒专且丕世丗丘丙业丛东丝丞丢"), + }) + + s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;") + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8mb4.tsv?endpoint=%s' + INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.gbk;").Check(testkit.Rows( + "1 一丁丂七丄丅丆万丈三上下丌不与丏", + "2 丐丑丒专且丕世丗丘丙业丛东丝丞丢", + )) + + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "emoji.tsv", + }, + Content: []byte("1\t一丁丂七😀😁😂😃\n" + + "2\t丐丑丒专😄😅😆😇"), + }) + + s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;") + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s' + INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint) + err := s.tk.ExecToErr(sql) + checkClientErrorMessage(s.T(), err, `ERROR 1366 (HY000): Incorrect string value '\xF0\x9F\x98\x80' for column 'j'`) + + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s' + IGNORE INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint) + s.tk.MustExec(sql) + require.Equal(s.T(), "Records: 2 Deleted: 0 Skipped: 0 Warnings: 2", s.tk.Session().GetSessionVars().StmtCtx.GetMessage()) + s.tk.MustQuery("SELECT HEX(j) FROM load_charset.gbk;").Check(testkit.Rows( + "D2BBB6A18140C6DF3F3F3F3F", + "D8A4B3F38145D7A83F3F3F3F", + )) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s' INTO TABLE load_charset.utf8mb4 CHARACTER SET unknown`, gcsEndpoint) - err := s.tk.ExecToErr(sql) + err = s.tk.ExecToErr(sql) require.ErrorContains(s.T(), err, "Unknown character set: 'unknown'") } + +func (s *mockGCSSuite) TestOtherCharset() { + s.tk.MustExec("DROP DATABASE IF EXISTS load_charset;") + s.tk.MustExec("CREATE DATABASE load_charset;") + s.tk.MustExec(`CREATE TABLE load_charset.utf8 ( + i INT, j VARCHAR(255) + ) CHARACTER SET utf8;`) + s.tk.MustExec(`CREATE TABLE load_charset.utf8mb4 ( + i INT, j VARCHAR(255) + ) CHARACTER SET utf8mb4;`) + + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "utf8.tsv", + }, + Content: []byte("1\tကခဂဃ\n2\tငစဆဇ"), + }) + + sql := fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s' + INTO TABLE load_charset.utf8 CHARACTER SET utf8`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.utf8;").Check(testkit.Rows( + "1 ကခဂဃ", + "2 ငစဆဇ", + )) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s' + INTO TABLE load_charset.utf8mb4 CHARACTER SET utf8`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows( + "1 ကခဂဃ", + "2 ငစဆဇ", + )) + + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "latin1.tsv", + }, + // "1\t‘’“”\n2\t¡¢£¤" + Content: []byte{0x31, 0x09, 0x91, 0x92, 0x93, 0x94, 0x0a, 0x32, 0x09, 0xa1, 0xa2, 0xa3, 0xa4}, + }) + s.tk.MustExec(`CREATE TABLE load_charset.latin1 ( + i INT, j VARCHAR(255) + ) CHARACTER SET latin1;`) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s' + INTO TABLE load_charset.latin1 CHARACTER SET latin1`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.latin1;").Check(testkit.Rows( + "1 ‘’“”", + "2 ¡¢£¤", + )) + + s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;") + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s' + INTO TABLE load_charset.utf8mb4 CHARACTER SET latin1`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows( + "1 ‘’“”", + "2 ¡¢£¤", + )) + + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "ascii.tsv", + }, + Content: []byte{0, 1, 2, 3, 4, 5, 6, 7}, + }) + s.tk.MustExec(`CREATE TABLE load_charset.ascii ( + j VARCHAR(255) + ) CHARACTER SET ascii;`) + s.tk.MustExec(`CREATE TABLE load_charset.binary ( + j VARCHAR(255) + ) CHARACTER SET binary;`) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s' + INTO TABLE load_charset.ascii CHARACTER SET ascii`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT HEX(j) FROM load_charset.ascii;").Check(testkit.Rows( + "0001020304050607", + )) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s' + INTO TABLE load_charset.binary CHARACTER SET binary`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT HEX(j) FROM load_charset.binary;").Check(testkit.Rows( + "0001020304050607", + )) +}