From e9ed0c8f8f7fd82109ff0dcdb342a87471279b8d Mon Sep 17 00:00:00 2001 From: dsdashun Date: Thu, 9 Feb 2023 00:03:58 +0800 Subject: [PATCH] lightning: introduce param to skip CSV header parsing (#41128) close pingcap/tidb#40839 --- br/pkg/lightning/config/config.go | 32 +++---- br/pkg/lightning/mydump/csv_parser.go | 3 + br/pkg/lightning/mydump/csv_parser_test.go | 84 +++++++++++++++++-- br/pkg/lightning/mydump/region.go | 4 +- br/pkg/lightning/mydump/region_test.go | 75 +++++++++-------- .../lightning/restore/table_restore_test.go | 32 +++---- .../data/mytest.testtbl-schema.sql | 5 ++ .../data/mytest.testtbl.csv | 6 ++ .../err_config.toml | 9 ++ .../err_default_config.toml | 8 ++ .../normal_config.toml | 9 ++ .../lightning_config_skip_csv_header/run.sh | 59 +++++++++++++ 12 files changed, 255 insertions(+), 71 deletions(-) create mode 100644 br/tests/lightning_config_skip_csv_header/data/mytest.testtbl-schema.sql create mode 100644 br/tests/lightning_config_skip_csv_header/data/mytest.testtbl.csv create mode 100644 br/tests/lightning_config_skip_csv_header/err_config.toml create mode 100644 br/tests/lightning_config_skip_csv_header/err_default_config.toml create mode 100644 br/tests/lightning_config_skip_csv_header/normal_config.toml create mode 100755 br/tests/lightning_config_skip_csv_header/run.sh diff --git a/br/pkg/lightning/config/config.go b/br/pkg/lightning/config/config.go index d14f12066c0f4..7038bd4d9255d 100644 --- a/br/pkg/lightning/config/config.go +++ b/br/pkg/lightning/config/config.go @@ -479,14 +479,15 @@ type PostRestore struct { type CSVConfig struct { // Separator, Delimiter and Terminator should all be in utf8mb4 encoding. - Separator string `toml:"separator" json:"separator"` - Delimiter string `toml:"delimiter" json:"delimiter"` - Terminator string `toml:"terminator" json:"terminator"` - Null string `toml:"null" json:"null"` - Header bool `toml:"header" json:"header"` - TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"` - NotNull bool `toml:"not-null" json:"not-null"` - BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"` + Separator string `toml:"separator" json:"separator"` + Delimiter string `toml:"delimiter" json:"delimiter"` + Terminator string `toml:"terminator" json:"terminator"` + Null string `toml:"null" json:"null"` + Header bool `toml:"header" json:"header"` + HeaderSchemaMatch bool `toml:"header-schema-match" json:"header-schema-match"` + TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"` + NotNull bool `toml:"not-null" json:"not-null"` + BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"` // hide these options for lightning configuration file, they can only be used by LOAD DATA // https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling StartingBy string `toml:"-" json:"-"` @@ -743,13 +744,14 @@ func NewConfig() *Config { Mydumper: MydumperRuntime{ ReadBlockSize: ReadBlockSize, CSV: CSVConfig{ - Separator: ",", - Delimiter: `"`, - Header: true, - NotNull: false, - Null: `\N`, - BackslashEscape: true, - TrimLastSep: false, + Separator: ",", + Delimiter: `"`, + Header: true, + HeaderSchemaMatch: true, + NotNull: false, + Null: `\N`, + BackslashEscape: true, + TrimLastSep: false, }, StrictFormat: false, MaxRegionSize: MaxRegionSize, diff --git a/br/pkg/lightning/mydump/csv_parser.go b/br/pkg/lightning/mydump/csv_parser.go index 26fb65a493183..d9cd033d70861 100644 --- a/br/pkg/lightning/mydump/csv_parser.go +++ b/br/pkg/lightning/mydump/csv_parser.go @@ -607,6 +607,9 @@ func (parser *CSVParser) ReadColumns() error { if err != nil { return errors.Trace(err) } + if !parser.cfg.HeaderSchemaMatch { + return nil + } parser.columns = make([]string, 0, len(columns)) for _, colName := range columns { colName, _, err = parser.unescapeString(colName) diff --git a/br/pkg/lightning/mydump/csv_parser_test.go b/br/pkg/lightning/mydump/csv_parser_test.go index adb057679b3a4..8980ce221fe75 100644 --- a/br/pkg/lightning/mydump/csv_parser_test.go +++ b/br/pkg/lightning/mydump/csv_parser_test.go @@ -481,12 +481,13 @@ func TestSyntaxErrorCSV(t *testing.T) { func TestTSV(t *testing.T) { cfg := config.CSVConfig{ - Separator: "\t", - Delimiter: "", - BackslashEscape: false, - NotNull: false, - Null: "", - Header: true, + Separator: "\t", + Delimiter: "", + BackslashEscape: false, + NotNull: false, + Null: "", + Header: true, + HeaderSchemaMatch: true, } parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(`a b c d e f @@ -577,6 +578,7 @@ func TestCsvWithWhiteSpaceLine(t *testing.T) { require.Nil(t, parser.Close()) cfg.Header = true + cfg.HeaderSchemaMatch = true data = " \r\na,b,c\r\n0,,abc\r\n" parser, err = mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), ioWorkers, true, nil) require.NoError(t, err) @@ -609,6 +611,7 @@ func TestEmpty(t *testing.T) { // Try again with headers. cfg.Header = true + cfg.HeaderSchemaMatch = true parser, err = mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), ioWorkers, true, nil) require.NoError(t, err) @@ -1292,3 +1295,72 @@ func BenchmarkReadRowUsingEncodingCSV(b *testing.B) { } require.Equal(b, b.N, rowsCount) } + +func TestHeaderSchemaMatch(t *testing.T) { + cfg := config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: `"`, + }, + } + + inputData := `id,val1,val2,val3 +1,111,aaa,1.0 +2,222,bbb,2.0 +3,333,ccc,3.0 +4,444,ddd,4.0` + + parsedDataPart := [][]types.Datum{ + {types.NewStringDatum("1"), types.NewStringDatum("111"), types.NewStringDatum("aaa"), types.NewStringDatum("1.0")}, + {types.NewStringDatum("2"), types.NewStringDatum("222"), types.NewStringDatum("bbb"), types.NewStringDatum("2.0")}, + {types.NewStringDatum("3"), types.NewStringDatum("333"), types.NewStringDatum("ccc"), types.NewStringDatum("3.0")}, + {types.NewStringDatum("4"), types.NewStringDatum("444"), types.NewStringDatum("ddd"), types.NewStringDatum("4.0")}, + } + + type testCase struct { + Header bool + HeaderSchemaMatch bool + ExpectedData [][]types.Datum + ExpectedColumns []string + } + + for _, tc := range []testCase{ + { + Header: true, + HeaderSchemaMatch: true, + ExpectedData: parsedDataPart, + ExpectedColumns: []string{"id", "val1", "val2", "val3"}, + }, + { + Header: true, + HeaderSchemaMatch: false, + ExpectedData: parsedDataPart, + ExpectedColumns: nil, + }, + { + Header: false, + HeaderSchemaMatch: true, + ExpectedData: append([][]types.Datum{ + {types.NewStringDatum("id"), types.NewStringDatum("val1"), types.NewStringDatum("val2"), types.NewStringDatum("val3")}, + }, parsedDataPart...), + ExpectedColumns: nil, + }, + } { + comment := fmt.Sprintf("header = %v, header-schema-match = %v", tc.Header, tc.HeaderSchemaMatch) + cfg.CSV.Header = tc.Header + cfg.CSV.HeaderSchemaMatch = tc.HeaderSchemaMatch + charsetConvertor, err := mydump.NewCharsetConvertor(cfg.DataCharacterSet, cfg.DataInvalidCharReplace) + assert.NoError(t, err) + parser, err := mydump.NewCSVParser(context.Background(), &cfg.CSV, mydump.NewStringReader(inputData), int64(config.ReadBlockSize), ioWorkers, tc.Header, charsetConvertor) + assert.NoError(t, err) + for i, row := range tc.ExpectedData { + comment := fmt.Sprintf("row = %d, header = %v, header-schema-match = %v", i+1, tc.Header, tc.HeaderSchemaMatch) + e := parser.ReadRow() + assert.NoErrorf(t, e, "row = %d, error = %s", i+1, errors.ErrorStack(e)) + assert.Equal(t, int64(i)+1, parser.LastRow().RowID, comment) + assert.Equal(t, row, parser.LastRow().Row, comment) + } + assert.ErrorIsf(t, errors.Cause(parser.ReadRow()), io.EOF, comment) + assert.Equal(t, tc.ExpectedColumns, parser.Columns(), comment) + } +} diff --git a/br/pkg/lightning/mydump/region.go b/br/pkg/lightning/mydump/region.go index aba71f666be2e..d57a2c4742c73 100644 --- a/br/pkg/lightning/mydump/region.go +++ b/br/pkg/lightning/mydump/region.go @@ -404,7 +404,9 @@ func SplitLargeFile( if err = parser.ReadColumns(); err != nil { return 0, nil, nil, err } - columns = parser.Columns() + if cfg.Mydumper.CSV.HeaderSchemaMatch { + columns = parser.Columns() + } startOffset, _ = parser.Pos() endOffset = startOffset + maxRegionSize if endOffset > dataFile.FileMeta.FileSize { diff --git a/br/pkg/lightning/mydump/region_test.go b/br/pkg/lightning/mydump/region_test.go index 5aa2b3a85b752..362ff8603c7f9 100644 --- a/br/pkg/lightning/mydump/region_test.go +++ b/br/pkg/lightning/mydump/region_test.go @@ -174,13 +174,14 @@ func TestMakeSourceFileRegion(t *testing.T) { ReadBlockSize: config.ReadBlockSize, MaxRegionSize: 1, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: "", - Header: true, - TrimLastSep: false, - NotNull: false, - Null: "NULL", - BackslashEscape: true, + Separator: ",", + Delimiter: "", + Header: true, + HeaderSchemaMatch: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -230,13 +231,14 @@ func TestCompressedMakeSourceFileRegion(t *testing.T) { ReadBlockSize: config.ReadBlockSize, MaxRegionSize: 1, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: "", - Header: true, - TrimLastSep: false, - NotNull: false, - Null: "NULL", - BackslashEscape: true, + Separator: ",", + Delimiter: "", + Header: true, + HeaderSchemaMatch: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -284,13 +286,14 @@ func TestSplitLargeFile(t *testing.T) { Mydumper: config.MydumperRuntime{ ReadBlockSize: config.ReadBlockSize, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: "", - Header: true, - TrimLastSep: false, - NotNull: false, - Null: "NULL", - BackslashEscape: true, + Separator: ",", + Delimiter: "", + Header: true, + HeaderSchemaMatch: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -342,13 +345,14 @@ func TestSplitLargeFileNoNewLineAtEOF(t *testing.T) { Mydumper: config.MydumperRuntime{ ReadBlockSize: config.ReadBlockSize, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: "", - Header: true, - TrimLastSep: false, - NotNull: false, - Null: "NULL", - BackslashEscape: true, + Separator: ",", + Delimiter: "", + Header: true, + HeaderSchemaMatch: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -447,13 +451,14 @@ func TestSplitLargeFileOnlyOneChunk(t *testing.T) { Mydumper: config.MydumperRuntime{ ReadBlockSize: config.ReadBlockSize, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: "", - Header: true, - TrimLastSep: false, - NotNull: false, - Null: "NULL", - BackslashEscape: true, + Separator: ",", + Delimiter: "", + Header: true, + HeaderSchemaMatch: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, }, StrictFormat: true, Filter: []string{"*.*"}, diff --git a/br/pkg/lightning/restore/table_restore_test.go b/br/pkg/lightning/restore/table_restore_test.go index 5cfaeabc804d9..0f6d87892e329 100644 --- a/br/pkg/lightning/restore/table_restore_test.go +++ b/br/pkg/lightning/restore/table_restore_test.go @@ -306,6 +306,7 @@ func (s *tableRestoreSuite) TestPopulateChunks() { // set csv header to true, this will cause check columns fail s.cfg.Mydumper.CSV.Header = true + s.cfg.Mydumper.CSV.HeaderSchemaMatch = true s.cfg.Mydumper.StrictFormat = true regionSize := s.cfg.Mydumper.MaxRegionSize s.cfg.Mydumper.MaxRegionSize = 5 @@ -455,6 +456,7 @@ func (s *tableRestoreSuite) TestPopulateChunksCSVHeader() { cfg.Mydumper.MaxRegionSize = 40 cfg.Mydumper.CSV.Header = true + cfg.Mydumper.CSV.HeaderSchemaMatch = true cfg.Mydumper.StrictFormat = true rc := &Controller{cfg: cfg, ioWorkers: worker.NewPool(context.Background(), 1, "io"), store: store} @@ -2135,13 +2137,14 @@ func (s *tableRestoreSuite) TestSchemaIsValid() { Mydumper: config.MydumperRuntime{ ReadBlockSize: config.ReadBlockSize, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - Header: ca.hasHeader, - NotNull: false, - Null: `\N`, - BackslashEscape: true, - TrimLastSep: false, + Separator: ",", + Delimiter: `"`, + Header: ca.hasHeader, + HeaderSchemaMatch: true, + NotNull: false, + Null: `\N`, + BackslashEscape: true, + TrimLastSep: false, }, IgnoreColumns: ca.ignoreColumns, }, @@ -2170,13 +2173,14 @@ func (s *tableRestoreSuite) TestGBKEncodedSchemaIsValid() { DataCharacterSet: "gb18030", DataInvalidCharReplace: string(utf8.RuneError), CSV: config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - Header: true, - NotNull: false, - Null: `\N`, - BackslashEscape: true, - TrimLastSep: false, + Separator: ",", + Delimiter: `"`, + Header: true, + HeaderSchemaMatch: true, + NotNull: false, + Null: `\N`, + BackslashEscape: true, + TrimLastSep: false, }, IgnoreColumns: nil, }, diff --git a/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl-schema.sql b/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl-schema.sql new file mode 100644 index 0000000000000..93582d5178139 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl-schema.sql @@ -0,0 +1,5 @@ +CREATE TABLE testtbl ( + id INTEGER PRIMARY KEY, + val1 VARCHAR(40) NOT NULL, + INDEX `idx_val1` (`val1`) +); diff --git a/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl.csv b/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl.csv new file mode 100644 index 0000000000000..5958ab0c80cb2 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/data/mytest.testtbl.csv @@ -0,0 +1,6 @@ +aaa,bbb +1,"aaa01" +2,"aaa01" +3,"aaa02" +4,"aaa02" +5,"aaa05" diff --git a/br/tests/lightning_config_skip_csv_header/err_config.toml b/br/tests/lightning_config_skip_csv_header/err_config.toml new file mode 100644 index 0000000000000..95493db0dff44 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/err_config.toml @@ -0,0 +1,9 @@ +[lightning] +check-requirements=true + +[mydumper.csv] +header = true +header-schema-match = true + +[tikv-importer] +duplicate-resolution = 'remove' diff --git a/br/tests/lightning_config_skip_csv_header/err_default_config.toml b/br/tests/lightning_config_skip_csv_header/err_default_config.toml new file mode 100644 index 0000000000000..a7b17c7276d92 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/err_default_config.toml @@ -0,0 +1,8 @@ +[lightning] +check-requirements=true + +[mydumper.csv] +header = true + +[tikv-importer] +duplicate-resolution = 'remove' diff --git a/br/tests/lightning_config_skip_csv_header/normal_config.toml b/br/tests/lightning_config_skip_csv_header/normal_config.toml new file mode 100644 index 0000000000000..190e635cfc4e9 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/normal_config.toml @@ -0,0 +1,9 @@ +[lightning] +check-requirements=true + +[mydumper.csv] +header = true +header-schema-match = false + +[tikv-importer] +duplicate-resolution = 'remove' diff --git a/br/tests/lightning_config_skip_csv_header/run.sh b/br/tests/lightning_config_skip_csv_header/run.sh new file mode 100755 index 0000000000000..80ad201e2d323 --- /dev/null +++ b/br/tests/lightning_config_skip_csv_header/run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# +# Copyright 2023 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux + +mydir=$(dirname "${BASH_SOURCE[0]}") + +data_file="${mydir}/data/mytest.testtbl.csv" + +total_row_count=$( sed '1d' "${data_file}" | wc -l | xargs echo ) + +run_sql 'DROP TABLE IF EXISTS mytest.testtbl' + +console_output_file="/tmp/${TEST_NAME}.out" + +echo "Use config that causes errors" +run_lightning --backend tidb --config "${mydir}/err_config.toml" 2>&1 | tee "${console_output_file}" +if [[ ${PIPESTATUS[0]} -eq 0 ]]; then + echo "The lightning import doesn't fail as expected" >&2 + exit 1 +fi + +grep -q "Lightning:Restore:ErrUnknownColumns" "${console_output_file}" + +# import a second time +echo "Use default config that causes errors" +run_lightning --backend tidb --config "${mydir}/err_default_config.toml" 2>&1 | tee "${console_output_file}" +if [[ ${PIPESTATUS[0]} -eq 0 ]]; then + echo "The lightning import doesn't fail as expected" >&2 + exit 1 +fi + +grep -q "Lightning:Restore:ErrUnknownColumns" "${console_output_file}" + +# import a thrid time + +run_sql 'DROP TABLE IF EXISTS mytest.testtbl' + +echo "Use config that can sucessfully import the data" +run_lightning --backend tidb --config "${mydir}/normal_config.toml" + +run_sql 'SELECT * FROM mytest.testtbl' +run_sql 'SELECT COUNT(*) FROM mytest.testtbl' +check_contains "COUNT(*): ${total_row_count}" +run_sql 'SELECT COUNT(*) FROM mytest.testtbl WHERE id > 0' +check_contains "COUNT(*): ${total_row_count}"