From d960d0e691e7d7e2f7d523254846aa324a88dd51 Mon Sep 17 00:00:00 2001 From: Xiaochao Dong Date: Thu, 18 Aug 2022 00:54:57 +0800 Subject: [PATCH] Store: improve index header reading performance by sorting values first (#5588) Signed-off-by: Xiaochao Dong (@damnever) Signed-off-by: Xiaochao Dong (@damnever) --- CHANGELOG.md | 1 + pkg/store/bucket.go | 17 +++++++++++------ pkg/store/bucket_test.go | 27 ++++++++++++++++++++++----- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e447d5b69..1756a18a3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5451](https://github.com/thanos-io/thanos/pull/5451) Azure: Reduce memory usage by not buffering file downloads entirely in memory. - [#5484](https://github.com/thanos-io/thanos/pull/5484) Update Prometheus deps to v2.36.2. - [#5511](https://github.com/thanos-io/thanos/pull/5511) Update Prometheus deps to v2.37.0. +- [#5588](https://github.com/thanos-io/thanos/pull/5588) Store: improve index header reading performance by sorting values first. ### Removed diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index c0b663472e..e567486e54 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -1976,13 +1976,18 @@ func checkNilPosting(l labels.Label, p index.Postings) index.Postings { // NOTE: Derived from tsdb.postingsForMatcher. index.Merge is equivalent to map duplication. 
func toPostingGroup(lvalsFn func(name string) ([]string, error), m *labels.Matcher) (*postingGroup, error) { - if m.Type == labels.MatchRegexp && len(findSetMatches(m.Value)) > 0 { - vals := findSetMatches(m.Value) - toAdd := make([]labels.Label, 0, len(vals)) - for _, val := range vals { - toAdd = append(toAdd, labels.Label{Name: m.Name, Value: val}) + if m.Type == labels.MatchRegexp { + if vals := findSetMatches(m.Value); len(vals) > 0 { + // Sorting will improve the performance dramatically if the dataset is relatively large: + // entries in the postings offset table are sorted by label name and value, + // so reading them sequentially is much faster. + sort.Strings(vals) + toAdd := make([]labels.Label, 0, len(vals)) + for _, val := range vals { + toAdd = append(toAdd, labels.Label{Name: m.Name, Value: val}) + } + return newPostingGroup(false, toAdd, nil), nil } - return newPostingGroup(false, toAdd, nil), nil } // If the matcher selects an empty value, it selects all the series which don't diff --git a/pkg/store/bucket_test.go b/pkg/store/bucket_test.go index 70d010b9f4..6a54f76b8e 100644 --- a/pkg/store/bucket_test.go +++ b/pkg/store/bucket_test.go @@ -17,6 +17,7 @@ import ( "regexp" "sort" "strconv" + "strings" "sync" "testing" "time" @@ -1113,14 +1114,16 @@ func appendTestData(t testing.TB, app storage.Appender, series int) { } series = series / 5 + uniq := 0 for n := 0; n < 10; n++ { for i := 0; i < series/10; i++ { - addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "foo")) + addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "foo", "uniq", fmt.Sprintf("%08d", uniq))) // Have some series that won't be matched, to properly test inverted matches. 
- addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar")) - addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "0_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar")) - addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "1_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar")) - addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "2_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "foo")) + addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar", "uniq", fmt.Sprintf("%08d", uniq+1))) + addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "0_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar", "uniq", fmt.Sprintf("%08d", uniq+2))) + addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "1_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "bar", "uniq", fmt.Sprintf("%08d", uniq+3))) + addSeries(labels.FromStrings("i", strconv.Itoa(i)+storetestutil.LabelLongSuffix, "n", "2_"+strconv.Itoa(n)+storetestutil.LabelLongSuffix, "j", "foo", "uniq", fmt.Sprintf("%08d", uniq+4))) + uniq += 5 } } testutil.Ok(t, app.Commit()) @@ -1161,6 +1164,19 @@ func benchmarkExpandedPostings( iNot2 := labels.MustNewMatcher(labels.MatchNotEqual, "n", "2"+storetestutil.LabelLongSuffix) iNot2Star := labels.MustNewMatcher(labels.MatchNotRegexp, "i", "^2.*$") iRegexSet := labels.MustNewMatcher(labels.MatchRegexp, "i", "0"+storetestutil.LabelLongSuffix+"|1"+storetestutil.LabelLongSuffix+"|2"+storetestutil.LabelLongSuffix) + bigValueSetSize := series / 10 + if bigValueSetSize > 50000 { + bigValueSetSize = 50000 + } + bigValueSet := make([]string, 0, bigValueSetSize) + for i := 0; i < series; i += series / bigValueSetSize { + 
bigValueSet = append(bigValueSet, fmt.Sprintf("%08d", i)) + } + bigValueSetSize = len(bigValueSet) + rand.New(rand.NewSource(time.Now().UnixNano())).Shuffle(len(bigValueSet), func(i, j int) { + bigValueSet[i], bigValueSet[j] = bigValueSet[j], bigValueSet[i] + }) + iRegexBigValueSet := labels.MustNewMatcher(labels.MatchRegexp, "uniq", strings.Join(bigValueSet, "|")) series = series / 5 cases := []struct { @@ -1186,6 +1202,7 @@ func benchmarkExpandedPostings( {`n="1",i=~".+",i!="2",j="foo"`, []*labels.Matcher{n1, iPlus, iNot2, jFoo}, int(float64(series) * 0.1)}, {`n="1",i=~".+",i!~"2.*",j="foo"`, []*labels.Matcher{n1, iPlus, iNot2Star, jFoo}, int(1 + float64(series)*0.088888)}, {`i=~"0|1|2"`, []*labels.Matcher{iRegexSet}, 150}, // 50 series for "1", 50 for "2" and 50 for "3". + {`uniq=~"9|random-shuffled-values|1"`, []*labels.Matcher{iRegexBigValueSet}, bigValueSetSize}, } for _, c := range cases {