Skip to content

Commit

Permalink
Be pessimistic about strict stores (#3562)
Browse files Browse the repository at this point in the history
We want queries with ?partial_response=0 to be sound and complete.
Unfortunately today many queries can succeed without an error while the
system is heavily degraded.

Make strict stores cover a long time range by default until the right
time range has been discovered. We might need to extend this to labels
too. Also on many consecutive Metadata failures we should revert to a
default minTime and maxTime.

Signed-off-by: Holger Hans Peter Freyther <holger@freyther.de>
  • Loading branch information
zecke authored Apr 1, 2021
1 parent 811b86c commit e7255fe
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
8 changes: 8 additions & 0 deletions pkg/query/storeset.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"sort"
"sync"
"time"
Expand Down Expand Up @@ -499,6 +500,9 @@ func (s *StoreSet) getActiveStores(ctx context.Context, stores map[string]*store
}

st = &storeRef{StoreClient: storepb.NewStoreClient(conn), storeType: component.UnknownStoreAPI, cc: conn, addr: addr, logger: s.logger}
if spec.StrictStatic() {
st.maxTime = math.MaxInt64
}
}

var rule rulespb.RulesClient
Expand Down Expand Up @@ -566,6 +570,10 @@ func (s *StoreSet) updateStoreStatus(store *storeRef, err error) {
prev, ok := s.storeStatuses[store.addr]
if ok {
status = *prev
} else {
mint, maxt := store.TimeRange()
status.MinTime = mint
status.MaxTime = maxt
}

if err == nil {
Expand Down
20 changes: 18 additions & 2 deletions pkg/query/storeset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ func TestQuerierStrict(t *testing.T) {
defer st.Close()

staticStoreAddr := st.StoreAddresses()[0]
slowStaticStoreAddr := st.StoreAddresses()[2]
storeSet := NewStoreSet(nil, nil, func() (specs []StoreSpec) {
return []StoreSpec{
NewGRPCStoreSpec(st.StoreAddresses()[0], true),
Expand All @@ -649,14 +650,26 @@ func TestQuerierStrict(t *testing.T) {
storeSet.Update(context.Background())
testutil.Equals(t, 3, len(storeSet.stores), "three clients must be available for running store nodes")

testutil.Assert(t, storeSet.stores[st.StoreAddresses()[2]].cc.GetState().String() != "SHUTDOWN", "slow store's connection should not be closed")
// The store has not responded to the info call and is assumed to cover everything.
curMin, curMax := storeSet.stores[slowStaticStoreAddr].minTime, storeSet.stores[slowStaticStoreAddr].maxTime
testutil.Assert(t, storeSet.stores[slowStaticStoreAddr].cc.GetState().String() != "SHUTDOWN", "slow store's connection should not be closed")
testutil.Equals(t, int64(0), curMin)
testutil.Equals(t, int64(math.MaxInt64), curMax)

// The store is statically defined + strict mode is enabled
// so its client + information must be retained.
curMin, curMax := storeSet.stores[staticStoreAddr].minTime, storeSet.stores[staticStoreAddr].maxTime
curMin, curMax = storeSet.stores[staticStoreAddr].minTime, storeSet.stores[staticStoreAddr].maxTime
testutil.Equals(t, int64(12345), curMin, "got incorrect minimum time")
testutil.Equals(t, int64(54321), curMax, "got incorrect minimum time")

// Successfully retrieve the information and observe minTime/maxTime updating.
storeSet.gRPCInfoCallTimeout = 3 * time.Second
storeSet.Update(context.Background())
updatedCurMin, updatedCurMax := storeSet.stores[slowStaticStoreAddr].minTime, storeSet.stores[slowStaticStoreAddr].maxTime
testutil.Equals(t, int64(65644), updatedCurMin)
testutil.Equals(t, int64(77777), updatedCurMax)
storeSet.gRPCInfoCallTimeout = 1 * time.Second

// Turn off the stores.
st.Close()

Expand All @@ -670,6 +683,9 @@ func TestQuerierStrict(t *testing.T) {
testutil.Equals(t, curMin, storeSet.stores[staticStoreAddr].minTime, "minimum time reported by the store node is different")
testutil.Equals(t, curMax, storeSet.stores[staticStoreAddr].maxTime, "minimum time reported by the store node is different")
testutil.NotOk(t, storeSet.storeStatuses[staticStoreAddr].LastError.originalErr)

testutil.Equals(t, updatedCurMin, storeSet.stores[slowStaticStoreAddr].minTime, "minimum time reported by the store node is different")
testutil.Equals(t, updatedCurMax, storeSet.stores[slowStaticStoreAddr].maxTime, "minimum time reported by the store node is different")
}

func TestStoreSet_Update_Rules(t *testing.T) {
Expand Down

0 comments on commit e7255fe

Please sign in to comment.