Restore alertmanager state from storage as fallback (grafana#2293)
* Restore alertmanager state from storage as fallback

In cortexproject/cortex#3925 the ability to restore alertmanager state from
peer alertmanagers was added, short-circuiting when there is only a single
replica of the alertmanager. In cortexproject/cortex#4021 a fallback to read
state from storage was added in case reading from peers failed. However, the
single-replica short-circuit was never removed, so an alertmanager running
as a single replica never restores its state.
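
For clarity, here is a minimal, self-contained Go sketch of the restore order after this fix. The `Replicator`/`Store` interfaces, sentinel errors, and `restoreState` helper below are simplified stand-ins for illustration only, not the actual Mimir types; see the `state_replication.go` diff further down for the real change.

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// Sentinel errors, simplified stand-ins for the ones used in Mimir.
var (
	errAllReplicasUserNotFound = errors.New("user not found in any replica")
	errNotFoundInStorage       = errors.New("no state for user in storage")
)

// Replicator reads full state from peer alertmanager replicas (stand-in interface).
type Replicator interface {
	ReadFullStateForUser(ctx context.Context, userID string) ([]byte, error)
}

// Store reads previously persisted state from object storage (stand-in interface).
type Store interface {
	GetFullState(ctx context.Context, userID string) ([]byte, error)
}

// restoreState illustrates the corrected order: consult peers only when there
// is more than one replica, then fall back to reading from storage.
func restoreState(ctx context.Context, userID string, replicationFactor int, re Replicator, st Store) ([]byte, error) {
	// Before the fix, replicationFactor <= 1 returned early here, so the
	// storage read below never ran for single-replica deployments.
	if replicationFactor > 1 {
		state, err := re.ReadFullStateForUser(ctx, userID)
		if err == nil {
			return state, nil // state settled from a peer replica
		}
		// "User not found in any replica" is expected on first startup;
		// every error falls through to the storage read.
		fmt.Println("reading from peers failed, falling back to storage:", err)
	}

	// Fallback (and the only path for a single replica): read persisted state.
	state, err := st.GetFullState(ctx, userID)
	if errors.Is(err, errNotFoundInStorage) {
		return nil, nil // nothing persisted yet for this user
	}
	return state, err
}

// Tiny fakes so the sketch runs on its own.
type noPeers struct{}

func (noPeers) ReadFullStateForUser(context.Context, string) ([]byte, error) {
	return nil, errAllReplicasUserNotFound
}

type memStore map[string][]byte

func (m memStore) GetFullState(_ context.Context, user string) ([]byte, error) {
	if s, ok := m[user]; ok {
		return s, nil
	}
	return nil, errNotFoundInStorage
}

func main() {
	// Single replica: with the fix, state is still restored from storage.
	store := memStore{"user-1": []byte("persisted silences and notification log")}
	state, err := restoreState(context.Background(), "user-1", 1, noPeers{}, store)
	fmt.Printf("restored=%q err=%v\n", state, err)
}
```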

Fixes grafana#2245

Signed-off-by: Nick Pillitteri <nick.pillitteri@grafana.com>

* Code review changes

Signed-off-by: Nick Pillitteri <nick.pillitteri@grafana.com>
56quarters authored and mason committed Jul 11, 2022
1 parent 7ff1a55 commit cac96f5
Showing 4 changed files with 80 additions and 77 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
* [CHANGE] Ruler: Remove unused CLI flags `-ruler.search-pending-for` and `-ruler.flush-period` (and their respective YAML config options). #2288
* [ENHANCEMENT] Alertmanager: Allow the HTTP `proxy_url` configuration option in the receiver's configuration. #2317
* [BUGFIX] Compactor: log the actual error on compaction failed. #2261
* [BUGFIX] Alertmanager: restore state from storage even when running a single replica. #2293

### Mixin

2 changes: 2 additions & 0 deletions pkg/alertmanager/alertmanager_test.go
@@ -65,6 +65,7 @@ func createAlertmanagerAndSendAlerts(t *testing.T, alertGroups, groupsLimit, exp
TenantDataDir: t.TempDir(),
ExternalURL: &url.URL{Path: "/am"},
ShardingEnabled: true,
Store: prepareInMemoryAlertStore(),
Replicator: &stubReplicator{},
ReplicationFactor: 1,
// We have to set this interval non-zero, though we don't need the persister to do anything.
@@ -148,6 +149,7 @@ func TestDispatcherLoggerInsightKey(t *testing.T) {
TenantDataDir: t.TempDir(),
ExternalURL: &url.URL{Path: "/am"},
ShardingEnabled: true,
Store: prepareInMemoryAlertStore(),
Replicator: &stubReplicator{},
ReplicationFactor: 1,
PersisterConfig: PersisterConfig{Interval: time.Hour},
63 changes: 31 additions & 32 deletions pkg/alertmanager/state_replication.go
@@ -71,7 +71,7 @@ type state struct {
func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.AlertStore, l log.Logger, r prometheus.Registerer) *state {

s := &state{
logger: l,
logger: log.With(l, "user", userID),
userID: userID,
replicationFactor: rf,
replicator: re,
@@ -199,45 +199,44 @@ func (s *state) starting(ctx context.Context) error {
timer := prometheus.NewTimer(s.initialSyncDuration)
defer timer.ObserveDuration()

level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")

// If the replication factor is <= 1, there is nowhere to obtain the state from.
if s.replicationFactor <= 1 {
level.Info(s.logger).Log("msg", "skipping settling (no replicas)")
return nil
}

// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
defer cancel()
// If replication factor is > 1 attempt to read state from other replicas, falling back to reading from
// storage if they are unavailable.
if s.replicationFactor > 1 {
level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")

// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
defer cancel()

s.fetchReplicaStateTotal.Inc()
fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
if err == nil {
if err = s.mergeFullStates(fullStates); err == nil {
level.Info(s.logger).Log("msg", "state settled; proceeding")
s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
return nil
}
}

s.fetchReplicaStateTotal.Inc()
fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
if err == nil {
if err = s.mergeFullStates(fullStates); err == nil {
level.Info(s.logger).Log("msg", "state settled; proceeding")
s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
return nil
// The user not being found in all of the replicas is not recorded as a failure, as this is
// expected when this is the first replica to come up for a user. Note that it is important
// to continue and try to read from the state from remote storage, as the replicas may have
// lost state due to an all-replica restart.
if err != errAllReplicasUserNotFound {
s.fetchReplicaStateFailed.Inc()
}
}

// The user not being found in all of the replicas is not recorded as a failure, as this is
// expected when this is the first replica to come up for a user. Note that it is important
// to continue and try to read from the state from remote storage, as the replicas may have
// lost state due to an all-replica restart.
if err != errAllReplicasUserNotFound {
s.fetchReplicaStateFailed.Inc()
level.Info(s.logger).Log("msg", "unable to read state from other Alertmanager replicas; trying to read from storage", "err", err)
}

level.Info(s.logger).Log("msg", "state not settled; trying to read from storage", "err", err)

level.Info(s.logger).Log("msg", "reading state from storage")
// Attempt to read the state from persistent storage instead.
storeReadCtx, cancel := context.WithTimeout(ctx, s.storeReadTimeout)
defer cancel()

fullState, err := s.store.GetFullState(storeReadCtx, s.userID)
if errors.Is(err, alertspb.ErrNotFound) {
level.Info(s.logger).Log("msg", "no state for user in storage; proceeding", "user", s.userID)
level.Info(s.logger).Log("msg", "no state for user in storage; proceeding")
s.initialSyncCompleted.WithLabelValues(syncUserNotFound).Inc()
return nil
}
@@ -271,11 +270,11 @@ func (s *state) mergeFullStates(fs []*clusterpb.FullState) error {

for _, f := range fs {
for _, p := range f.Parts {
level.Debug(s.logger).Log("msg", "merging full state", "user", s.userID, "key", p.Key, "bytes", len(p.Data))
level.Debug(s.logger).Log("msg", "merging full state", "key", p.Key, "bytes", len(p.Data))

st, ok := s.states[p.Key]
if !ok {
level.Error(s.logger).Log("msg", "key not found while merging full state", "user", s.userID, "key", p.Key)
level.Error(s.logger).Log("msg", "key not found while merging full state", "key", p.Key)
continue
}

@@ -300,7 +299,7 @@ func (s *state) running(ctx context.Context) error {
s.stateReplicationTotal.WithLabelValues(p.Key).Inc()
if err := s.replicator.ReplicateStateForUser(ctx, s.userID, p); err != nil {
s.stateReplicationFailed.WithLabelValues(p.Key).Inc()
level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "user", s.userID, "key", p.Key, "err", err)
level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "key", p.Key, "err", err)
}
case <-ctx.Done():
return nil
91 changes: 46 additions & 45 deletions pkg/alertmanager/state_replication_test.go
@@ -29,6 +29,8 @@ import (
"github.com/grafana/mimir/pkg/alertmanager/alertstore"
)

const testUserID = "user-1"

type fakeState struct {
binary []byte
merges [][]byte
@@ -73,8 +75,8 @@ func (f *fakeReplicator) GetPositionForUser(_ string) int {
}

func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
if userID != "user-1" {
return nil, errors.New("Unexpected userID")
if userID != testUserID {
return nil, errors.New("unexpected userID")
}

if f.read.blocking {
@@ -96,31 +98,39 @@ func newFakeAlertStore() *fakeAlertStore {
}
}

func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
func (f *fakeAlertStore) GetFullState(_ context.Context, user string) (alertspb.FullStateDesc, error) {
if result, ok := f.states[user]; ok {
return result, nil
}
return alertspb.FullStateDesc{}, alertspb.ErrNotFound
}

func (f *fakeAlertStore) SetFullState(_ context.Context, user string, state alertspb.FullStateDesc) error {
f.states[user] = state
return nil
}

func TestStateReplication(t *testing.T) {
tc := []struct {
name string
replicationFactor int
message *clusterpb.Part
results map[string]*clusterpb.Part
name string
replicationFactor int
message *clusterpb.Part
replicationResults map[string]clusterpb.Part
storeResults map[string]clusterpb.Part
}{
{
name: "with a replication factor of <= 1, state is not replicated.",
replicationFactor: 1,
message: &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
results: map[string]*clusterpb.Part{},
name: "with a replication factor of <= 1, state is not replicated but loaded from storage.",
replicationFactor: 1,
message: &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
replicationResults: map[string]clusterpb.Part{},
storeResults: map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
},
{
name: "with a replication factor of > 1, state is broadcasted for replication.",
replicationFactor: 3,
message: &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
results: map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
name: "with a replication factor of > 1, state is broadcasted for replication.",
replicationFactor: 3,
message: &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
replicationResults: map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
storeResults: map[string]clusterpb.Part{},
},
}

@@ -129,9 +139,15 @@ func TestStateReplication(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
replicator := newFakeReplicator()
replicator.read = readStateResult{res: nil, err: nil}

store := newFakeAlertStore()
s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
for user, part := range tt.storeResults {
require.NoError(t, store.SetFullState(context.Background(), user, alertspb.FullStateDesc{
State: &clusterpb.FullState{Parts: []clusterpb.Part{part}},
}))
}

s := newReplicatedStates(testUserID, tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
require.False(t, s.Ready())
{
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
@@ -161,47 +177,32 @@
require.Eventually(t, func() bool {
replicator.mtx.Lock()
defer replicator.mtx.Unlock()
return len(replicator.results) == len(tt.results)
return len(replicator.results) == len(tt.replicationResults)
}, time.Second, time.Millisecond)

if tt.replicationFactor > 1 {
// If the replication factor is greater than 1, we expect state to be loaded from other Alertmanagers
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_failed_total counter
alertmanager_state_fetch_replica_state_failed_total 0
# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_total counter
alertmanager_state_fetch_replica_state_total 1
# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE alertmanager_partial_state_merges_failed_total counter
alertmanager_partial_state_merges_failed_total{key="nflog"} 0
# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE alertmanager_partial_state_merges_total counter
alertmanager_partial_state_merges_total{key="nflog"} 0
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
# TYPE alertmanager_state_initial_sync_total counter
alertmanager_state_initial_sync_total 1
# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_failed_total counter
alertmanager_state_replication_failed_total{key="nflog"} 0
# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_total counter
alertmanager_state_replication_total{key="nflog"} 1
`),
"alertmanager_state_fetch_replica_state_failed_total",
"alertmanager_state_fetch_replica_state_total",
"alertmanager_partial_state_merges_failed_total",
"alertmanager_partial_state_merges_total",
"alertmanager_state_initial_sync_completed_total",
"alertmanager_state_initial_sync_total",
"alertmanager_state_replication_failed_total",
"alertmanager_state_replication_total",
))
} else {
// Replication factor is 1, we expect state to be loaded from storage *instead* of other Alertmanagers
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 0
alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 1
alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
`),
"alertmanager_state_initial_sync_completed_total",
))

}