From 07b9ff2ddd6de0f95200d53a69ceacb922b90725 Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Mon, 20 Apr 2020 09:25:31 +0100 Subject: [PATCH] Reverted addition of deletion mark for partial uploads. Fixes https://github.com/thanos-io/thanos/issues/2459 (quick fix). This keeps the logic from the 0.11.0 which was good enough. Some improvement for future: https://github.com/thanos-io/thanos/issues/2470 Signed-off-by: Bartlomiej Plotka --- CHANGELOG.md | 3 ++- cmd/thanos/compact.go | 2 +- pkg/compact/clean.go | 14 ++++++++++---- pkg/compact/clean_test.go | 15 ++++++--------- test/e2e/compact_test.go | 21 ++++++++++++--------- 5 files changed, 31 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bbf1c31a5..448162c628 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,8 +13,9 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Fixed -- [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs +- [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs. - [#2474](https://github.com/thanos-io/thanos/pull/2474) Store: fix a panic caused by concurrent memory access during block filtering. +- [#2472](https://github.com/thanos-io/thanos/pull/2472) compact: Fixed bug where partial blocks were never deleted and causing spam of warnings. ## [v0.12.0](https://github.com/thanos-io/thanos/releases/tag/v0.12.0) - 2020.04.15 diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 1bad45f773..627010f179 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -420,7 +420,7 @@ func runCompact( } // No need to resync before partial uploads and delete marked blocks. Last sync should be valid. - compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksMarkedForDeletion) + compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksCleaned, blockCleanupFailures) if err := blocksCleaner.DeleteMarkedBlocks(ctx); err != nil { return errors.Wrap(err, "error cleaning blocks") } diff --git a/pkg/compact/clean.go b/pkg/compact/clean.go index 53cfb16908..8d99927d51 100644 --- a/pkg/compact/clean.go +++ b/pkg/compact/clean.go @@ -27,7 +27,8 @@ func BestEffortCleanAbortedPartialUploads( partial map[ulid.ULID]error, bkt objstore.Bucket, deleteAttempts prometheus.Counter, - blocksMarkedForDeletion prometheus.Counter, + blockCleanups prometheus.Counter, + blockCleanupFailures prometheus.Counter, ) { level.Info(logger).Log("msg", "started cleaning of aborted partial uploads") @@ -45,10 +46,15 @@ func BestEffortCleanAbortedPartialUploads( deleteAttempts.Inc() level.Info(logger).Log("msg", "found partially uploaded block; marking for deletion", "block", id) - if err := block.MarkForDeletion(ctx, logger, bkt, id, blocksMarkedForDeletion); err != nil { - level.Warn(logger).Log("msg", "failed to delete aborted partial upload; skipping", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) - return + // We don't gather any information about deletion marks for partial blocks, so let's simply remove it. We waited + // long PartialUploadThresholdAge already. + // TODO(bwplotka): Fix some edge cases: https://github.com/thanos-io/thanos/issues/2470 . + if err := block.Delete(ctx, logger, bkt, id); err != nil { + blockCleanupFailures.Inc() + level.Warn(logger).Log("msg", "failed to delete aborted partial upload; will retry in next iteration", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) + continue } + blockCleanups.Inc() level.Info(logger).Log("msg", "deleted aborted partial upload", "block", id, "thresholdAge", PartialUploadThresholdAge) } level.Info(logger).Log("msg", "cleaning of aborted partial uploads done") diff --git a/pkg/compact/clean_test.go b/pkg/compact/clean_test.go index 1321c888ba..ea0f840c69 100644 --- a/pkg/compact/clean_test.go +++ b/pkg/compact/clean_test.go @@ -59,22 +59,19 @@ func TestBestEffortCleanAbortedPartialUploads(t *testing.T) { testutil.Ok(t, bkt.Upload(ctx, path.Join(shouldIgnoreID2.String(), "chunks", "000001"), &fakeChunk)) deleteAttempts := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - blocksMarkedForDeletion := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - + blockCleanups := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) + blockCleanupFailures := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) _, partial, err := metaFetcher.Fetch(ctx) testutil.Ok(t, err) - BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blocksMarkedForDeletion) + BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blockCleanups, blockCleanupFailures) testutil.Equals(t, 1.0, promtest.ToFloat64(deleteAttempts)) + testutil.Equals(t, 1.0, promtest.ToFloat64(blockCleanups)) + testutil.Equals(t, 0.0, promtest.ToFloat64(blockCleanupFailures)) exists, err := bkt.Exists(ctx, path.Join(shouldDeleteID.String(), "chunks", "000001")) testutil.Ok(t, err) - testutil.Equals(t, true, exists) - - exists, err = bkt.Exists(ctx, path.Join(shouldDeleteID.String(), metadata.DeletionMarkFilename)) - testutil.Ok(t, err) - testutil.Equals(t, true, exists) - testutil.Equals(t, 1.0, promtest.ToFloat64(blocksMarkedForDeletion)) + testutil.Equals(t, false, exists) exists, err = bkt.Exists(ctx, path.Join(shouldIgnoreID1.String(), "chunks", "000001")) testutil.Ok(t, err) diff --git a/test/e2e/compact_test.go b/test/e2e/compact_test.go index 9524b960dd..9f4dd0d65f 100644 --- a/test/e2e/compact_test.go +++ b/test/e2e/compact_test.go @@ -478,6 +478,7 @@ func TestCompactWithStoreGateway(t *testing.T) { // We expect no ops. testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_iterations_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_marked_for_deletion_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_total")) @@ -504,11 +505,11 @@ func TestCompactWithStoreGateway(t *testing.T) { // NOTE: We cannot assert on intermediate `thanos_blocks_meta_` metrics as those are gauge and change dynamically due to many // compaction groups. Wait for at least first compaction iteration (next is in 5m). testutil.Ok(t, c.WaitSumMetrics(e2e.Greater(0), "thanos_compactor_iterations_total")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_cleaned_total")) // This should be 1 [BUG no 1]. TODO: fix. - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2*4+2+2*3+1), "thanos_compactor_blocks_marked_for_deletion_total")) // 17. - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) // We should have 1. + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2*4+2+2*3), "thanos_compactor_blocks_marked_for_deletion_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(5), "thanos_compact_group_compactions_total")) - // TODO(bwplotka): This is confusing, should be either normal compaction or vertical not both (?) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(3), "thanos_compact_group_vertical_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(12), "thanos_compact_group_compaction_runs_started_total")) @@ -519,7 +520,8 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64( len(rawBlockIDs)+7+ - 5, // 5 compactions, 5 newly added blocks. + 5+ // 5 compactions, 5 newly added blocks. + -2, // Partial block removed. )), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) @@ -539,7 +541,7 @@ func TestCompactWithStoreGateway(t *testing.T) { expectedEndVector, ) // Store view: - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5-2)), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) }) @@ -553,8 +555,9 @@ func TestCompactWithStoreGateway(t *testing.T) { // compaction groups. Wait for at least first compaction iteration (next is in 5m). testutil.Ok(t, c.WaitSumMetrics(e2e.Greater(0), "thanos_compactor_iterations_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(16), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_marked_for_deletion_total")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) // We should have 1. + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_vertical_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_failures_total")) @@ -564,7 +567,7 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_failures_total")) - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5-16)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5-16-2)), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_halted")) @@ -584,7 +587,7 @@ func TestCompactWithStoreGateway(t *testing.T) { ) // Store view: - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7-16+5)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7-16+5-2)), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) })