Skip to content

Commit

Permalink
SERVER-84777 Ensure dbCheck reports a retryable warning upon primary …
Browse files Browse the repository at this point in the history
…stepdown (#18474)

GitOrigin-RevId: 72b172737f9545051c39b3e33a2f86dbf411e0ad
  • Loading branch information
moustafamaher authored and MongoDB Bot committed Feb 7, 2024
1 parent b4157fa commit 2b0bab0
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 185 deletions.
11 changes: 8 additions & 3 deletions jstests/noPassthrough/dbcheck_current_op.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,12 @@ const verifyCurOp = (expectedCurOp, returnedCurOp) => {

const runTest = (parameters) => {
jsTestLog("Running dbcheck with " + tojson(parameters));
resetAndInsert(rst, testDB, collName, 10, {a: 1});
resetAndInsert(rst, testDB, collName, 10);
assert.commandWorked(testDB.runCommand({
createIndexes: collName,
indexes: [{key: {a: 1}, name: 'a_1'}],
}));
rst.awaitReplication();

const dbcheckFp = configureFailPoint(primary, "hangBeforeProcessingDbCheckRun");
runDbCheck(rst, testDB, collName, parameters);
Expand Down Expand Up @@ -133,8 +138,8 @@ const dbCheckParameters = [
{
validateMode: "extraIndexKeysCheck",
secondaryIndex: "a_1",
start: {"a": 10},
end: {"a": 40},
start: {"a": 1},
end: {"a": 4},
skipLookupForExtraKeys: true,
},
{
Expand Down
34 changes: 0 additions & 34 deletions jstests/replsets/dbcheck.js
Original file line number Diff line number Diff line change
Expand Up @@ -385,40 +385,6 @@ testErrorOnNonexistent();
testErrorOnSecondary();
testErrorOnUnreplicated();

/**
 * Test stepdown: starts dbCheck on the primary, forces that primary to step down, and then
 * verifies that the stepped-down node still answers a ping and that dbCheck terminated.
 */
function testSucceedsOnStepdown() {
    const primary = replSet.getPrimary();
    // Remember which node ran dbCheck so it can be found again once it is a secondary.
    const nodeId = replSet.getNodeId(primary);
    runDbCheck(replSet, primary.getDB(dbName), multiBatchSimpleCollName);

    // Step down the primary.
    assert.commandWorked(primary.getDB("admin").runCommand({replSetStepDown: 0, force: true}));

    // Wait for the cluster to come up.
    replSet.awaitSecondaryNodes();

    // Find the node we ran dbCheck on.
    const previousPrimary =
        replSet.getSecondaries().find((node) => replSet.getNodeId(node) === nodeId);
    // Fail with a clear message instead of a TypeError on 'undefined' if the node is missing.
    assert(previousPrimary !== undefined,
           "could not find the stepped-down primary among the secondaries");
    const db = previousPrimary.getDB(dbName);

    // Check that it's still responding.
    try {
        assert.commandWorked(db.runCommand({ping: 1}), "ping failed after stepdown during dbCheck");
    } catch (e) {
        // Keep the underlying error in the message so the real failure reason is not lost.
        doassert("cannot connect after dbCheck with stepdown: " + e);
    }

    // And that our dbCheck completed.
    assert(dbCheckCompleted(db), "dbCheck failed to terminate on stepdown");
}

testSucceedsOnStepdown();

// Just add an extra document, and test that it catches it.
function simpleTestCatchesExtra() {
{
Expand Down
114 changes: 114 additions & 0 deletions jstests/replsets/dbcheck_stepdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* Tests stepdown while dbcheck is running.
* @tags: [
* featureFlagSecondaryIndexChecksInDbCheck
* ]
*/

import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {
checkHealthLog,
clearHealthLog,
dbCheckCompleted,
logEveryBatch,
runDbCheck
} from "jstests/replsets/libs/dbcheck_utils.js";

// Start a two-node replica set for the test.
const rst = new ReplSetTest({nodes: 2});
rst.startSet();
rst.initiate();

logEveryBatch(rst);
const dbName = "dbCheck_stepdown";
const collName = "coll";
// 'primary' and 'testDB' are declared with 'let' because runTest() reassigns them on every run.
let primary = rst.getPrimary();
let testDB = primary.getDB(dbName);
// Create the secondary index 'a_1' that the "extraIndexKeysCheck" run names as its secondaryIndex.
assert.commandWorked(testDB.runCommand({
createIndexes: collName,
indexes: [{key: {a: 1}, name: "a_1"}],
}));

// Insert nDocs documents, each slightly larger than the maxDbCheckMBperSec value (1MB, its
// default), while maxBatchTimeMillis defaults to 1 second. Consequently, we will have only 1MB
// per batch and each batch will take at least 1 second on the primary.
const nDocs = 5;
// One distinct character per document, so every document gets a different value for 'a'.
const chars = ['a', 'b', 'c', 'd', 'e'];
testDB[collName].insertMany(
[...Array(nDocs).keys()].map(x => ({a: chars[x].repeat(1024 * 1024 * 2)})), {ordered: false});
rst.awaitReplication();
// Seed the random number generator that runTest() uses to pick the delay before the stepdown.
Random.setRandomSeed();

// Health log queries that runTest() passes to checkHealthLog() after the stepdown.
const stepdownWarningQuery = {
severity: "warning",
"msg": "abandoning dbCheck batch due to stepdown."
};
const dbCheckStartQuery = {
severity: "info",
operation: "dbCheckStart"
};
const dbCheckStopQuery = {
severity: "info",
operation: "dbCheckStop"
};

/**
 * Runs dbCheck with the given parameters on the current primary, forces that primary to step
 * down at a random point shortly after the dbCheck job starts, and then verifies that the
 * stepped-down node still responds, that dbCheck completed, and that its health log holds one
 * dbCheckStart entry, one stepdown warning and one dbCheckStop entry.
 *
 * @param {Object} parameters - dbCheck options passed through to runDbCheck().
 */
const runTest = (parameters) => {
    jsTestLog("Running dbcheck with " + tojson(parameters));
    // Every run steps the primary down, so look up the current primary again.
    primary = rst.getPrimary();
    testDB = primary.getDB(dbName);
    // Remember which node runs dbCheck so it can be found again once it is a secondary.
    const nodeId = rst.getNodeId(primary);
    clearHealthLog(rst);
    const dbcheckFp = configureFailPoint(primary, "hangBeforeProcessingDbCheckRun");
    runDbCheck(rst, testDB, collName, parameters);
    // Make sure that dbcheck job starts.
    dbcheckFp.wait();
    dbcheckFp.off();

    // Introduce sleep intervals to randomize the timing of the stepdown while the dbcheck is
    // running, ranging between 0 and 2 seconds. This allows for the possibility of a stepdown
    // occurring anytime between the start of the dbcheck and before running the 3rd batch.
    sleep(Random.randInt(2 * 1000));
    // Step down the primary.
    assert.commandWorked(primary.getDB("admin").runCommand({replSetStepDown: 0, force: true}));

    // Wait for the cluster to come up.
    rst.awaitSecondaryNodes();

    // Find the node we ran dbCheck on.
    const previousPrimary =
        rst.getSecondaries().find((candidate) => rst.getNodeId(candidate) === nodeId);
    // Fail with a clear message instead of a TypeError on 'undefined' if the node is missing.
    assert(previousPrimary !== undefined,
           "could not find the stepped-down primary among the secondaries");
    const db = previousPrimary.getDB(dbName);

    // Check that it's still responding.
    try {
        assert.commandWorked(db.runCommand({ping: 1}), "ping failed after stepdown during dbCheck");
    } catch (e) {
        // Keep the underlying error in the message so the real failure reason is not lost.
        doassert("cannot connect after dbCheck with stepdown: " + e);
    }

    // And that our dbCheck completed.
    assert(dbCheckCompleted(db), "dbCheck failed to terminate on stepdown");
    const healthlog = previousPrimary.getDB('local').system.healthlog;
    // Test health log has the expected logs after the stepdown.
    checkHealthLog(healthlog, dbCheckStartQuery, 1);
    checkHealthLog(healthlog, stepdownWarningQuery, 1);
    checkHealthLog(healthlog, dbCheckStopQuery, 1);
};

// dbCheck parameter sets exercised by runTest(), one per validate mode. maxDocsPerBatch is 1 in
// every set, so each inserted document is processed in its own batch.
const dbCheckParameters = [
    {
        validateMode: "dataConsistency",
        maxDocsPerBatch: 1,
    },
    {
        validateMode: "dataConsistencyAndMissingIndexKeysCheck",
        maxDocsPerBatch: 1,
        bsonValidateMode: "kFull"
    },
    {validateMode: "extraIndexKeysCheck", maxDocsPerBatch: 1, secondaryIndex: "a_1"},
];
// Execute the test multiple times to assess the randomization of when stepdown occurs while dbcheck
// is running. Plain loops are used because the runs are executed only for their side effects.
for (let iteration = 0; iteration < 5; iteration++) {
    for (const parameters of dbCheckParameters) {
        runTest(parameters);
    }
}

rst.stopSet();
3 changes: 2 additions & 1 deletion jstests/replsets/libs/dbcheck_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ export const checkHealthLog = (healthlog, query, numExpected, timeout = 60 * 100
return query_count == numExpected;
},
"health log query returned " + query_count + " entries, expected " + numExpected +
" query: " + tojson(query) + " found: " + tojson(healthlog.find(query).toArray()),
" query: " + tojson(query) + " found: " + tojson(healthlog.find(query).toArray()) +
" HealthLog: " + tojson(healthlog.find().toArray()),
timeout);
};

Expand Down
Loading

0 comments on commit 2b0bab0

Please sign in to comment.