Skip to content

Commit

Permalink
add customizeable liveness/readiness/startup probe endpoints (#1363)
Browse files Browse the repository at this point in the history
* add customizeable liveness/readiness/startup probe endpoints

* package updates

* typofix

* typofix

* use startupProbe

* restore apollo-health endpoint

* restore apollo-health endpoint

* cleanup
  • Loading branch information
carrolp authored Aug 14, 2024
1 parent 47bfa3e commit 115f1f6
Show file tree
Hide file tree
Showing 19 changed files with 1,251 additions and 939 deletions.
2 changes: 2 additions & 0 deletions app/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const _ = require('lodash');
const addRequestId = require('express-request-id')();
const {router, initialize} = require('./routes/index.js');
const log = require('./log').createLogger('razeedash-api/app/index');
const DefaultProbes = require('./utils/probes/probe-default.js');
const port = 3333;

// Set ipv4first (changed in Node 18)
Expand Down Expand Up @@ -151,6 +152,7 @@ function onListening() {
const addr = server.address();
const bind = typeof addr === 'string' ? `pipe ${addr}` : `port ${addr.port}`;
log.info(`🏄 razeedash-api listening on ${bind}/api`);
DefaultProbes.setStartupComplete(true);
}

function onError(error) {
Expand Down
48 changes: 25 additions & 23 deletions app/routes/kube/kube.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2019 IBM Corp. All Rights Reserved.
* Copyright 2019,2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,36 +15,38 @@
*/
const express = require('express');
const asyncHandler = require('express-async-handler');
const probeUtil = require('../../utils/probes');

const router = express.Router();
const { GraphqlPubSub } = require('../../apollo/subscription');
const pubSub = GraphqlPubSub.getInstance();
const logger = require('../../log').createLogger('razeedash-api/kube/liveness');
const timeInterval = 300000; //5 mintues

// /kube/liveness
router.get('/liveness', asyncHandler(async(req, res) => {
// does a db call to make sure we didnt disconnect
router.get('/startup', asyncHandler(async (req, res) => {
try {
await require('../../apollo/models').models.Organization.findOne({});
} catch (err) {
logger.error(err, 'razeedash-api liveness probe failed due to a mongo connection issue');
return res.sendStatus(503);
const payload = await probeUtil.getStartupPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
}));

// TODO: not real pub-sub liveness test yet, will add later
if (pubSub.initRetries > 5) {
// if the remote redis is not ready after 5 initial retries, then
// it is better to restart this pod, return 500 error
logger.error('Razeedash Api is down due to Redis pubsub connection issue, please check logs.');
return res.sendStatus(503);
router.get('/readiness', asyncHandler(async (req, res) => {
try {
const payload = await probeUtil.getReadinessPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
}));

if (pubSub.lastPubSubMessage !== null && Date.now()- pubSub.lastPubSubMessage.time > timeInterval) {
// check if the most recent message received is within ${timeInterval/60000} minitue
logger.error(`Razeedash Api is down, haven't received any published messages within ${timeInterval/60000} minitue, please check logs.`);
return res.sendStatus(503);
router.get('/liveness', asyncHandler(async(req, res) => {
try {
const payload = await probeUtil.getLivenessPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
return res.sendStatus(200);
}));

module.exports = router;
55 changes: 55 additions & 0 deletions app/utils/probes/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

const PROBE_DEFAULT_IMPL = require( './probe-default.js' );
const PROBE_CUSTOM_IMPL = require( process.env.PROBE_IMPL || './probe-none.js' );

/*
Return an impl for each of the probe types:
Get the default probe payload.
If default probe impl throws an error, throw an error.
If module specified by PROBE_IMPL implements a probe, get the custom probe payload.
If custom probe impl throws an error, throw an error.
Return the custom payload, or the default payload if there is none.
*/
/**
 * Resolve the payload for a single probe type.
 *
 * The default implementation always runs first; if it throws, the error
 * propagates and the probe fails no matter what the custom module would say.
 * If the custom module (selected by the PROBE_IMPL environment variable)
 * defines the same method as an own property, its payload is returned;
 * otherwise the default payload is returned.
 *
 * @param {string} method - Probe method name, e.g. 'getStartupPayload'.
 * @param {object} context - Passed through unchanged to the probe implementations.
 * @returns {Promise<*>} The custom payload if a custom probe exists, else the default payload.
 * @throws Whatever the default or custom probe implementation throws.
 */
async function resolveProbePayload( method, context ) {
  // Run the default probe exactly once; its result doubles as the fallback payload.
  const defaultPayload = await PROBE_DEFAULT_IMPL[method](context);
  if( Object.prototype.hasOwnProperty.call(PROBE_CUSTOM_IMPL, method) ) {
    return( PROBE_CUSTOM_IMPL[method](context) );
  }
  return defaultPayload;
}

/*
  Public probe API: one async function per probe type. Each one resolves with
  a payload on success and rejects when the probe should be reported as failed.
*/
const PROBE_IMPL = {
  getStartupPayload: async function( context ) {
    return resolveProbePayload( 'getStartupPayload', context );
  },
  getReadinessPayload: async function( context ) {
    return resolveProbePayload( 'getReadinessPayload', context );
  },
  getLivenessPayload: async function( context ) {
    return resolveProbePayload( 'getLivenessPayload', context );
  }
};

module.exports = PROBE_IMPL;
60 changes: 60 additions & 0 deletions app/utils/probes/probe-default.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

const Models = require('../../apollo/models');
const { GraphqlPubSub } = require('../../apollo/subscription');
const pubSub = GraphqlPubSub.getInstance();
const timeInterval = 300000; // 5 minutes, in ms: max allowed age of the most recent pub-sub message before the liveness probe fails

// Module-level startup flag; starts false and is changed via setStartupComplete().
let STARTUP_COMPLETE = false;

/**
 * Default startup probe.
 *
 * @returns {Promise<string>} A success payload once the startup flag is set.
 * @throws {Error} 'startup incomplete' while the startup flag is still falsy.
 */
async function getStartupPayload() {
  if( STARTUP_COMPLETE ) {
    return 'startup probe successful';
  }
  throw new Error('startup incomplete');
}

/**
 * Default readiness probe: performs no checks and always succeeds.
 *
 * @returns {Promise<string>} The fixed readiness success payload.
 */
async function getReadinessPayload() {
  const payload = 'readiness probe successful';
  return payload;
}

/**
 * Default liveness probe.
 *
 * Checks, in order:
 *  1. a database round trip (Organization.findOne) succeeds,
 *  2. pub-sub has not exceeded 5 initial connection retries,
 *  3. if any pub-sub message has been received, the most recent one is no
 *     older than `timeInterval` milliseconds.
 *
 * @returns {Promise<string>} A success payload when every check passes.
 * @throws {Error} Describing the first check that failed; for the database
 *   check the original error is attached as `cause`.
 */
async function getLivenessPayload() {
  // does a db call to make sure we didnt disconnect
  try {
    await Models.models.Organization.findOne({});
  } catch (err) {
    throw new Error(
      `Razeedash-api liveness probe failed due to a mongo connection issue: ${err.message}`,
      { cause: err }
    );
  }

  // TODO: not real pub-sub liveness test yet, will add later
  if (pubSub.initRetries > 5) {
    // if the remote redis is not ready after 5 initial retries, then
    // it is better to restart this pod, so fail the probe
    throw new Error('Razeedash-api liveness probe failed due to Redis pubsub connection issue, please check logs');
  }

  // a null lastPubSubMessage means no message has been seen yet; that is not treated as a failure
  if (pubSub.lastPubSubMessage !== null && Date.now() - pubSub.lastPubSubMessage.time > timeInterval) {
    // the most recent message is older than timeInterval (timeInterval/60000 minutes)
    throw new Error(`Razeedash-api is down, haven't received any published messages within ${timeInterval/60000} minutes, please check logs`);
  }

  // return a payload string like the startup/readiness probes do
  return 'liveness probe successful';
}

/**
 * Set the module-level startup flag read by getStartupPayload().
 * Called from app/index.js when server is ready to receive traffic.
 *
 * @param {*} b - New value of the flag; stored as-is.
 */
function setStartupComplete(b) {
  STARTUP_COMPLETE = b;
}

module.exports = { getLivenessPayload, getReadinessPayload, getStartupPayload, setStartupComplete };
19 changes: 19 additions & 0 deletions app/utils/probes/probe-none.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// empty implementation to be used if PROBE_IMPL is not specified

module.exports = {};
65 changes: 65 additions & 0 deletions app/utils/probes/probe-sample.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


/*
This sample shows how the startup/liveness/readiness probes can be customized by providing a
module that exports three functions:
- getStartupPayload
- getReadinessPayload
- getLivenessPayload
In each case, the function should return a payload string (not used by kubernetes, but can
be informative), or throw an error that explains why the probe should be failed.
In this sample:
- Return failure for startup probe for 60s, then success
- Return success for readiness probe for 5 minutes, then failure
- Always return success for liveness probe
To use this sample, `export PROBE_IMPL=./probe-sample` before starting the server.
*/

// Timestamp (ms since epoch) captured when this module is loaded; the sample probes measure elapsed time from it.
const START_TIME = Date.now();

/**
 * SAMPLE startup probe: fails for the first 60 seconds after module load, then passes.
 *
 * @param {object} req - Request-like object; `req.log.warn` and `req.id` are used for logging.
 * @returns {Promise<string>} A success payload once 60 seconds have elapsed.
 * @throws {Error} While fewer than 60 seconds have elapsed since module load.
 */
async function getStartupPayload(req) {
  const method = 'getStartupPayload';
  req.log.warn( {req_id: req.id}, `${method} using SAMPLE implementation, should only happen during dev/test` );

  // Date.now() (lower-case n) is the real API; `Date.Now` is undefined and calling it throws a TypeError.
  if( Date.now() - START_TIME < 60*1000 ) {
    throw new Error('startup probe failing for first 60 seconds');
  }
  return('startup probe passing after 60 seconds');
}

/**
 * SAMPLE readiness probe: passes for the first 5 minutes after module load, then fails.
 *
 * @param {object} req - Request-like object; `req.log.warn` and `req.id` are used for logging.
 * @returns {Promise<string>} A success payload while fewer than 5 minutes have elapsed.
 * @throws {Error} Once 5 minutes or more have elapsed since module load.
 */
async function getReadinessPayload(req) {
  const method = 'getReadinessPayload';
  req.log.warn( {req_id: req.id}, `${method} using SAMPLE implementation, should only happen during dev/test` );

  // Date.now() (lower-case n) is the real API; `Date.Now` is undefined and calling it throws a TypeError.
  if( Date.now() - START_TIME < 5*60*1000 ) {
    return('readiness probe passing for first 5 minutes');
  }
  throw new Error('readiness probe failing after 5 minutes');
}

/**
 * SAMPLE liveness probe: logs a warning and always passes.
 *
 * @param {object} req - Request-like object; `req.log.warn` and `req.id` are used for logging.
 * @returns {Promise<string>} The fixed liveness success payload.
 */
async function getLivenessPayload(req) {
  const probeName = 'getLivenessPayload';
  const logFields = { req_id: req.id };
  const warning = `${probeName} using SAMPLE implementation, should only happen during dev/test`;
  req.log.warn( logFields, warning );

  return 'liveness probe passing';
}

module.exports = { getLivenessPayload, getReadinessPayload, getStartupPayload };
11 changes: 8 additions & 3 deletions kubernetes/razeedash-api/resource.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,23 @@ items:
ports:
- containerPort: 3333
protocol: TCP
startupProbe:
httpGet:
path: /api/kube/startup
port: 3333
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 10
livenessProbe:
httpGet:
path: /api/kube/liveness
port: 3333
initialDelaySeconds: 5
periodSeconds: 30
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /.well-known/apollo/server-health
path: /api/kube/readiness
port: 3333
initialDelaySeconds: 5
periodSeconds: 30
timeoutSeconds: 10
resources:
Expand Down
2 changes: 1 addition & 1 deletion locales/de/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Die Subskription für die Subskriptions-ID {{subscription_id}} konnte nicht gefunden werden.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Der Cluster mit der Cluster-ID {{cluster_id}} konnte nicht gefunden werden.",
"Could not locate the cluster with clusterName {{clusterName}}": "Der Cluster mit dem Clusternamen {{clusterName}} konnte nicht gefunden werden.",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion konnte für {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}} nicht gefunden werden.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Publizieren der Ressourcenbenachrichtigung fehlgeschlagen. Bitte laden Sie die Seite erneut.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Publizieren der Ressourcenbenachrichtigung fehlgeschlagen. pubsub ist noch nicht bereit. Bitte versuchen Sie es später erneut.",
"Failed to Publish subscription notification to clusters, please retry.": "Publizieren der Subskriptionsbenachrichtigung an Cluster fehlgeschlagen. Bitte versuchen Sie es erneut.",
Expand Down
2 changes: 1 addition & 1 deletion locales/en/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Could not find the subscription for the subscription id {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Could not locate the cluster with cluster_id {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Could not locate the cluster with clusterName {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Failed to Publish resource notification, please reload the page.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Failed to Publish resource notification, pubsub is not ready yet, please retry later.",
"Failed to Publish subscription notification to clusters, please retry.": "Failed to Publish subscription notification to clusters, please retry.",
Expand Down
2 changes: 1 addition & 1 deletion locales/es/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "No se ha podido encontrar la suscripción para el id de suscripción {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "No se ha podido localizar el clúster con ID_clúster {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "No se ha podido localizar el clúster con el nombre de clúster {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "No se ha encontrado DeployableVersion para {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "No se ha podido publicar la notificación del recurso, vuelva a cargar la página.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "No se ha podido publicar la notificación de recurso, pubsub aún no está preparado; vuelva a intentarlo más tarde.",
"Failed to Publish subscription notification to clusters, please retry.": "No se ha podido publicar la notificación de suscripción a los clústeres; vuelva a intentarlo.",
Expand Down
2 changes: 1 addition & 1 deletion locales/fr/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Impossible de trouver l'abonnement pour l'id d'abonnement {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Impossible de localiser le cluster cluster_id {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Impossible de localiser le cluster clusterName {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "Version déployable introuvable pour {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Echec de publication de la notification de ressource, veuillez recharger la page.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Echec de publication de la notification de ressource, pubsub n'est pas prêt, veuillez réessayer plus tard.",
"Failed to Publish subscription notification to clusters, please retry.": "Echec de publication de la notification d'abonnement sur les clusters, veuillez réessayer,",
Expand Down
2 changes: 1 addition & 1 deletion locales/it/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Impossibile trovare la sottoscrizione per l'ID sottoscrizione {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Impossibile individuare il cluster con ID cluster {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Impossibile individuare il cluster con nome cluster {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion non trovato per {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Impossibile pubblicare la notifica della risorsa, ricaricare la pagina.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Impossibile pubblicare la notifica della risorsa, pubsub non è ancora pronto. Riprovare in seguito.",
"Failed to Publish subscription notification to clusters, please retry.": "Impossibile pubblicare la notifica della sottoscrizione ai cluster, riprovare.",
Expand Down
Loading

0 comments on commit 115f1f6

Please sign in to comment.