Skip to content

Commit

Permalink
feat(prom-client) add implementation for collecting event loop lag, g…
Browse files Browse the repository at this point in the history
…arbage collector, heap size and heap space
  • Loading branch information
pikalovArtemN committed Apr 22, 2024
1 parent a288410 commit ed6596f
Show file tree
Hide file tree
Showing 9 changed files with 675 additions and 52 deletions.
77 changes: 32 additions & 45 deletions plugins/node/instrumentation-runtime-node/src/instrumentation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,83 +13,70 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { EventLoopUtilization, performance } from 'node:perf_hooks';
const { eventLoopUtilization } = performance;

import { InstrumentationBase } from '@opentelemetry/instrumentation';

import { VERSION } from './version';
import { RuntimeNodeInstrumentationConfig } from './types';
import { MetricCollector } from './types/metricCollector';
import { EventLoopUtilizationCollector } from './metrics/eventLoopUtilizationCollector';
import { EventLoopLagCollector } from './metrics/eventLoopLagCollector';
import { GCCollector } from './metrics/gcCollector';
import { HeapSizeAndUsedCollector } from './metrics/heapSizeAndUsedCollector';
import { HeapSpacesSizeAndUsedCollector } from './metrics/heapSpacesSizeAndUsedCollector';

const ELUS_LENGTH = 2;
// Baseline configuration. The RuntimeNodeInstrumentation constructor merges
// the user-supplied config over these values via Object.assign, so any key
// the caller sets takes precedence.
const DEFAULT_CONFIG: RuntimeNodeInstrumentationConfig = {
eventLoopUtilizationMeasurementInterval: 5000,
monitoringPrecision: 5000,
};

// Prefix handed to every collector constructor; collectors prepend it to the
// metric names they register.
const namePrefix = 'nodejs';

/**
 * Instrumentation that reports Node.js runtime metrics.
 *
 * The class itself records nothing: it owns a fixed list of metric collectors
 * (event loop utilization, event loop lag, garbage collection, heap size and
 * heap spaces) and forwards the instrumentation lifecycle calls
 * (`enable`, `disable`, `_updateMetricInstruments`) to each of them.
 */
export class RuntimeNodeInstrumentation extends InstrumentationBase {
  private _collectors: MetricCollector[] = [];

  /**
   * @param config user configuration; merged over `DEFAULT_CONFIG`, so keys
   * set by the caller win.
   */
  constructor(config: RuntimeNodeInstrumentationConfig = {}) {
    super(
      '@opentelemetry/instrumentation-runtime-node',
      VERSION,
      Object.assign({}, DEFAULT_CONFIG, config)
    );
    this._collectors = [
      new EventLoopUtilizationCollector(this._config, namePrefix),
      new EventLoopLagCollector(this._config, namePrefix),
      new GCCollector(this._config, namePrefix),
      new HeapSizeAndUsedCollector(this._config, namePrefix),
      new HeapSpacesSizeAndUsedCollector(this._config, namePrefix),
    ];
    // The collectors only exist from this point on, so start them here when
    // the merged configuration asks for the instrumentation to be enabled.
    if (this._config.enabled) {
      for (const collector of this._collectors) {
        collector.enable();
      }
    }
  }

  // Called when a new `MeterProvider` is set
  // the Meter (result of @opentelemetry/api's getMeter) is available as this.meter within this method
  override _updateMetricInstruments() {
    // NOTE(review): presumably the base-class constructor can call this before
    // the `_collectors` field has been initialised - confirm against
    // InstrumentationBase.
    if (!this._collectors) return;
    for (const collector of this._collectors) {
      collector.updateMetricInstruments(this.meter);
    }
  }

  init() {
    // Not instrumenting or patching a Node.js module
  }

  /** Starts every collector. */
  override enable() {
    // Same guard as in `_updateMetricInstruments`: nothing to start until the
    // constructor has built the collectors.
    if (!this._collectors) return;

    for (const collector of this._collectors) {
      collector.enable();
    }
  }

  /** Stops every collector. */
  override disable() {
    for (const collector of this._collectors) {
      collector.disable();
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright The OpenTelemetry Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { MetricCollector } from '../types/metricCollector';
import { Meter } from '@opentelemetry/api';
import { clearInterval } from 'node:timers';
import { RuntimeNodeInstrumentationConfig } from '../types';

/**
 * Shared base for metric collectors that sample on a timer.
 *
 * While enabled, the collector calls `scrape()` every `monitoringPrecision`
 * milliseconds and appends the result to `_scrapeQueue`. Subclasses register
 * their instruments in `updateMetricInstruments` and consume the queue there.
 *
 * NOTE(review): nothing in this class bounds `_scrapeQueue`; it only shrinks
 * when a subclass removes entries or when `enable()`/`disable()` clears it.
 * If samples are produced faster than they are consumed the queue keeps
 * growing - confirm that every subclass drains it.
 *
 * @typeParam T shape of one sample produced by `scrape()`.
 */
export abstract class BaseCollector<T> implements MetricCollector {
  /**
   * Sampling period (ms) used when `monitoringPrecision` is not configured.
   * Without a fallback `setInterval` would receive `undefined`, which Node.js
   * treats as a 1 ms delay.
   */
  private static readonly DEFAULT_MONITORING_PRECISION_MS = 5000;

  protected _config: RuntimeNodeInstrumentationConfig = {};

  protected namePrefix: string;
  private _interval: NodeJS.Timeout | undefined;
  protected _scrapeQueue: T[] = [];

  /**
   * @param config instrumentation configuration; `monitoringPrecision` is
   * read as the sampling period in milliseconds.
   * @param namePrefix prefix for the metric names registered by subclasses.
   */
  constructor(
    config: RuntimeNodeInstrumentationConfig = {},
    namePrefix: string
  ) {
    this._config = config;
    this.namePrefix = namePrefix;
  }

  /** Stops sampling, drops queued samples and runs the subclass hook. */
  public disable(): void {
    this._clearQueue();
    clearInterval(this._interval);
    this._interval = undefined;

    this.internalDisable();
  }

  /**
   * (Re)starts sampling: drops queued samples, replaces any running timer
   * with a fresh one and runs the subclass hook.
   */
  public enable(): void {
    this._clearQueue();
    clearInterval(this._interval);
    this._interval = setInterval(
      () => this._addTask(),
      this._config.monitoringPrecision ??
        BaseCollector.DEFAULT_MONITORING_PRECISION_MS
    );

    // unref so that it does not keep the process running if disable() is never called
    this._interval?.unref();

    this.internalEnable();
  }

  /** Empties the queue in place so existing references stay valid. */
  private _clearQueue() {
    this._scrapeQueue.length = 0;
  }

  /** Takes one sample and queues it unless the subclass produced nothing. */
  private _addTask() {
    const taskResult = this.scrape();
    // Only skip missing results; falsy samples such as 0 are valid data.
    if (taskResult !== undefined && taskResult !== null) {
      this._scrapeQueue.push(taskResult);
    }
  }

  /** Registers the collector's instruments on the given meter. */
  public abstract updateMetricInstruments(meter: Meter): void;
  /** Subclass hook invoked at the end of `enable()`. */
  protected abstract internalEnable(): void;
  /** Subclass hook invoked at the end of `disable()`. */
  protected abstract internalDisable(): void;
  /** Produces one sample; called once per timer tick. */
  protected abstract scrape(): T;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
* Copyright The OpenTelemetry Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { RuntimeNodeInstrumentationConfig } from '../types';
import { Meter } from '@opentelemetry/api';
import { IntervalHistogram } from 'node:perf_hooks';
import { BaseCollector } from './baseCollector';
import * as perf_hooks from 'node:perf_hooks';

// Metric name suffixes; the collector prepends its name prefix when it
// registers the instruments.
const NODEJS_EVENTLOOP_LAG = 'event_loop.lag_seconds';
const NODEJS_EVENTLOOP_LAG_MIN = 'event_loop.lag_min_seconds';
const NODEJS_EVENTLOOP_LAG_MAX = 'event_loop.lag_max_seconds';
const NODEJS_EVENTLOOP_LAG_MEAN = 'event_loop.lag_mean_seconds';
const NODEJS_EVENTLOOP_LAG_STDDEV = 'event_loop.lag_stddev_seconds';
const NODEJS_EVENTLOOP_LAG_P50 = 'event_loop.lag_p50_seconds';
const NODEJS_EVENTLOOP_LAG_P90 = 'event_loop.lag_p90_seconds';
const NODEJS_EVENTLOOP_LAG_P99 = 'event_loop.lag_p99_seconds';

// One row per metric: [name, description]. The row order is significant
// because the collector looks entries up by position.
const lagMetricTable: [string, string][] = [
  [NODEJS_EVENTLOOP_LAG, 'Lag of event loop in seconds.'],
  [NODEJS_EVENTLOOP_LAG_MIN, 'The minimum recorded event loop delay.'],
  [NODEJS_EVENTLOOP_LAG_MAX, 'The maximum recorded event loop delay.'],
  [NODEJS_EVENTLOOP_LAG_MEAN, 'The mean of the recorded event loop delays.'],
  [
    NODEJS_EVENTLOOP_LAG_STDDEV,
    'The standard deviation of the recorded event loop delays.',
  ],
  [
    NODEJS_EVENTLOOP_LAG_P50,
    'The 50th percentile of the recorded event loop delays.',
  ],
  [
    NODEJS_EVENTLOOP_LAG_P90,
    'The 90th percentile of the recorded event loop delays.',
  ],
  [
    NODEJS_EVENTLOOP_LAG_P99,
    'The 99th percentile of the recorded event loop delays.',
  ],
];

/** Name and description of every event loop lag metric, in registration order. */
export const metricNames = lagMetricTable.map(([name, description]) => ({
  name,
  description,
}));

/**
 * One sample of event loop delay statistics, as queued by
 * `EventLoopLagCollector`. The collector fills each field from its delay
 * histogram divided by 1e9 and substitutes 0 for NaN.
 */
export interface EventLoopLagInformation {
// Minimum recorded delay.
min: number;
// Maximum recorded delay.
max: number;
// Mean of the recorded delays.
mean: number;
// Standard deviation of the recorded delays.
stddev: number;
// 50th percentile of the recorded delays.
p50: number;
// 90th percentile of the recorded delays.
p90: number;
// 99th percentile of the recorded delays.
p99: number;
}

/**
 * Collects event loop lag metrics.
 *
 * Two sources feed the reported values:
 * - a `perf_hooks.monitorEventLoopDelay` histogram, sampled by `scrape()` on
 *   the base-class timer (min, max, mean, stddev and percentiles);
 * - a point-in-time measurement taken inside the observable callback, which
 *   times how long a `setImmediate` callback takes to run.
 *
 * All values are reported in seconds.
 */
export class EventLoopLagCollector extends BaseCollector<EventLoopLagInformation> {
  private _histogram: IntervalHistogram;

  /**
   * @param config instrumentation configuration; `monitoringPrecision` is
   * passed to the histogram as its sampling `resolution`.
   * @param namePrefix prefix prepended to every metric name.
   */
  constructor(
    config: RuntimeNodeInstrumentationConfig = {},
    namePrefix: string
  ) {
    super(config, namePrefix);
    this._histogram = perf_hooks.monitorEventLoopDelay({
      resolution: config.monitoringPrecision,
    });
  }

  /**
   * Registers one observable gauge per entry of `metricNames` and a single
   * batch callback that reports all of them together.
   */
  updateMetricInstruments(meter: Meter): void {
    // Every gauge is built the same way; only name and description differ.
    const createGauge = (metric: { name: string; description: string }) =>
      meter.createObservableGauge(`${this.namePrefix}.${metric.name}`, {
        description: metric.description,
        // Values are divided by 1e9 before being observed, i.e. seconds.
        unit: 's',
      });

    const lag = createGauge(metricNames[0]);
    const lagMin = createGauge(metricNames[1]);
    const lagMax = createGauge(metricNames[2]);
    const lagMean = createGauge(metricNames[3]);
    const lagStddev = createGauge(metricNames[4]);
    const lagp50 = createGauge(metricNames[5]);
    const lagp90 = createGauge(metricNames[6]);
    const lagp99 = createGauge(metricNames[7]);

    meter.addBatchObservableCallback(
      async observableResult => {
        // Report the oldest queued sample; nothing is observed when the
        // timer has not produced one yet.
        const data = this._scrapeQueue.shift();
        if (data === undefined) return;

        // Measure the current lag: time until a setImmediate callback runs.
        const start = process.hrtime();
        const lagResult = await new Promise<number>(res => {
          setImmediate((start: [number, number]) => {
            res(this._reportEventloopLag(start));
          }, start);
        });

        observableResult.observe(lag, lagResult);
        observableResult.observe(lagMin, data.min);
        observableResult.observe(lagMax, data.max);
        observableResult.observe(lagMean, data.mean);
        observableResult.observe(lagStddev, data.stddev);
        observableResult.observe(lagp50, data.p50);
        observableResult.observe(lagp90, data.p90);
        observableResult.observe(lagp99, data.p99);

        // Start a fresh accumulation window after each report.
        this._histogram.reset();
      },
      [lag, lagMin, lagMax, lagMean, lagStddev, lagp50, lagp90, lagp99]
    );
  }

  /** Starts the delay histogram. */
  internalEnable(): void {
    this._histogram.enable();
  }

  /** Stops the delay histogram. */
  internalDisable(): void {
    this._histogram.disable();
  }

  /**
   * Snapshots the histogram, dividing each value by 1e9 so that the result
   * is in seconds. NaN values are replaced by 0.
   */
  protected scrape(): EventLoopLagInformation {
    return {
      min: this.checkNan(this._histogram.min / 1e9),
      max: this.checkNan(this._histogram.max / 1e9),
      mean: this.checkNan(this._histogram.mean / 1e9),
      stddev: this.checkNan(this._histogram.stddev / 1e9),
      p50: this.checkNan(this._histogram.percentile(50) / 1e9),
      p90: this.checkNan(this._histogram.percentile(90) / 1e9),
      p99: this.checkNan(this._histogram.percentile(99) / 1e9),
    };
  }

  /**
   * @param start `process.hrtime()` tuple taken before scheduling.
   * @returns seconds elapsed since `start`.
   */
  private _reportEventloopLag(start: [number, number]): number {
    const delta = process.hrtime(start);
    const nanosec = delta[0] * 1e9 + delta[1];
    const seconds = nanosec / 1e9;
    return seconds;
  }

  /** Maps NaN to 0; every other value is returned unchanged. */
  private checkNan(value: number) {
    return isNaN(value) ? 0 : value;
  }
}
Loading

0 comments on commit ed6596f

Please sign in to comment.