-
Notifications
You must be signed in to change notification settings - Fork 390
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(queue): observability for queue #2721
base: master
Are you sure you want to change the base?
Changes from 4 commits
722fb5d
1b2fd0b
e6dfb85
a75a8dd
1b176c6
66b85c5
54a2c18
2e3fe37
c4cb517
14b2f2e
7f6c674
059d50b
f70cc8a
8f38087
379abf5
3e8e0d8
6169ea3
908ec13
2dab35f
846e3ef
fc733e9
4c199f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,8 @@ import { | |
RATE_LIMIT_ERROR, | ||
WaitingChildrenError, | ||
} from './errors'; | ||
import { Tracer } from '../interfaces'; | ||
import { TelemetryAttributes } from '../enums'; | ||
|
||
// 10 seconds is the maximum time a BRPOPLPUSH can block. | ||
const maximumBlockTimeout = 10; | ||
|
@@ -190,6 +192,13 @@ export class Worker< | |
protected processFn: Processor<DataType, ResultType, NameType>; | ||
protected running = false; | ||
|
||
/** | ||
* Instance of a telemetry client | ||
* To use it create if statement in a method to observe with start and end of a span | ||
* It will check if tracer is provided and if not it will continue as is | ||
*/ | ||
private tracer: Tracer | undefined; | ||
|
||
static RateLimitError(): Error { | ||
return new RateLimitError(); | ||
} | ||
|
@@ -314,6 +323,8 @@ export class Worker< | |
this.blockingConnection.on('ready', () => | ||
setTimeout(() => this.emit('ready'), 0), | ||
); | ||
|
||
this.tracer = opts?.telemetry?.tracer; | ||
} | ||
|
||
emit<U extends keyof WorkerListener<DataType, ResultType, NameType>>( | ||
|
@@ -402,11 +413,30 @@ export class Worker< | |
} | ||
|
||
async run() { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.run`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerOptions]: JSON.stringify(this.opts), | ||
}); | ||
} | ||
|
||
if (!this.processFn) { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
|
||
throw new Error('No process function is defined.'); | ||
} | ||
|
||
if (this.running) { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
|
||
throw new Error('Worker is already running.'); | ||
} | ||
|
||
|
@@ -507,10 +537,14 @@ export class Worker< | |
} | ||
|
||
this.running = false; | ||
return asyncFifoQueue.waitAll(); | ||
return await asyncFifoQueue.waitAll(); | ||
} catch (error) { | ||
this.running = false; | ||
throw error; | ||
} finally { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -520,12 +554,30 @@ export class Worker< | |
* @returns a Job or undefined if no job was available in the queue. | ||
*/ | ||
async getNextJob(token: string, { block = true }: GetNextJobOptions = {}) { | ||
return this._getNextJob( | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.getNextJob`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerToken]: token, | ||
[TelemetryAttributes.WorkerOptions]: JSON.stringify({ block }), | ||
}); | ||
} | ||
|
||
const nextJob = await this._getNextJob( | ||
await this.client, | ||
await this.blockingConnection.client, | ||
token, | ||
{ block }, | ||
); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is possible that nextJob is undefined here, so we need to check before trying to set the attribute. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually I am not sure we need to create a span at all if there is no next job :thinking_face: |
||
if (this.tracer) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In order to match the job that was added to the queue we also need to specify the job.id here in the span attributes. |
||
span.end(); | ||
} | ||
|
||
return nextJob; | ||
} | ||
|
||
private async _getNextJob( | ||
|
@@ -585,6 +637,17 @@ export class Worker< | |
* @param expireTimeMs - expire time in ms of this rate limit. | ||
*/ | ||
async rateLimit(expireTimeMs: number): Promise<void> { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.rateLimit`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerRateLimit]: expireTimeMs, | ||
}); | ||
} | ||
|
||
await this.client.then(client => | ||
client.set( | ||
this.keys.limiter, | ||
|
@@ -593,6 +656,10 @@ export class Worker< | |
expireTimeMs, | ||
), | ||
); | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
|
||
get minimumBlockTimeout(): number { | ||
|
@@ -743,7 +810,23 @@ will never work with more accuracy than 1ms. */ | |
fetchNextCallback = () => true, | ||
jobsInProgress: Set<{ job: Job; ts: number }>, | ||
): Promise<void | Job<DataType, ResultType, NameType>> { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.processJob`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerToken]: token, | ||
[TelemetryAttributes.JobId]: job.id, | ||
}); | ||
} | ||
|
||
if (!job || this.closing || this.paused) { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
|
||
return; | ||
} | ||
|
||
|
@@ -808,6 +891,10 @@ will never work with more accuracy than 1ms. */ | |
return handleFailed(<Error>err); | ||
} finally { | ||
jobsInProgress.delete(inProgressItem); | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -816,6 +903,17 @@ will never work with more accuracy than 1ms. */ | |
* Pauses the processing of this queue only for this worker. | ||
*/ | ||
async pause(doNotWaitActive?: boolean): Promise<void> { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.pause`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerDoNotWaitActive]: doNotWaitActive, | ||
}); | ||
} | ||
|
||
if (!this.paused) { | ||
this.paused = new Promise(resolve => { | ||
this.resumeWorker = function () { | ||
|
@@ -827,17 +925,35 @@ will never work with more accuracy than 1ms. */ | |
await (!doNotWaitActive && this.whenCurrentJobsFinished()); | ||
this.emit('paused'); | ||
} | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
|
||
/** | ||
* | ||
* Resumes processing of this worker (if paused). | ||
*/ | ||
resume(): void { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.resume`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
}); | ||
} | ||
|
||
if (this.resumeWorker) { | ||
this.resumeWorker(); | ||
this.emit('resumed'); | ||
} | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -872,7 +988,22 @@ will never work with more accuracy than 1ms. */ | |
* @returns Promise that resolves when the worker has been closed. | ||
*/ | ||
close(force = false): Promise<void> { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.close`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerForceClose]: force, | ||
}); | ||
} | ||
|
||
if (this.closing) { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
|
||
return this.closing; | ||
} | ||
this.closing = (async () => { | ||
|
@@ -905,6 +1036,10 @@ will never work with more accuracy than 1ms. */ | |
|
||
this.closed = true; | ||
this.emit('closed'); | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
})(); | ||
return this.closing; | ||
} | ||
|
@@ -922,6 +1057,16 @@ will never work with more accuracy than 1ms. */ | |
* @see {@link https://docs.bullmq.io/patterns/manually-fetching-jobs} | ||
*/ | ||
async startStalledCheckTimer(): Promise<void> { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.startStalledCheckTimer`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
}); | ||
} | ||
|
||
if (!this.opts.skipStalledCheck) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, do not create a span if we are skipping the stalled checker. The spans are not just for wrapping methods, but if there is actually something useful to do in them! :) |
||
clearTimeout(this.stalledCheckTimer); | ||
|
||
|
@@ -936,6 +1081,10 @@ will never work with more accuracy than 1ms. */ | |
} | ||
} | ||
} | ||
|
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
|
||
private startLockExtenderTimer( | ||
|
@@ -1017,6 +1166,17 @@ will never work with more accuracy than 1ms. */ | |
} | ||
|
||
protected async extendLocks(jobs: Job[]) { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.extendLocks`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
[TelemetryAttributes.WorkerJobsInvolved]: JSON.stringify(jobs), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wouldn't we also want to store all the IDs of the jobs whose locks we are extending? |
||
}); | ||
} | ||
|
||
try { | ||
const pipeline = (await this.client).pipeline(); | ||
for (const job of jobs) { | ||
|
@@ -1040,10 +1200,24 @@ will never work with more accuracy than 1ms. */ | |
} | ||
} catch (err) { | ||
this.emit('error', <Error>err); | ||
} finally { | ||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
} | ||
|
||
private async moveStalledJobsToWait() { | ||
let span; | ||
if (this.tracer) { | ||
const spanName = `${this.name} ${this.id} Worker.moveStalledJobsToWait`; | ||
span = this.tracer.startSpan(spanName); | ||
span.setAttributes({ | ||
[TelemetryAttributes.WorkerName]: this.name, | ||
[TelemetryAttributes.WorkerId]: this.id, | ||
}); | ||
} | ||
|
||
const chunkSize = 50; | ||
const [failed, stalled] = await this.scripts.moveStalledJobsToWait(); | ||
|
||
|
@@ -1065,6 +1239,10 @@ will never work with more accuracy than 1ms. */ | |
} | ||
|
||
this.notifyFailedJobs(await Promise.all(jobPromises)); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to store the ids of the failed/stalled jobs here because they need to be mapped to the jobs added to the queue so that we can trace all the way of a job added to the queue to the place where they complete/fail/stall |
||
if (this.tracer) { | ||
span.end(); | ||
} | ||
} | ||
|
||
private notifyFailedJobs(failedJobs: Job<DataType, ResultType, NameType>[]) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
export enum TelemetryAttributes { | ||
QueueName = 'bullmq.queue.name', | ||
WorkerName = 'bullmq.worker.name', | ||
BulkCount = 'bullmq.job.bulk.count', | ||
BulkNames = 'bullmq.job.bulk.names', | ||
JobName = 'bullmq.job.name', | ||
JobId = 'bullmq.job.id', | ||
JobKey = 'bullmq.job.key', | ||
JobOptions = 'bullmq.job.options', | ||
JobProgress = 'bullmq.job.progress', | ||
QueueDrainDelay = 'bullmq.queue.drain.delay', | ||
QueueGrace = 'bullmq.queue.grace', | ||
QueueCleanLimit = 'bullmq.queue.clean.limit', | ||
JobType = 'bullmq.job.type', | ||
JobTimestamp = 'bullmq.job.timestamp', | ||
QueueOptions = 'bullmq.queue.options', | ||
QueueEventMaxLength = 'bullmq.queue.event.max.length', | ||
WorkerOptions = 'bullmq.worker.options', | ||
WorkerToken = 'bullmq.worker.token', | ||
WorkerId = 'bullmq.worker.id', | ||
WorkerRateLimit = 'bullmq.worker.rate.limit', | ||
WorkerDoNotWaitActive = 'bullmq.worker.do.not.wait.active', | ||
WorkerForceClose = 'bullmq.worker.force.close', | ||
WorkerJobsInvolved = 'bullmq.worker.jobs.involved', | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This made me think, why do we need to make sure the span ends even in the case of an exception, but not in all other places? As most functions perform asynchronous calls, they could also potentially raise an exception. Have you thought about that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, in the case of error, shouldn't we store this information in the span? just wondering here... how does the current open telemetry for BullMQ package handle exceptions?