Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a cluster health endpoint to monitor #2353

Merged
merged 6 commits into from
Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions charts/hedera-mirror-monitor/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ prometheusRules:
MonitorPublishErrors:
annotations:
description: "Averaging {{ $value | humanizePercentage }} error rate publishing '{{ $labels.scenario }}' scenario from {{ $labels.namespace }}/{{ $labels.pod }}"
summary: "Publish error rate exceeds 15%"
summary: "Publish error rate exceeds 33%"
enabled: true
expr: sum(rate(hedera_mirror_monitor_publish_submit_seconds_sum{application="hedera-mirror-monitor",status!="SUCCESS"}[2m])) by (namespace, pod, scenario) / sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor"}[2m])) by (namespace, pod, scenario) > 0.15
for: 2m
expr: sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor",status!="SUCCESS"}[2m])) by (namespace, pod, scenario) / sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor"}[2m])) by (namespace, pod, scenario) > 0.33
for: 3m
labels:
severity: critical
application: hedera-mirror-monitor
Expand All @@ -207,7 +207,7 @@ prometheusRules:
description: "Averaging {{ $value | humanizePercentage }} PLATFORM_NOT_ACTIVE or UNAVAILABLE errors while attempting to publish in {{ $labels.namespace }}"
summary: "Platform is not active"
enabled: true
expr: sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor",status=~"(PLATFORM_NOT_ACTIVE|UNAVAILABLE)"}[2m])) by (namespace) / sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor"}[2m])) by (namespace) > 0
expr: sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor",status=~"(PLATFORM_NOT_ACTIVE|UNAVAILABLE)"}[2m])) by (namespace) / sum(rate(hedera_mirror_monitor_publish_submit_seconds_count{application="hedera-mirror-monitor"}[2m])) by (namespace) > 0.33
for: 1m
labels:
application: hedera-mirror-monitor
Expand Down
2 changes: 1 addition & 1 deletion docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,10 @@ Name | Default | D
`hedera.mirror.monitor.publish.scenarios.<name>.enabled` | true | Whether this publish scenario is enabled
`hedera.mirror.monitor.publish.scenarios.<name>.limit` | 0 | How many transactions to publish before halting. 0 for unlimited
`hedera.mirror.monitor.publish.scenarios.<name>.logResponse` | false | Whether to log the response from HAPI
`hedera.mirror.monitor.publish.scenarios.<name>.maxAttempts` | 1 | The maximum number of times a scenario transaction will be attempted
`hedera.mirror.monitor.publish.scenarios.<name>.properties` | {} | Key/value pairs used to configure the [`TransactionSupplier`](/hedera-mirror-datagenerator/src/main/java/com/hedera/datagenerator/sdk/supplier) associated with this scenario type
`hedera.mirror.monitor.publish.scenarios.<name>.receiptPercent` | 0.0 | The percentage of receipts to retrieve from HAPI. Accepts values between 0-1
`hedera.mirror.monitor.publish.scenarios.<name>.recordPercent` | 0.0 | The percentage of records to retrieve from HAPI. Accepts values between 0-1
`hedera.mirror.monitor.publish.scenarios.<name>.retry.maxAttempts` | 1 | The maximum number of times a scenario transaction will be attempted
`hedera.mirror.monitor.publish.scenarios.<name>.timeout` | 12s | How long to wait for the transaction result
`hedera.mirror.monitor.publish.scenarios.<name>.tps` | 1.0 | The rate at which transactions will publish
`hedera.mirror.monitor.publish.scenarios.<name>.type` | | The type of transaction to publish. See the [`TransactionType`](/hedera-mirror-datagenerator/src/main/java/com/hedera/datagenerator/sdk/supplier/TransactionType.java) enum for a list of possible values
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.hedera.mirror.monitor.subscribe;
package com.hedera.mirror.monitor;

/*-
* ‌
Expand All @@ -21,6 +21,7 @@
*/

import com.google.common.base.Stopwatch;
import com.google.common.base.Throwables;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import io.micrometer.core.instrument.Clock;
Expand All @@ -38,11 +39,13 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.hedera.mirror.monitor.subscribe.Scenario;

@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public abstract class AbstractSubscription<P extends AbstractSubscriberProperties, T> implements Subscription {
public abstract class AbstractScenario<P extends ScenarioProperties, T> implements Scenario<P, T> {

private static final long UPDATE_INTERVAL = 10_000L; // 10s
private static final long UPDATE_INTERVAL = 20_000L; // 20s measured in milliseconds

@EqualsAndHashCode.Include
protected final int id;
Expand Down Expand Up @@ -85,27 +88,36 @@ public double getRate() {
}

@Override
public SubscriptionStatus getStatus() {
if (!stopwatch.isRunning()) {
return SubscriptionStatus.COMPLETED;
public ScenarioStatus getStatus() {
if (!isRunning()) {
return ScenarioStatus.COMPLETED;
} else if (getRate() <= 0.0) {
return SubscriptionStatus.IDLE;
return ScenarioStatus.IDLE;
} else {
return SubscriptionStatus.RUNNING;
return ScenarioStatus.RUNNING;
}
}

@Override
public boolean isRunning() {
return stopwatch.isRunning();
}

@Override
public void onComplete() {
if (stopwatch.isRunning()) {
if (isRunning()) {
stopwatch.stop();
log.info("Stopping '{}' subscription", this);
log.info("Stopping '{}' scenario", this);
}
}

public void onError(Throwable t) {
errors.add(t.getClass().getSimpleName());
@Override
public void onError(Throwable throwable) {
Throwable rootCause = Throwables.getRootCause(throwable);
errors.add(rootCause.getClass().getSimpleName());
}

@Override
public void onNext(T response) {
counter.incrementAndGet();
intervalCounter.getCurrent().increment();
Expand All @@ -115,7 +127,6 @@ public void onNext(T response) {

@Override
public String toString() {
String name = getName();
return getProperties().getSubscribers() <= 1 ? name : name + " #" + getId();
return getName();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.hedera.mirror.monitor;

/*-
* ‌
* Hedera Mirror Node
* ​
* Copyright (C) 2019 - 2021 Hedera Hashgraph, LLC
* ​
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ‍
*/

import javax.inject.Named;
import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.ReactiveHealthIndicator;
import org.springframework.boot.actuate.health.Status;
import reactor.core.publisher.Mono;

import com.hedera.mirror.monitor.publish.generator.TransactionGenerator;
import com.hedera.mirror.monitor.subscribe.MirrorSubscriber;
import com.hedera.mirror.monitor.subscribe.Scenario;

@Named
@RequiredArgsConstructor
public class ClusterHealthIndicator implements ReactiveHealthIndicator {

private static final Mono<Health> DOWN = health(Status.DOWN, "Subscribing is inactive");
private static final Mono<Health> UNKNOWN = health(Status.UNKNOWN, "Publishing is inactive");
private static final Mono<Health> UP = health(Status.UP, "");

private final MirrorSubscriber mirrorSubscriber;
private final TransactionGenerator transactionGenerator;

private static Mono<Health> health(Status status, String reason) {
Health.Builder health = Health.status(status);
if (StringUtils.isNotBlank(reason)) {
health.withDetail("reason", reason);
}
return Mono.just(health.build());
}

@Override
public Mono<Health> health() {
return publishing().switchIfEmpty(subscribing());
}

// Returns unknown if all publish scenarios aggregated rate has dropped to zero, otherwise returns an empty flux
private Mono<Health> publishing() {
return transactionGenerator.scenarios()
Nana-EC marked this conversation as resolved.
Show resolved Hide resolved
.map(Scenario::getRate)
.reduce(0.0, (c, n) -> c + n)
.filter(sum -> sum <= 0)
.flatMap(n -> UNKNOWN);
}

// Returns up if any subscription is running and its rate is above zero, otherwise returns down
private Mono<Health> subscribing() {
return mirrorSubscriber.getSubscriptions()
Nana-EC marked this conversation as resolved.
Show resolved Hide resolved
.map(Scenario::getRate)
.reduce(0.0, (cur, next) -> cur + next)
.filter(sum -> sum > 0)
.flatMap(n -> UP)
.switchIfEmpty(DOWN);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
@RequiredArgsConstructor
public enum HederaNetwork {

MAINNET(mainnet(), mirrorNode("mainnet")),
MAINNET(mainnet(), mirrorNode("mainnet-public")),
PREVIEWNET(previewnet(), mirrorNode("previewnet")),
TESTNET(testnet(), mirrorNode("testnet")),
OTHER(Collections.emptySet(), null);
Expand All @@ -42,6 +42,10 @@ private static MirrorNodeProperties mirrorNode(String environment) {
MirrorNodeProperties mirrorNodeProperties = new MirrorNodeProperties();
mirrorNodeProperties.getGrpc().setHost("hcs." + host);
mirrorNodeProperties.getRest().setHost(host);
if ("mainnet-public".equals(environment)) {
mirrorNodeProperties.getGrpc().setHost(host);
mirrorNodeProperties.getGrpc().setPort(443);
}
return mirrorNodeProperties;
}

Expand Down Expand Up @@ -76,7 +80,8 @@ private static Set<NodeProperties> previewnet() {
new NodeProperties("0.0.3", "0.previewnet.hedera.com"),
new NodeProperties("0.0.4", "1.previewnet.hedera.com"),
new NodeProperties("0.0.5", "2.previewnet.hedera.com"),
new NodeProperties("0.0.6", "3.previewnet.hedera.com")
new NodeProperties("0.0.6", "3.previewnet.hedera.com"),
new NodeProperties("0.0.7", "4.previewnet.hedera.com")
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ public NodeProperties(String accountId, String host) {
private int port = 50211;

public String getEndpoint() {
// Allow for in-process testing of gRPC stubs
if (host.startsWith("in-process:")) {
Nana-EC marked this conversation as resolved.
Show resolved Hide resolved
return host;
}
return host + ":" + port;
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package com.hedera.mirror.monitor;

/*-
* ‌
* Hedera Mirror Node
* ​
* Copyright (C) 2019 - 2021 Hedera Hashgraph, LLC
* ​
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ‍
*/

import java.time.Duration;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;
import lombok.Data;
import org.hibernate.validator.constraints.time.DurationMin;
import org.springframework.validation.annotation.Validated;

@Data
public abstract class ScenarioProperties {

@NotNull
@DurationMin(seconds = 30)
protected Duration duration = Duration.ofNanos(Long.MAX_VALUE);

protected boolean enabled = true;

@Min(0)
protected long limit = 0; // 0 for unlimited

protected String name;

@NotNull
protected RetryProperties retry = new RetryProperties();

@Data
@Validated
public static class RetryProperties {

@Min(0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minimum should be 1, setting to 0 will never attempt.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but Flux retry spec uses this as the "retry attempts" not total attempts like the SDK so it needs to remain 0 here. I can add an assertion in ConfigurableTransactionGenerator that validates it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair.

private long maxAttempts = 16L;

@NotNull
@DurationMin(millis = 500L)
private Duration maxBackoff = Duration.ofSeconds(8L);

@NotNull
@DurationMin(millis = 100L)
private Duration minBackoff = Duration.ofMillis(250L);
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.hedera.mirror.monitor.subscribe;
package com.hedera.mirror.monitor;

/*-
* ‌
Expand All @@ -20,7 +20,7 @@
* ‍
*/

public enum SubscriberProtocol {
public enum ScenarioProtocol {
GRPC,
REST
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.hedera.mirror.monitor.subscribe;
package com.hedera.mirror.monitor;

/*-
* ‌
Expand All @@ -20,7 +20,7 @@
* ‍
*/

public enum SubscriptionStatus {
public enum ScenarioStatus {
COMPLETED, // The scenario has completed normally due to reaching the configured duration or limit
IDLE, // The scenario has not completed but is not currently receiving any responses
RUNNING, // The scenario is still actively receiving responses
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Schedulers;

import com.hedera.mirror.monitor.generator.TransactionGenerator;
import com.hedera.mirror.monitor.publish.PublishException;
import com.hedera.mirror.monitor.publish.PublishMetrics;
import com.hedera.mirror.monitor.publish.PublishProperties;
import com.hedera.mirror.monitor.publish.PublishRequest;
import com.hedera.mirror.monitor.publish.TransactionPublisher;
import com.hedera.mirror.monitor.publish.generator.TransactionGenerator;
import com.hedera.mirror.monitor.subscribe.MirrorSubscriber;
import com.hedera.mirror.monitor.subscribe.SubscribeMetrics;

Expand Down
Loading