Skip to content

Commit

Permalink
Reboot ArgoCD operator when a possible deadlock situation occurs
Browse files Browse the repository at this point in the history
  • Loading branch information
Aline Abler committed Jul 11, 2023
1 parent 6ac4f8d commit 3af8725
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 13 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/deepmap/oapi-codegen v1.11.0
github.com/projectsyn/lieutenant-api v0.7.0
github.com/stretchr/testify v1.8.0
go.uber.org/multierr v1.6.0
golang.org/x/crypto v0.0.0-20220525230936-793ad666bf5e
gopkg.in/alecthomas/kingpin.v2 v2.2.6
k8s.io/api v0.21.2
Expand Down Expand Up @@ -46,6 +47,7 @@ require (
github.com/taion809/haikunator v0.0.0-20150324135039-4e414e676fd1 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.1 // indirect
go.uber.org/atomic v1.7.0 // indirect
golang.org/x/net v0.0.0-20220513224357-95641704303c // indirect
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect
golang.org/x/sys v0.0.0-20220513210249-45d2b4557a2a // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -805,13 +805,15 @@ go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/multierr v0.0.0-20180122172545-ddea229ff1df/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
go.uber.org/multierr v1.4.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA=
go.uber.org/zap v0.0.0-20180814183419-67bc79d13d15/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
Expand Down
1 change: 1 addition & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ func main() {
app.Flag("region", "Cloud region this cluster is running in").StringVar(&agent.CloudRegion)
app.Flag("distribution", "Kubernetes distribution this cluster is running").StringVar(&agent.Distribution)
app.Flag("namespace", "Namespace in which steward is running").Default("syn").StringVar(&agent.Namespace)
app.Flag("operator-namespace", "Namespace in which the ArgoCD operator will be running").Default("syn-argocd-operator").StringVar(&agent.OperatorNamespace)
app.Flag("argo-image", "Image to be used for the Argo CD deployments").Default(images.DefaultArgoCDImage).StringVar(&agent.ArgoCDImage)
app.Flag("redis-image", "Image to be used for the Argo CD Redis deployment").Default(images.DefaultRedisImage).StringVar(&agent.RedisImage)

Expand Down
21 changes: 11 additions & 10 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ import (

// Agent configures the cluster agent
type Agent struct {
APIURL *url.URL
Token string
ClusterID string
CloudType string
CloudRegion string
Distribution string
Namespace string
ArgoCDImage string
RedisImage string
APIURL *url.URL
Token string
ClusterID string
CloudType string
CloudRegion string
Distribution string
Namespace string
OperatorNamespace string
ArgoCDImage string
RedisImage string

facts factCollector
}
Expand Down Expand Up @@ -125,7 +126,7 @@ func (a *Agent) registerCluster(ctx context.Context, config *rest.Config, apiCli
return
}

if err := argocd.Apply(ctx, config, a.Namespace, a.ArgoCDImage, a.RedisImage, apiClient, cluster); err != nil {
if err := argocd.Apply(ctx, config, a.Namespace, a.OperatorNamespace, a.ArgoCDImage, a.RedisImage, apiClient, cluster); err != nil {
klog.Error(err)
}
}
Expand Down
61 changes: 58 additions & 3 deletions pkg/argocd/argocd.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@ package argocd

import (
"context"
"time"

"github.com/projectsyn/lieutenant-api/pkg/api"
"go.uber.org/multierr"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
)

Expand All @@ -34,7 +36,7 @@ var (
)

// Apply reconciles the Argo CD deployments
func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error {
func Apply(ctx context.Context, config *rest.Config, namespace, operatorNamespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error {
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return err
Expand All @@ -58,7 +60,9 @@ func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redis
}

if err == nil && len(argos.Items) > 0 {
return nil
// An ArgoCD custom resource exists in our namespace
err = fixArgoOperatorDeadlock(ctx, clientset, config, namespace, operatorNamespace)
return err
}

deployments, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{
Expand Down Expand Up @@ -123,3 +127,54 @@ func bootstrapArgo(ctx context.Context, clientset *kubernetes.Clientset, config

return nil
}

// fixArgoOperatorDeadlock restarts the ArgoCD operator pods when the Argo CD
// instance in namespace looks stuck.
//
// The steps are:
//  1. List ConfigMaps in namespace labelled
//     "app.kubernetes.io/managed-by=syn-argocd". If more than two exist, the
//     function returns nil without restarting anything.
//     NOTE(review): presumably these ConfigMaps are created by the operator
//     once it has reconciled successfully; confirm the threshold.
//  2. List all pods in operatorNamespace. If any of them was created within
//     the last ten minutes, return nil so a freshly (re)started operator gets
//     time to reconcile before it is restarted again.
//  3. If the Secret named argoSecretName exists in namespace and has no owner
//     references, delete it. A missing Secret is not an error; there is simply
//     nothing to clean up.
//  4. Delete every pod in operatorNamespace. Individual failures are collected
//     and returned as one combined error.
//
// config is currently unused; it is kept so the signature stays stable for
// callers.
func fixArgoOperatorDeadlock(ctx context.Context, clientset *kubernetes.Clientset, config *rest.Config, namespace, operatorNamespace string) error {
	// Operator pods younger than this are given time to reconcile on their own.
	const restartGracePeriod = 10 * time.Minute

	configmaps, err := clientset.CoreV1().ConfigMaps(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/managed-by=syn-argocd",
	})
	if err != nil {
		return err
	}
	if len(configmaps.Items) > 2 {
		// no restart required
		return nil
	}

	pods, err := clientset.CoreV1().Pods(operatorNamespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}

	// Compute the cutoff once so every pod is compared against the same instant.
	cutoff := time.Now().Add(-restartGracePeriod)
	for i := range pods.Items {
		if pods.Items[i].CreationTimestamp.Time.After(cutoff) {
			klog.Info("ArgoCD Operator pod was recently created, waiting to reboot...")
			return nil
		}
	}

	// If there still exists an argocd-secret not managed by the operator,
	// clean it up. A missing secret must not prevent the operator restart.
	secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, argoSecretName, metav1.GetOptions{})
	switch {
	case errors.IsNotFound(err):
		// Nothing to clean up.
	case err != nil:
		return err
	case len(secret.ObjectMeta.OwnerReferences) == 0:
		klog.Info("Deleting ArgoCD secret")
		err := clientset.CoreV1().Secrets(namespace).Delete(ctx, argoSecretName, metav1.DeleteOptions{})
		// The secret may have been removed concurrently; that is the desired end state.
		if err != nil && !errors.IsNotFound(err) {
			return err
		}
	}

	// Reboot the operator by deleting its pods. Keep going after a failed
	// delete so that as many pods as possible are restarted; multierr.Combine
	// drops the nil entries.
	errs := make([]error, 0, len(pods.Items))
	for i := range pods.Items {
		name := pods.Items[i].Name
		klog.Infof("Removing pod %s", name)
		errs = append(errs, clientset.CoreV1().Pods(operatorNamespace).Delete(ctx, name, metav1.DeleteOptions{}))
	}

	return multierr.Combine(errs...)
}

0 comments on commit 3af8725

Please sign in to comment.