From e487e2217bf7e7935c7298ae31ae719bfb352033 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sat, 12 Sep 2020 20:52:52 +0200 Subject: [PATCH 1/7] Updates to user-scheduler's coupling to the kube-scheduler binary --- .../scheduling/user-scheduler/deployment.yaml | 6 +- .../scheduling/user-scheduler/rbac.yaml | 216 +++++++++++++++--- jupyterhub/values.yaml | 4 +- 3 files changed, 183 insertions(+), 43 deletions(-) diff --git a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml index b6cdb824db..733ad354fc 100644 --- a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml +++ b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml @@ -13,10 +13,8 @@ spec: template: metadata: labels: - {{- /* Changes here will cause the Deployment to restart the pods. */}} {{- include "jupyterhub.matchLabels" . | nindent 8 }} annotations: - # This lets us autorestart when the configmap changes! checksum/config-map: {{ include (print $.Template.BasePath "/scheduling/user-scheduler/configmap.yaml") . | sha256sum }} spec: {{- if .Values.rbac.enabled }} @@ -31,13 +29,11 @@ spec: - name: user-scheduler image: {{ include "jupyterhub.scheduler.image" . }} command: + # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/ - /usr/local/bin/kube-scheduler - --scheduler-name={{ .Release.Name }}-user-scheduler - --policy-configmap=user-scheduler - --policy-configmap-namespace={{ .Release.Namespace }} - - --lock-object-name=user-scheduler - - --lock-object-namespace={{ .Release.Namespace }} - - --leader-elect-resource-lock=configmaps - --v={{ .Values.scheduling.userScheduler.logLevel | default 4 }} livenessProbe: httpGet: diff --git a/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml b/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml index f03ad4846d..514396aab9 100644 --- a/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml +++ b/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml @@ -7,58 +7,202 @@ metadata: labels: {{- include "jupyterhub.labels" . | nindent 4 }} --- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ .Release.Name }}-user-scheduler-base - labels: - {{- $_ := merge (dict "componentSuffix" "-base") . }} - {{- include "jupyterhub.labels" $_ | nindent 4 }} -subjects: - - kind: ServiceAccount - name: user-scheduler - namespace: {{ .Release.Namespace }} -roleRef: - kind: ClusterRole - name: system:kube-scheduler - apiGroup: rbac.authorization.k8s.io ---- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ .Release.Name }}-user-scheduler-complementary + name: {{ .Release.Name }}-user-scheduler labels: - {{- $_ := merge (dict "componentSuffix" "-complementary") . }} - {{- include "jupyterhub.labels" $_ | nindent 4 }} + {{- include "jupyterhub.labels" . | nindent 4 }} rules: - # Support leader elections - - apiGroups: [""] - resourceNames: ["user-scheduler"] - resources: ["configmaps"] - verbs: ["get", "update"] - # Workaround for missing permission in system:kube-scheduler as of k8s 1.10.4 - - apiGroups: ["storage.k8s.io"] - resources: ["storageclasses"] - verbs: ["get", "list", "watch"] - # Workaround for missing permission with rancher local-path-provisioner - - apiGroups: [""] - resources: ["persistentvolume", "persistentvolumeclaims"] - verbs: ["update"] + # Copied from the system:kube-scheduler ClusterRole of the k8s version + # matching the kube-scheduler binary we use. 
A modification of two resource
  # name references from kube-scheduler to user-scheduler was made.
  #
  # NOTE: These have been unchanged between 1.12 and 1.15, but changed in 1.16
  # and in 1.17, but not 1.18 and 1.19.
  #
  # ref: https://github.com/kubernetes/kubernetes/blob/v1.19.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L696-L829
  - apiGroups:
    - ""
    - events.k8s.io
    resources:
    - events
    verbs:
    - create
    - patch
    - update
  - apiGroups:
    - coordination.k8s.io
    resources:
    - leases
    verbs:
    - create
  - apiGroups:
    - coordination.k8s.io
    resourceNames:
    - user-scheduler
    resources:
    - leases
    verbs:
    - get
    - update
  - apiGroups:
    - ""
    resources:
    - endpoints
    verbs:
    - create
  - apiGroups:
    - ""
    resourceNames:
    - user-scheduler
    resources:
    - endpoints
    verbs:
    - get
    - update
  - apiGroups:
    - ""
    resources:
    - nodes
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - ""
    resources:
    - pods
    verbs:
    - delete
    - get
    - list
    - watch
  - apiGroups:
    - ""
    resources:
    - bindings
    - pods/binding
    verbs:
    - create
  - apiGroups:
    - ""
    resources:
    - pods/status
    verbs:
    - patch
    - update
  - apiGroups:
    - ""
    resources:
    - replicationcontrollers
    - services
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - apps
    - extensions
    resources:
    - replicasets
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - apps
    resources:
    - statefulsets
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - policy
    resources:
    - poddisruptionbudgets
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - ""
    resources:
    - persistentvolumeclaims
    - persistentvolumes
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - authentication.k8s.io
    resources:
    - tokenreviews
    verbs:
    - create
  - apiGroups:
    - authorization.k8s.io
    resources:
    - subjectaccessreviews
    verbs:
    - create
  - apiGroups:
    - storage.k8s.io
    resources:
    - csinodes
    verbs:
    - get
    - list
    - watch

  # Copied from the system:volume-scheduler ClusterRole of the k8s version
  # matching the kube-scheduler binary we use. These have not changed between
  # 1.12 and 1.19.
  #
  # ref: https://github.com/kubernetes/kubernetes/blob/v1.19.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L1213-L1240
  - apiGroups:
    - ""
    resources:
    - persistentvolumes
    verbs:
    - get
    - list
    - patch
    - update
    - watch
  - apiGroups:
    - storage.k8s.io
    resources:
    - storageclasses
    verbs:
    - get
    - list
    - watch
  - apiGroups:
    - ""
    resources:
    - persistentvolumeclaims
    verbs:
    - get
    - list
    - patch
    - update
    - watch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
-  name: {{ .Release.Name }}-user-scheduler-complementary
+  name: {{ .Release.Name }}-user-scheduler
   labels:
-    {{- $_ := merge (dict "componentSuffix" "-complementary") . }}
-    {{- include "jupyterhub.labels" $_ | nindent 4 }}
+    {{- include "jupyterhub.labels" . 
| nindent 4 }} subjects: - kind: ServiceAccount name: user-scheduler namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: {{ .Release.Name }}-user-scheduler-complementary + name: {{ .Release.Name }}-user-scheduler apiGroup: rbac.authorization.k8s.io {{- end }} {{- end }} diff --git a/jupyterhub/values.yaml b/jupyterhub/values.yaml index d72d6a852e..9c417203bc 100644 --- a/jupyterhub/values.yaml +++ b/jupyterhub/values.yaml @@ -326,8 +326,8 @@ scheduling: ## to breaking changes in the kube-scheduler binary. policy: {} image: - name: gcr.io/google_containers/kube-scheduler-amd64 - tag: v1.16.11 + name: k8s.gcr.io/kube-scheduler + tag: v1.19.1 nodeSelector: {} pdb: enabled: true From 70f166b48628ddaf393871d660940fc9ec0d32e8 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 13 Sep 2020 00:21:31 +0200 Subject: [PATCH 2/7] user-scheduler: use k8s 1.19 kube-scheduler binary and its modern config --- .../files/userscheduler-defaultpolicy.yaml | 76 ------------------- .../scheduling/user-scheduler/configmap.yaml | 19 ++++- .../scheduling/user-scheduler/deployment.yaml | 13 +++- jupyterhub/values.yaml | 10 +-- 4 files changed, 28 insertions(+), 90 deletions(-) delete mode 100644 jupyterhub/files/userscheduler-defaultpolicy.yaml diff --git a/jupyterhub/files/userscheduler-defaultpolicy.yaml b/jupyterhub/files/userscheduler-defaultpolicy.yaml deleted file mode 100644 index 455a9abeee..0000000000 --- a/jupyterhub/files/userscheduler-defaultpolicy.yaml +++ /dev/null @@ -1,76 +0,0 @@ -{ - "kind": "Policy", - "apiVersion": "v1", - "predicates": [ - { "name": "PodFitsResources" }, - { "name": "HostName" }, - { "name": "PodFitsHostPorts" }, - { "name": "MatchNodeSelector" }, - { "name": "NoDiskConflict" }, - { "name": "PodToleratesNodeTaints" }, - { "name": "MaxEBSVolumeCount" }, - { "name": "MaxGCEPDVolumeCount" }, - { "name": "MaxAzureDiskVolumeCount" }, - { "name": "MaxCSIVolumeCountPred" }, - { "name": "CheckVolumeBinding" }, - { "name": "NoVolumeZoneConflict" }, - { "name": "MatchInterPodAffinity" } - ], - "priorities": [ - { "name": "NodePreferAvoidPodsPriority", "weight": 161051 }, - { "name": "NodeAffinityPriority", "weight": 14641 }, - { "name": "InterPodAffinityPriority", "weight": 1331 }, - { "name": "MostRequestedPriority", "weight": 121 }, - { "name": "ImageLocalityPriority", "weight": 11} - ], - "hardPodAffinitySymmetricWeight" : 100, - "alwaysCheckAllPredicates" : false -} - -# # Notes about ranges -# ImageLocalityPriority - ranges from 0-10 * 11 -# MostRequestedPriority - ranges from 0-10 * 11^2 -# InterPodAffinityPriority - ranges from 0-1 * 11^3 (i guess) -# NodeAffinityPriority - ranges from 0-1 * 11^4 (i guess) -# NodePreferAvoidPodsPriority - ranges from 0-1 * 11^5 (i guess) - -# # Notes about the GeneralPredicates -# The following predicates was not found by kube-scheduler 1.11.1-beta.0 -# { "name": "CheckNodePIDPressure" }, -# { "name": "CheckNodeUnschedulable" }, -# { "name": "CheckNodeCondition" }, -# { "name": "General" }, -# { "name": "PodToleratesNodeNoExecuteTaints" }, -# { "name": "CheckNodeMemoryPressure" }, -# { "name": "CheckNodeDiskPressure" }, - -# # Notes about the priorities -# NodePreferAvoidPodsPriority: What does this really mean? -# HardPodAffinitySymmetricWeight: "It represents the weight of implicit -# PreferredDuringScheduling affinity rule." - preferred node affinity or preferred -# pod/anti-pod affinity or those affinities in general? How does this relate to -# the InterPodAffinityPriority and NodeAffinityPriority? 
- -# AlwaysCheckAllPredicates: scheduler checks all the configured predicates even -# after one or more of them fails. - -# GeneralPredicates checks whether noncriticalPredicates and EssentialPredicates -# pass. noncriticalPredicates are the predicates that only non-critical pods need -# and EssentialPredicates are the predicates that all pods, including critical -# pods, need - -# MostRequestedPriority: Is using the default MostRequestedPriorityMap that is a -# priority function that favors nodes with most requested resources. It calculates -# the percentage of memory and CPU requested by pods scheduled on the node, and -# prioritizes based on the maximum of the average of the fraction of requested to -# capacity. - -# Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / -# capacity)) / 2 - -# ImageLocalityPriorityMap is a priority function that favors nodes that already -# have requested pod container's images. It will detect whether the requested -# images are present on a node, and then calculate a score ranging from 0 to 10 -# based on the total size of those images. - If none of the images are present, -# this node will be given the lowest priority. - If some of the images are present -# on a node, the larger their sizes' sum, the higher the node's priority. diff --git a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml index 1689ad6d16..faed1f91bb 100644 --- a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml +++ b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml @@ -6,6 +6,21 @@ metadata: labels: {{- include "jupyterhub.labels" . | nindent 4 }} data: - {{- $defaultPolicy := .Files.Get "files/userscheduler-defaultpolicy.yaml" | fromYaml }} - policy.cfg: {{ .Values.scheduling.userScheduler.policy | default $defaultPolicy | toJson | quote }} + # ref: https://kubernetes.io/docs/reference/scheduling/config/ + config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1beta1 + kind: KubeSchedulerConfiguration + profiles: + - schedulerName: {{ .Release.Name }}-user-scheduler + plugins: + {{- if .Values.scheduling.userScheduler.plugins }} + {{- .Values.scheduling.userScheduler.plugins | toYaml | trimSuffix "\n" | nindent 10 }} + {{- else }} + score: + disabled: + - name: NodeResourcesLeastAllocated + - name: NodeResourcesBalancedAllocation + enabled: + - name: NodeResourcesMostAllocated + {{- end }} {{- end }} diff --git a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml index 733ad354fc..9006430cd1 100644 --- a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml +++ b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml @@ -25,16 +25,23 @@ spec: {{- end }} nodeSelector: {{ toJson .Values.scheduling.userScheduler.nodeSelector }} {{- include "jupyterhub.coreAffinity" . | nindent 6 }} + volumes: + - name: config + configMap: + name: user-scheduler containers: - name: user-scheduler image: {{ include "jupyterhub.scheduler.image" . 
}}
          command:
            # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/
            - /usr/local/bin/kube-scheduler
-           - --scheduler-name={{ .Release.Name }}-user-scheduler
-           - --policy-configmap=user-scheduler
-           - --policy-configmap-namespace={{ .Release.Namespace }}
+           - --config=/etc/user-scheduler/config.yaml
+           - --leader-elect-resource-name=user-scheduler
+           - --leader-elect-resource-namespace={{ .Release.Namespace }}
            - --v={{ .Values.scheduling.userScheduler.logLevel | default 4 }}
+         volumeMounts:
+           - mountPath: /etc/user-scheduler
+             name: config
          livenessProbe:
            httpGet:
              path: /healthz
diff --git a/jupyterhub/values.yaml b/jupyterhub/values.yaml
index 9c417203bc..f49d7d159a 100644
--- a/jupyterhub/values.yaml
+++ b/jupyterhub/values.yaml
@@ -316,15 +316,7 @@ scheduling:
     enabled: true
     replicas: 2
     logLevel: 4
-    ## policy:
-    ## Allows you to provide custom YAML/JSON to render into a JSON policy.cfg,
-    ## a configuration file for the kube-scheduler binary.
-    ## NOTE: The kube-scheduler binary in the kube-scheduler image we are
-    ## currently using may be version bumped. It would for example happen if we
-    ## increase the lowest supported k8s version for the helm chart. At this
-    ## point, the provided policy.cfg may require a change along with that due
-    ## to breaking changes in the kube-scheduler binary.
-    policy: {}
+    plugins: {}
     image:
       name: k8s.gcr.io/kube-scheduler
       tag: v1.19.1

From d46233c5c3009f6058b1cd4edd474d75d4612627 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 13 Sep 2020 00:22:50 +0200
Subject: [PATCH 3/7] docs: remove outdated instruction in CONTRIBUTING.md

---
 CONTRIBUTING.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 718a56403f..f2ff17f4f9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -122,9 +122,6 @@
docker images to be pushed to a dedicated registry before they can be accessed
by the pods in the Kubernetes cluster, until
[this issue](https://github.com/rancher/k3d/issues/113) is resolved.

-For this setup to work, make `registry.local` point to `127.0.0.1` (localhost)
-by adding an entry in `/etc/hosts` or its equivalent in Windows. 
__Install__

```shell

From 364ac0f04035440ff381c9ecedcbc849009c7962 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 13 Sep 2020 01:57:47 +0200
Subject: [PATCH 4/7] user-scheduler: fixes to kube-scheduler modern config

---
 .../scheduling/user-scheduler/configmap.yaml       |  4 ++++
 .../scheduling/user-scheduler/deployment.yaml      |  7 +++++--
 .../templates/scheduling/user-scheduler/rbac.yaml  | 15 ++++++++-------
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
index faed1f91bb..ec87706f1a 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
@@ -10,6 +10,10 @@ data:
   config.yaml: |
     apiVersion: kubescheduler.config.k8s.io/v1beta1
     kind: KubeSchedulerConfiguration
+    leaderElection:
+      resourceLock: endpoints
+      resourceName: user-scheduler-lock
+      resourceNamespace: {{ .Release.Namespace }}
     profiles:
       - schedulerName: {{ .Release.Name }}-user-scheduler
         plugins:
diff --git a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
index 9006430cd1..2a692c5d63 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
@@ -33,11 +33,14 @@ spec:
        - name: user-scheduler
          image: {{ include "jupyterhub.scheduler.image" . }}
          command:
+           # NOTE: --leader-elect- flags are silently ignored in favor of what's
+           #       defined in the passed KubeSchedulerConfiguration.
+           #
+           # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/
            # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/
            - /usr/local/bin/kube-scheduler
+           - --scheduler-name={{ .Release.Name }}-user-scheduler
            - --config=/etc/user-scheduler/config.yaml
-           - --leader-elect-resource-name=user-scheduler
-           - --leader-elect-resource-namespace={{ .Release.Namespace }}
            - --v={{ .Values.scheduling.userScheduler.logLevel | default 4 }}
           volumeMounts:
             - mountPath: /etc/user-scheduler
               name: config
diff --git a/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml b/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml
index 514396aab9..4c15ed62c2 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/rbac.yaml
@@ -16,10 +16,10 @@ metadata:
 rules:
   # Copied from the system:kube-scheduler ClusterRole of the k8s version
   # matching the kube-scheduler binary we use. A modification of two resource
-  # name references from kube-scheduler to user-scheduler was made.
+  # name references from kube-scheduler to user-scheduler-lock was made.
   #
-  # NOTE: These have been unchanged between 1.12 and 1.15, but changed in 1.16
-  # and in 1.17, but not 1.18 and 1.19.
+  # NOTE: These rules have been unchanged between 1.12 and 1.15, then changed in
+  # 1.16 and in 1.17, but unchanged in 1.18 and 1.19. 
#
  # ref: https://github.com/kubernetes/kubernetes/blob/v1.19.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L696-L829
  - apiGroups:
@@ -40,7 +40,7 @@ rules:
  - apiGroups:
    - coordination.k8s.io
    resourceNames:
-    - user-scheduler
+    - user-scheduler-lock
    resources:
    - leases
    verbs:
@@ -55,7 +55,7 @@ rules:
  - apiGroups:
    - ""
    resourceNames:
-    - user-scheduler
+    - user-scheduler-lock
    resources:
    - endpoints
    verbs:
@@ -157,8 +157,9 @@ rules:
    - watch

  # Copied from the system:volume-scheduler ClusterRole of the k8s version
-  # matching the kube-scheduler binary we use. These have not changed between
-  # 1.12 and 1.19.
+  # matching the kube-scheduler binary we use.
+  #
+  # NOTE: These rules have not changed between 1.12 and 1.19.
  #
  # ref: https://github.com/kubernetes/kubernetes/blob/v1.19.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L1213-L1240
  - apiGroups:

From 902fbd6987f9ec17ba69f773eaa0b49f9f0945e8 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 13 Sep 2020 03:36:44 +0200
Subject: [PATCH 5/7] user-scheduler: fallback to modern kube-scheduler version and config

---
 jupyterhub/files/userscheduler-defaultpolicy.yaml  | 76 +++++++++++++++++++
 .../scheduling/user-scheduler/configmap.yaml       |  3 +
 .../scheduling/user-scheduler/deployment.yaml      | 23 +++++++++++++++++++++---
 3 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 jupyterhub/files/userscheduler-defaultpolicy.yaml

diff --git a/jupyterhub/files/userscheduler-defaultpolicy.yaml b/jupyterhub/files/userscheduler-defaultpolicy.yaml
new file mode 100644
index 0000000000..455a9abeee
--- /dev/null
+++ b/jupyterhub/files/userscheduler-defaultpolicy.yaml
@@ -0,0 +1,76 @@
+{
+  "kind": "Policy",
+  "apiVersion": "v1",
+  "predicates": [
+    { "name": "PodFitsResources" },
+    { "name": "HostName" },
+    { "name": "PodFitsHostPorts" },
+    { "name": "MatchNodeSelector" },
+    { "name": "NoDiskConflict" },
+    { "name": "PodToleratesNodeTaints" },
+    { "name": "MaxEBSVolumeCount" },
+    { "name": "MaxGCEPDVolumeCount" },
+    { "name": "MaxAzureDiskVolumeCount" },
+    { "name": "MaxCSIVolumeCountPred" },
+    { "name": "CheckVolumeBinding" },
+    { "name": "NoVolumeZoneConflict" },
+    { "name": "MatchInterPodAffinity" }
+  ],
+  "priorities": [
+    { "name": "NodePreferAvoidPodsPriority", "weight": 161051 },
+    { "name": "NodeAffinityPriority", "weight": 14641 },
+    { "name": "InterPodAffinityPriority", "weight": 1331 },
+    { "name": "MostRequestedPriority", "weight": 121 },
+    { "name": "ImageLocalityPriority", "weight": 11}
+  ],
+  "hardPodAffinitySymmetricWeight" : 100,
+  "alwaysCheckAllPredicates" : false
+}
+
+# # Notes about ranges
+# ImageLocalityPriority - ranges from 0-10 * 11
+# MostRequestedPriority - ranges from 0-10 * 11^2
+# InterPodAffinityPriority - ranges from 0-1 * 11^3 (i guess)
+# NodeAffinityPriority - ranges from 0-1 * 11^4 (i guess)
+# NodePreferAvoidPodsPriority - ranges from 0-1 * 11^5 (i guess)
+
+# # Notes about the GeneralPredicates
+# The following predicates was not found by kube-scheduler 1.11.1-beta.0
+# { "name": "CheckNodePIDPressure" },
+# { "name": "CheckNodeUnschedulable" },
+# { "name": "CheckNodeCondition" },
+# { "name": "General" },
+# { "name": "PodToleratesNodeNoExecuteTaints" },
+# { "name": "CheckNodeMemoryPressure" },
+# { "name": "CheckNodeDiskPressure" },
+
+# # Notes about the priorities
+# NodePreferAvoidPodsPriority: What does this really mean?
+# HardPodAffinitySymmetricWeight: "It represents the weight of implicit
+# PreferredDuringScheduling affinity rule." 
- preferred node affinity or preferred
+# pod/anti-pod affinity or those affinities in general? How does this relate to
+# the InterPodAffinityPriority and NodeAffinityPriority?
+
+# AlwaysCheckAllPredicates: scheduler checks all the configured predicates even
+# after one or more of them fails.
+
+# GeneralPredicates checks whether noncriticalPredicates and EssentialPredicates
+# pass. noncriticalPredicates are the predicates that only non-critical pods need
+# and EssentialPredicates are the predicates that all pods, including critical
+# pods, need
+
+# MostRequestedPriority: Is using the default MostRequestedPriorityMap that is a
+# priority function that favors nodes with most requested resources. It calculates
+# the percentage of memory and CPU requested by pods scheduled on the node, and
+# prioritizes based on the maximum of the average of the fraction of requested to
+# capacity.
+
+# Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) /
+# capacity)) / 2
+
+# ImageLocalityPriorityMap is a priority function that favors nodes that already
+# have requested pod container's images. It will detect whether the requested
+# images are present on a node, and then calculate a score ranging from 0 to 10
+# based on the total size of those images. - If none of the images are present,
+# this node will be given the lowest priority. - If some of the images are present
+# on a node, the larger their sizes' sum, the higher the node's priority.
diff --git a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
index ec87706f1a..003260c1ef 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/configmap.yaml
@@ -27,4 +27,7 @@ data:
         enabled:
           - name: NodeResourcesMostAllocated
       {{- end }}
+
+  {{- $defaultPolicy := .Files.Get "files/userscheduler-defaultpolicy.yaml" | fromYaml }}
+  policy.cfg: {{ .Values.scheduling.userScheduler.policy | default $defaultPolicy | toJson | quote }}
 {{- end }}
diff --git a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
index 2a692c5d63..4c33105c12 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
@@ -31,16 +31,31 @@ spec:
          name: user-scheduler
      containers:
        - name: user-scheduler
-         image: {{ include "jupyterhub.scheduler.image" . }}
+         # NOTE: When kube-scheduler 1.17+ fail to find CSINode resource in the
+         #       cluster, they fail to start up. Due to this, we fall back to the
+         #       latest functional version with its legacy configuration.
+         {{- if .Capabilities.APIVersions.Has "storage.k8s.io/v1/CSINode" }}
+         image: {{ .Values.scheduling.userScheduler.image.name }}:{{ .Values.scheduling.userScheduler.image.tag }}
+         {{- else }}
+         image: {{ .Values.scheduling.userScheduler.image.name }}:v1.16.15
+         {{- end }}
          command:
+           # NOTE: --leader-elect-... (new) and --lock-object-... (deprecated)
+           #       flags are silently ignored in favor of what's defined in the
+           #       passed KubeSchedulerConfiguration whenever --config is
+           #       passed. 
#
+           # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/
            # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/
            - /usr/local/bin/kube-scheduler
+           {{- if .Capabilities.APIVersions.Has "storage.k8s.io/v1/CSINode" }}
            - --config=/etc/user-scheduler/config.yaml
+           {{- else }}
+           - --scheduler-name={{ .Release.Name }}-user-scheduler
+           - --policy-config-file=/etc/user-scheduler/policy.cfg
+           - --lock-object-name=user-scheduler-lock
+           - --lock-object-namespace={{ .Release.Namespace }}
+           {{- end }}
            - --v={{ .Values.scheduling.userScheduler.logLevel | default 4 }}
          volumeMounts:
            - mountPath: /etc/user-scheduler
              name: config

From 4d0de4eee2ff575a1ee091d3199947eeb1552f5a Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 13 Sep 2020 04:12:17 +0200
Subject: [PATCH 6/7] user-scheduler: fallback note

---
 .../scheduling/user-scheduler/deployment.yaml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
index 4c33105c12..ddc92a2cd9 100644
--- a/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
+++ b/jupyterhub/templates/scheduling/user-scheduler/deployment.yaml
@@ -31,23 +31,24 @@ spec:
      containers:
        - name: user-scheduler
-         # NOTE: When kube-scheduler 1.17+ fail to find CSINode resource in the
-         #       cluster, they fail to start up. Due to this, we fall back to the
-         #       latest functional version with its legacy configuration.
+         # NOTE: When the kube-scheduler 1.17+ binaries fail to find CSINode
+         #       resource in the cluster, they won't start scheduling. Due to
+         #       this, we fall back to the latest functional version with its
+         #       legacy configuration format. This fallback can be removed when
+         #       we assume k8s 1.17 where CSINode is generally available.
          {{- if .Capabilities.APIVersions.Has "storage.k8s.io/v1/CSINode" }}
          image: {{ .Values.scheduling.userScheduler.image.name }}:{{ .Values.scheduling.userScheduler.image.tag }}
          {{- else }}
          image: {{ .Values.scheduling.userScheduler.image.name }}:v1.16.15
          {{- end }}
          command:
+           - /usr/local/bin/kube-scheduler
            # NOTE: --leader-elect-... (new) and --lock-object-... (deprecated)
            #       flags are silently ignored in favor of what's defined in the
            #       passed KubeSchedulerConfiguration whenever --config is
            #       passed. 
# # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/ - # ref: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/ - - /usr/local/bin/kube-scheduler {{- if .Capabilities.APIVersions.Has "storage.k8s.io/v1/CSINode" }} - --config=/etc/user-scheduler/config.yaml {{- else }} From dda741a0a0bad83cea1df0d77d29d8b98231ce6e Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 13 Sep 2020 04:22:37 +0200 Subject: [PATCH 7/7] user-scheduler: removed outdated helper function --- .../templates/scheduling/user-scheduler/_helpers.tpl | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 jupyterhub/templates/scheduling/user-scheduler/_helpers.tpl diff --git a/jupyterhub/templates/scheduling/user-scheduler/_helpers.tpl b/jupyterhub/templates/scheduling/user-scheduler/_helpers.tpl deleted file mode 100644 index 2cb9fdf4a9..0000000000 --- a/jupyterhub/templates/scheduling/user-scheduler/_helpers.tpl +++ /dev/null @@ -1,12 +0,0 @@ -{{- /* -Renders the kube-scheduler's image based on .Values.scheduling.userScheduler.name and -optionally on .Values.scheduling.userScheduler.tag. The default tag is set to the clusters -kubernetes version. -*/}} -{{- define "jupyterhub.scheduler.image" -}} -{{- $name := .Values.scheduling.userScheduler.image.name -}} -{{- $valuesVersion := .Values.scheduling.userScheduler.image.tag -}} -{{- $clusterVersion := (split "-" .Capabilities.KubeVersion.GitVersion)._0 -}} -{{- $tag := $valuesVersion | default $clusterVersion -}} -{{ $name }}:{{ $tag }} -{{- end }}
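
A note on configuring the result of this series: the `scheduling.userScheduler.plugins` value added in PATCH 2 is rendered verbatim into `profiles[0].plugins` of the KubeSchedulerConfiguration (via `toYaml | nindent 10` in configmap.yaml), replacing the chart's default score-plugin tweaks. Below is a minimal sketch of a chart-user values override exercising it. The plugin names are standard kube-scheduler 1.19 scheduling-framework plugins, but the `weight` values are illustrative assumptions, not chart defaults, and this override is untested.

```yaml
# Hypothetical values override for this chart, not a tested configuration.
# It reproduces the chart's default "pack the nodes tightly" scoring and
# additionally raises the weight given to image locality.
scheduling:
  userScheduler:
    plugins:
      score:
        # Turn off the default spreading-oriented score plugins.
        disabled:
          - name: NodeResourcesLeastAllocated
          - name: NodeResourcesBalancedAllocation
        # Prefer nodes that are already heavily requested, so that idle nodes
        # can scale down, and prefer nodes that already hold the user image.
        enabled:
          - name: NodeResourcesMostAllocated
            weight: 16
          - name: ImageLocality
            weight: 4
```

On clusters where `storage.k8s.io/v1/CSINode` is not served, the chart instead starts the v1.16.15 binary with the legacy `policy.cfg` (the fallback from PATCH 5), and a `plugins` value like the one sketched above would have no effect there.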