diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..617c1c1 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,19 @@ +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false + +[*.yaml] +indent_size = 2 + +[Makefile] +indent_style = tab +indent_size = 4 diff --git a/.gitignore b/.gitignore index bdc5af0..70db2cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *~ build +rootfs diff --git a/Dockerfile.template b/Dockerfile.template index f4da7d9..a6dd348 100644 --- a/Dockerfile.template +++ b/Dockerfile.template @@ -4,7 +4,7 @@ # BUILD: docker build --rm -t mumoshu/kube-airflow # SOURCE: https://github.com/mumoshu/kube-airflow -FROM debian:jessie +FROM debian:stretch MAINTAINER Yusuke KUOKA # Never prompts the user for choices on installation/configuration of packages @@ -21,12 +21,12 @@ ENV LANG en_US.UTF-8 ENV LC_ALL en_US.UTF-8 ENV LC_CTYPE en_US.UTF-8 ENV LC_MESSAGES en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 +ENV LC_ALL en_US.UTF-8 RUN set -ex \ && buildDeps=' \ - python-pip \ - python-dev \ + python3-pip \ + python3-dev \ libkrb5-dev \ libsasl2-dev \ libxml2-dev \ @@ -46,20 +46,21 @@ RUN set -ex \ curl \ netcat \ locales \ - && apt-get install -yqq -t jessie-backports python-requests libpq-dev \ + && apt-get install -yqq libpq-dev git \ && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ && locale-gen \ && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ && useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \ - && pip uninstall setuptools \ - && pip install setuptools==33.1.1 \ - && pip install pytz==2015.7 \ - && pip install cryptography \ - && pip install pyOpenSSL \ - && pip install ndg-httpsclient \ - && pip install pyasn1 \ - && pip install psycopg2 \ - && pip install airflow[celery,postgresql,hive]==$AIRFLOW_VERSION \ + && pip3 install --upgrade 
pip enum34 'setuptools!=36.0.0' \ + && pip3 install pytz==2015.7 \ + && pip3 install cryptography \ + && pip3 install requests \ + && pip3 install pyOpenSSL \ + && pip3 install ndg-httpsclient \ + && pip3 install pyasn1 \ + && pip3 install psycopg2 \ + && pip3 install airflow[celery,postgresql,hive] \ + && pip3 install click \ && apt-get remove --purge -yqq $buildDeps libpq-dev \ && apt-get clean \ && rm -rf \ @@ -75,10 +76,12 @@ ENV KUBECTL_VERSION %%KUBECTL_VERSION%% RUN curl -L -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl COPY script/entrypoint.sh ${AIRFLOW_HOME}/entrypoint.sh -COPY config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg +COPY config/airflow.cfg.in ${AIRFLOW_HOME}/airflow.cfg.in +COPY script/git-sync ${AIRFLOW_HOME}/git-sync RUN chown -R airflow: ${AIRFLOW_HOME} \ - && chmod +x ${AIRFLOW_HOME}/entrypoint.sh + && chmod +x ${AIRFLOW_HOME}/entrypoint.sh \ + && chmod +x ${AIRFLOW_HOME}/git-sync EXPOSE 8080 5555 8793 diff --git a/Makefile b/Makefile index 58465fc..5f19f2b 100644 --- a/Makefile +++ b/Makefile @@ -10,18 +10,42 @@ ALIAS ?= $(REPOSITORY):$(AIRFLOW_VERSION)-$(KUBECTL_VERSION) BUILD_ROOT ?= build/$(TAG) DOCKERFILE ?= $(BUILD_ROOT)/Dockerfile ROOTFS ?= $(BUILD_ROOT)/rootfs -AIRFLOW_CONF ?= $(BUILD_ROOT)/config/airflow.cfg +AIRFLOW_CONF ?= $(BUILD_ROOT)/config/airflow.cfg.in ENTRYPOINT_SH ?= $(BUILD_ROOT)/script/entrypoint.sh DOCKER_CACHE ?= docker-cache SAVED_IMAGE ?= $(DOCKER_CACHE)/image-$(AIRFLOW_VERSION)-$(KUBECTL_VERSION).tar NAMESPACE ?= airflow-dev +HELM_APPLICATION_NAME ?= airflow +HELM_CONFIG ?= config.yaml +CHART_LOCATION ?= ./airflow .PHONY: build clean clean: rm -Rf build +helm-install: + helm repo update + helm install $(CHART_LOCATION) \ + --version=v0.1.0 \ + --name=$(HELM_APPLICATION_NAME) \ + --namespace=$(NAMESPACE) \ + --debug \ + -f $(HELM_CONFIG) + +helm-upgrade: + helm upgrade -f $(HELM_CONFIG) \ + --debug \ + 
$(HELM_APPLICATION_NAME) \ + $(CHART_LOCATION) + +helm-ls: + helm ls --all $(HELM_APPLICATION_NAME) + +helm-uninstall: + helm del --purge $(HELM_APPLICATION_NAME) + build: $(DOCKERFILE) $(ROOTFS) $(AIRFLOW_CONF) $(ENTRYPOINT_SH) cd $(BUILD_ROOT) && docker build -t $(IMAGE) . && docker tag $(IMAGE) $(ALIAS) @@ -37,7 +61,7 @@ $(ROOTFS): $(BUILD_ROOT) $(AIRFLOW_CONF): $(BUILD_ROOT) mkdir -p $(shell dirname $(AIRFLOW_CONF)) - cp config/airflow.cfg $(AIRFLOW_CONF) + cp config/airflow.cfg.in $(AIRFLOW_CONF) $(ENTRYPOINT_SH): $(BUILD_ROOT) mkdir -p $(shell dirname $(ENTRYPOINT_SH)) diff --git a/README.md b/README.md index f7835a7..f3f2f5f 100644 --- a/README.md +++ b/README.md @@ -15,31 +15,21 @@ This is useful when you'd want: This repository contains: * **Dockerfile(.template)** of [airflow](https://github.com/apache/incubator-airflow) for [Docker](https://www.docker.com/) images published to the public [Docker Hub Registry](https://registry.hub.docker.com/). -* **airflow.all.yaml** for creating Kubernetes services and deployments to run Airflow on Kubernetes +* **airflow.all.yaml** for manually creating Kubernetes services and deployments to run Airflow on Kubernetes +* **Helm Chart** for deployments using Helm ## Informations * Highly inspired by the great work [puckel/docker-airflow](https://github.com/puckel/docker-airflow) -* Based on Debian Jessie official Image [debian:jessie](https://registry.hub.docker.com/_/debian/) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [RabbitMQ](https://hub.docker.com/_/rabbitmq/) as queue +* Based on Debian Stretch official Image [debian:stretch](https://registry.hub.docker.com/_/debian/) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [RabbitMQ](https://hub.docker.com/_/rabbitmq/) as queue * Following the Airflow release from [Python Package Index](https://pypi.python.org/pypi/airflow) -## Installation +## Manual Installation -Create all the deployments and 
services for Airflow: - - kubectl create -f airflow.all.yaml - -## Build - -`git clone` this repository and then just run: - - make build - -## Usage Create all the deployments and services to run Airflow on Kubernetes: - kubectl create -f airflow.all.yaml + kubectl create -f airflow.all.yaml It will create deployments for: @@ -57,6 +47,50 @@ and services for: * airflow-webserver * airflow-flower +## Helm Deployment + +Ensure your Helm installation is done; you may need to have `TILLER_NAMESPACE` set as an +environment variable. + +Deploy to Kubernetes using: + + make helm-install NAMESPACE=airflow + +Upgrade your installation with: + + make helm-upgrade + +Remove from the cluster using: + + make helm-uninstall + +### Helm ingresses + +The Chart provides ingress configuration to allow customizing the installation by adapting +the `config.yaml` depending on your setup. + +### Helm configuration customization + +Helm allows overriding the configuration to adapt to your environment. You probably want to specify +your own ingress configuration for instance. + +There is a "prefix" setting allowing you to prefix a string to all Kubernetes names. That allows +instantiating several independent Airflow deployments in the same namespace. 
+ +Note: + + Do NOT use characters such as " (double quote), ' (simple quote), / (slash) or \ (backslash) + in your passwords and prefix + + +## Build Docker image + +`git clone` this repository and then just run: + + make build + +## Run with minikube + You can browse the Airflow dashboard via running: make browse-web diff --git a/airflow/Chart.yaml b/airflow/Chart.yaml new file mode 100644 index 0000000..78d5951 --- /dev/null +++ b/airflow/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +description: Airflow installation +name: airflow +version: v0.1.0 diff --git a/airflow/templates/configmaps.yaml b/airflow/templates/configmaps.yaml new file mode 100644 index 0000000..8b8d06d --- /dev/null +++ b/airflow/templates/configmaps.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.airflow.prefix -}}airflow-cfg" +data: + AIRFLOW_HOME: "/usr/local/airflow" + {{- if .Values.airflow.fernet_key }} + FERNET_KEY: "{{- .Values.airflow.fernet_key -}}" + {{- end }} + RABBITMQ_HOST: "{{- .Values.airflow.prefix -}}{{- .Values.db.rabbitmq.basename -}}" + {{- if .Values.db.rabbitmq.user }} + RABBITMQ_CREDS: "{{- .Values.db.rabbitmq.user -}}:{{- .Values.db.rabbitmq.password -}}" + {{- end }} + POSTGRES_HOST: "{{- .Values.airflow.prefix -}}{{- .Values.db.postgres.basename -}}" + {{- if .Values.db.postgres.user }} + POSTGRES_CREDS: "{{- .Values.db.postgres.user -}}:{{- .Values.db.postgres.password -}}" + {{- end }} + {{- if .Values.airflow.init_retry_loop }} + TRY_LOOP: "{{ .Values.airflow.init_retry_loop -}}" + {{- end }} + FLOWER_URL_PREFIX: "{{ .Values.flower.url_prefix -}}" + AIRFLOW_URL_PREFIX: "{{ .Values.airflow.url_prefix -}}" + {{ if .Values.dags.git_sync_enabled }} + GIT_SYNC_REPO: "{{ .Values.dags.git_repo }}" + GIT_SYNC_BRANCH: "{{ .Values.dags.git_branch }}" + GIT_SYNC_WAIT: "{{ .Values.dags.poll_interval_sec }}" + GIT_SYNC_DEBUG: "{{ .Values.dags.git_sync_debug }}" + {{ end }} + LOAD_DAGS_EXAMPLES: "{{ .Values.dags.load_examples }}" diff 
--git a/airflow/templates/deployments.yaml b/airflow/templates/deployments.yaml new file mode 100644 index 0000000..0e50dfa --- /dev/null +++ b/airflow/templates/deployments.yaml @@ -0,0 +1,155 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}postgres +spec: + replicas: 1 + template: + metadata: + labels: + app: airflow + tier: db + spec: + containers: + - name: {{ .Values.airflow.prefix -}}postgres + image: postgres + ports: + - name: {{ .Values.airflow.prefix -}}postgres + containerPort: 5432 + env: + - name: POSTGRES_USER + value: "{{ .Values.db.postgres.user -}}" + - name: POSTGRES_PASSWORD + value: "{{ .Values.db.postgres.password -}}" + - name: POSTGRES_DB + value: "{{ .Values.db.postgres.database -}}" +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}rabbitmq +spec: + replicas: 1 + template: + metadata: + labels: + app: airflow + tier: rabbitmq + spec: + restartPolicy: Always + containers: + - name: {{ .Values.airflow.prefix -}}rabbitmq + image: rabbitmq:3-management + ports: + - name: {{ .Values.airflow.prefix -}}management + containerPort: 15672 + - name: {{ .Values.airflow.prefix -}}node + containerPort: 5672 + env: + - name: RABBITMQ_DEFAULT_USER + value: "{{ .Values.db.rabbitmq.user -}}" + - name: RABBITMQ_DEFAULT_PASS + value: "{{ .Values.db.rabbitmq.password -}}" + - name: RABBITMQ_DEFAULT_VHOST + value: "airflow" +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}web +spec: + replicas: 1 + template: + metadata: + labels: + app: airflow + tier: web + spec: + restartPolicy: Always + containers: + - name: {{ .Values.airflow.prefix -}}web + image: {{ .Values.airflow.image }} + imagePullPolicy: {{ .Values.airflow.image_pull_policy}} + envFrom: + - configMapRef: + name: {{ .Values.airflow.prefix }}airflow-cfg + ports: + - name: {{ .Values.airflow.prefix -}}web + containerPort: 8080 + args: 
["webserver"] +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}flower +spec: + replicas: 1 + template: + metadata: + labels: + app: airflow + tier: flower + spec: + restartPolicy: Always + containers: + - name: {{ .Values.airflow.prefix -}}flower + image: {{ .Values.airflow.image }} + imagePullPolicy: {{ .Values.airflow.image_pull_policy}} + env: + - name: FLOWER_PORT + value: "5555" + envFrom: + - configMapRef: + name: {{ .Values.airflow.prefix }}airflow-cfg + ports: + - name: {{ .Values.airflow.prefix -}}flower + containerPort: 5555 + args: ["flower"] +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}scheduler +spec: + replicas: 1 + template: + metadata: + labels: + app: airflow + tier: scheduler + spec: + restartPolicy: Always + containers: + - name: {{ .Values.airflow.prefix -}}scheduler + image: {{ .Values.airflow.image }} + imagePullPolicy: {{ .Values.airflow.image_pull_policy}} + # volumes: + # - /localpath/to/dags:/usr/local/airflow/dags + envFrom: + - configMapRef: + name: {{ .Values.airflow.prefix }}airflow-cfg + args: ["scheduler", "-n", "{{ .Values.airflow.scheduler_num_runs }}"] +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.airflow.prefix -}}worker +spec: + replicas: {{ .Values.celery.num_workers }} + template: + metadata: + labels: + app: airflow + tier: worker + spec: + restartPolicy: Always + containers: + - name: {{ .Values.airflow.prefix -}}worker + image: {{ .Values.airflow.image }} + imagePullPolicy: {{ .Values.airflow.image_pull_policy}} + # volumes: + # - /localpath/to/dags:/usr/local/airflow/dags + envFrom: + - configMapRef: + name: {{ .Values.airflow.prefix }}airflow-cfg + args: ["worker"] diff --git a/airflow/templates/ingresses.yaml b/airflow/templates/ingresses.yaml new file mode 100644 index 0000000..83ecc7d --- /dev/null +++ b/airflow/templates/ingresses.yaml @@ -0,0 +1,37 @@ +{{- if 
.Values.ingress.enabled -}} +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: {{ .Values.airflow.prefix -}}airflow + annotations: + {{ range $key, $value := .Values.ingress.annotations }} + {{ $key }}: {{ $value | quote }} + {{- end }} +spec: + rules: + - http: + paths: + - path: {{ .Values.ingress.path.web }} + backend: + serviceName: {{ .Values.airflow.prefix -}}web + servicePort: {{ .Values.airflow.prefix -}}pweb + host: {{ .Values.ingress.host }} +--- +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: {{ .Values.airflow.prefix -}}airflow-flower + annotations: + {{ range $key, $value := .Values.ingress.annotations }} + {{ $key }}: {{ $value | quote }} + {{- end }} +spec: + rules: + - http: + paths: + - path: {{ .Values.ingress.path.flower }} + backend: + serviceName: {{ .Values.airflow.prefix -}}flower + servicePort: {{ .Values.airflow.prefix -}}pflower + host: {{ .Values.ingress.host }} +{{- end }} diff --git a/airflow/templates/services.yaml b/airflow/templates/services.yaml new file mode 100644 index 0000000..a5514f2 --- /dev/null +++ b/airflow/templates/services.yaml @@ -0,0 +1,67 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.airflow.prefix -}}postgres +spec: + type: ClusterIP + selector: + app: airflow + tier: db + ports: + - name: {{ .Values.airflow.prefix -}}postgres + protocol: TCP + port: 5432 + targetPort: {{ .Values.airflow.prefix -}}postgres +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.airflow.prefix -}}rabbitmq +spec: + type: ClusterIP + selector: + app: airflow + tier: rabbitmq + ports: + - name: {{ .Values.airflow.prefix -}}node + protocol: TCP + port: 5672 + targetPort: {{ .Values.airflow.prefix -}}node + - name: {{ .Values.airflow.prefix -}}management + protocol: TCP + port: 15672 + targetPort: {{ .Values.airflow.prefix -}}management +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.airflow.prefix -}}web +spec: + type: NodePort + selector: + app: airflow + 
tier: web + ports: + - name: {{ .Values.airflow.prefix -}}pweb + protocol: TCP + port: 8080 + # name pweb is referenced by the ingress servicePort + targetPort: {{ .Values.airflow.prefix -}}web + nodePort: 32080 +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.airflow.prefix -}}flower +spec: + type: NodePort + selector: + app: airflow + tier: flower + ports: + - name: {{ .Values.airflow.prefix -}}pflower + protocol: TCP + port: 5555 + # name pflower is referenced by the ingress servicePort + targetPort: {{ .Values.airflow.prefix -}}flower + nodePort: 32081 diff --git a/airflow/values.yaml b/airflow/values.yaml new file mode 100644 index 0000000..7a84a1d --- /dev/null +++ b/airflow/values.yaml @@ -0,0 +1,54 @@ +airflow: + # common settings and setting for the webserver + # For prefix, use only lower case alphanumeric characters, '-' or '.', and must start and end + # with an alphanumeric character + prefix: "" + fernet_key: "" + service: + type: ClusterIP + baseUrl: "/airflow" + init_retry_loop: + image: mumoshu/kube-airflow:1.8.0.0-1.6.1 + # image_pull_policy: Always or IfNotPresent + image_pull_policy: Always + # Set scheduler_num_runs to -1 to loop indefinitely + scheduler_num_runs: "-1" + +celery: + num_workers: 1 + +ingress: + enabled: false + annotations: + host: "" + path: + web: /airflow + flower: /flower + +db: + rabbitmq: + # use_embedded == true means final hostname will be prefix + basename + use_embedded: true + user: airflow + password: airflow + basename: rabbitmq + database: airflow + postgres: + # use_embedded == true means final hostname will be prefix + basename + use_embedded: true + user: airflow + password: airflow + basename: postgres + database: airflow + +flower: + url_prefix: /flower + +dags: + git_sync_enabled: false + git_repo: + git_branch: master + poll_interval_sec: 60 + git_sync_debug: false + # Disable Load examples as soon as you enable git_repo + load_examples: true diff --git a/config.yaml b/config.yaml new file mode 100644 index 
0000000..89fc49f --- /dev/null +++ b/config.yaml @@ -0,0 +1,8 @@ +airflow: + # Duplicate this file and put your customization here + # Generate fernet_key with: + # python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)" + # fernet_key: ABCDABCDABCDABCDABCDABCDABCDABCDABCDABCD + # Note: + # do NOT use characters such as " (double quote), ' (simple quote), / (slash) or \ (backslash) + # in your passwords diff --git a/config/airflow.cfg b/config/airflow.cfg.in similarity index 95% rename from config/airflow.cfg rename to config/airflow.cfg.in index 2fcbd54..f79306a 100644 --- a/config/airflow.cfg +++ b/config/airflow.cfg.in @@ -27,7 +27,7 @@ executor = CeleryExecutor # The SqlAlchemy connection string to the metadata database. # SqlAlchemy supports many different database engine, more information # their website -sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow +sql_alchemy_conn = postgresql+psycopg2://{{ POSTGRES_CREDS }}@{{ POSTGRES_HOST }}/airflow # The SqlAlchemy pool size is the maximum number of database connections # in the pool. @@ -55,13 +55,13 @@ max_active_runs_per_dag = 16 # Whether to load the examples that ship with Airflow. It's good to # get started, but you probably want to set this to False in a production # environment -load_examples = True +load_examples = {{ LOAD_DAGS_EXAMPLES }} # Where your Airflow plugins are stored plugins_folder = /usr/local/airflow/plugins # Secret key to save connection passwords in the db -fernet_key = $FERNET_KEY +fernet_key = {{ FERNET_KEY }} # Whether to disable pickling dags donot_pickle = False @@ -141,10 +141,10 @@ worker_log_server_port = 8793 # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally # a sqlalchemy database. Refer to the Celery documentation for more # information. 
-broker_url = amqp://airflow:airflow@rabbitmq:5672/airflow +broker_url = amqp://{{ RABBITMQ_CREDS }}@{{ RABBITMQ_HOST }}:5672/airflow # Another key Celery setting -celery_result_backend = amqp://airflow:airflow@rabbitmq:5672/airflow +celery_result_backend = amqp://{{ RABBITMQ_CREDS }}@{{ RABBITMQ_HOST }}:5672/airflow # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start # it `airflow flower`. This defines the port that Celery Flower runs on diff --git a/script/entrypoint.sh b/script/entrypoint.sh index 19d75ab..ff29d4c 100644 --- a/script/entrypoint.sh +++ b/script/entrypoint.sh @@ -1,20 +1,37 @@ #!/usr/bin/env bash CMD="airflow" -TRY_LOOP="10" -POSTGRES_HOST="postgres" -POSTGRES_PORT="5432" -RABBITMQ_HOST="rabbitmq" -RABBITMQ_CREDS="airflow:airflow" -FERNET_KEY=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print FERNET_KEY") +TRY_LOOP="${TRY_LOOP:-10}" +POSTGRES_HOST="${POSTGRES_HOST:-postgres}" +POSTGRES_PORT=5432 +POSTGRES_CREDS="${POSTGRES_CREDS:-airflow:airflow}" +RABBITMQ_HOST="${RABBITMQ_HOST:-rabbitmq}" +RABBITMQ_CREDS="${RABBITMQ_CREDS:-airflow:airflow}" +RABBITMQ_MANAGEMENT_PORT=15672 +FLOWER_URL_PREFIX="${FLOWER_URL_PREFIX:-/}" +LOAD_DAGS_EXAMPLES="${LOAD_DAGS_EXAMPLES:-false}" + +if [ -z "$FERNET_KEY" ]; then + FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)") +fi + +echo "Postgres host: $POSTGRES_HOST" +echo "RabbitMQ host: $RABBITMQ_HOST" +echo # Generate Fernet key -sed -i "s/{FERNET_KEY}/${FERNET_KEY}/" $AIRFLOW_HOME/airflow.cfg +cp -f $AIRFLOW_HOME/airflow.cfg.in $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ FERNET_KEY }}/${FERNET_KEY}/" $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ POSTGRES_HOST }}/${POSTGRES_HOST}/" $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ POSTGRES_CREDS }}/${POSTGRES_CREDS}/" $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ RABBITMQ_HOST }}/${RABBITMQ_HOST}/" $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ 
RABBITMQ_CREDS }}/${RABBITMQ_CREDS}/" $AIRFLOW_HOME/airflow.cfg +sed -i "s/{{ LOAD_DAGS_EXAMPLES }}/${LOAD_DAGS_EXAMPLES}/" $AIRFLOW_HOME/airflow.cfg # wait for rabbitmq if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [ "$1" = "flower" ] ; then j=0 - while ! curl -sI -u $RABBITMQ_CREDS http://$RABBITMQ_HOST:15672/api/whoami |grep '200 OK'; do + while ! curl -sI -u $RABBITMQ_CREDS http://$RABBITMQ_HOST:$RABBITMQ_MANAGEMENT_PORT/api/whoami |grep '200 OK'; do j=`expr $j + 1` if [ $j -ge $TRY_LOOP ]; then echo "$(date) - $RABBITMQ_HOST still not reachable, giving up" @@ -44,4 +61,9 @@ if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; the sleep 5 fi -exec $CMD "$@" +if [ ! -z $GIT_SYNC_REPO ]; then + echo "Executing background task git-sync on repo $GIT_SYNC_REPO" + $AIRFLOW_HOME/git-sync --dest $AIRFLOW_HOME/dags & +fi + +$CMD "$@" diff --git a/script/git-sync b/script/git-sync new file mode 100644 index 0000000..2fdf525 --- /dev/null +++ b/script/git-sync @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +''' +Git-sync script + +Forked from : + + https://github.com/jlowin/git-sync/edit/master/git-sync.py + +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import datetime +import os +import shlex +import subprocess +import sys +import time + +import click +# try to be py2/3 compatible +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + + +def sh(*args, **kwargs): + """ Get subprocess output""" + return subprocess.check_output(*args, **kwargs).decode().strip() + + +def get_repo_at(dest): + if not os.path.exists(os.path.join(dest, '.git')): + raise ValueError('No repo found at {dest}'.format(**locals())) + + current_remote = sh( + shlex.split('git config --get remote.origin.url'), + cwd=dest) + + current_branch = sh( + shlex.split('git rev-parse --abbrev-ref HEAD'), 
+ cwd=dest) + + return current_remote.lower(), current_branch.lower() + +def setup_repo(repo, dest, branch): + """ + Clones `branch` of remote `repo` to `dest`, if it doesn't exist already. + Raises an error if a different repo or branch is found. + """ + dest = os.path.expanduser(dest) + + repo_name = urlparse(repo).path + + # if no git repo exists at dest, clone the requested repo + if not os.path.exists(os.path.join(dest, '.git')): + output = sh( + ['git', 'clone', '--no-checkout', '-b', branch, repo, dest]) + click.echo('Cloned ...{repo_name}'.format(**locals())) + + else: + # if there is a repo, make sure it's the right one + current_remote, current_branch = get_repo_at(dest) + repo = repo.lower() + if not repo.endswith('.git'): + repo += '.git' + if not current_remote.endswith('.git'): + current_remote += '.git' + parsed_remote = urlparse(current_remote) + parsed_repo = urlparse(repo) + + if (parsed_repo.netloc != parsed_remote.netloc or parsed_repo.path != parsed_remote.path): + raise ValueError( + 'Requested repo `...{repo_name}` but destination already ' + 'has a remote repo cloned: {current_remote}'.format(**locals())) + + # and check that the branches match as well + if branch.lower() != current_branch: + raise ValueError( + 'Requested branch `{branch}` but destination is ' + 'already on branch `{current_branch}`'.format(**locals())) + + # and check that we aren't going to overwrite any changes! 
+ # modified_status: uncommited modifications + # ahead_status: commited but not pushed + modified_status = sh(shlex.split('git status -s'), cwd=dest) + ahead_status = sh(shlex.split('git status -sb'), cwd=dest)[3:] + if modified_status: + raise ValueError( + 'There are uncommitted changes at {dest} that syncing ' + 'would overwrite'.format(**locals())) + if '[ahead ' in ahead_status: + raise ValueError( + 'This branch is ahead of the requested repo and syncing would ' + 'overwrite the changes: {ahead_status}'.format(**locals())) + + +def sync_repo(repo, dest, branch, rev): + """ + Syncs `branch` of remote `repo` (at `rev`) to `dest`. + Assumes `dest` has already been cloned. + """ + # fetch branch + output = sh(['git', 'fetch', 'origin', branch], cwd=dest) + click.echo('Fetched {branch}: {output}'.format(**locals())) + + # reset working copy + if not rev: + output = sh(['git', 'reset', '--hard', 'origin/' + branch], cwd=dest) + else: + output = sh(['git', 'reset', '--hard', rev], cwd=dest) + + # clean untracked files + sh(['git', 'clean', '-dfq'], cwd=dest) + + click.echo('Reset to {rev}: {output}'.format(**locals())) + + repo_name = urlparse(repo).path + click.echo( + 'Finished syncing {repo_name}:{branch} at {t:%Y-%m-%d %H:%M:%S}'.format( + t=datetime.datetime.now(), **locals())) + + +@click.command() +@click.option('--dest', '-d', envvar='GIT_SYNC_DEST', default=os.getcwd(), + help=('The destination path. Defaults to the current working directory; ' + 'can also be set with envvar GIT_SYNC_DEST.')) +@click.option('--repo', '-r', envvar='GIT_SYNC_REPO', default='', + help=('The url of the remote repo to sync. Defaults to inferring from `dest`; ' + 'can also be set with envvar GIT_SYNC_REPO.')) +@click.option('--branch', '-b', envvar='GIT_SYNC_BRANCH', default='', + help=('The branch to sync. 
Defaults to inferring from `repo` (if already cloned), ' + 'otherwise defaults to master; can also be set with envvar GIT_SYNC_BRANCH.')) +@click.option('--wait', '-w', envvar='GIT_SYNC_WAIT', default=60, + help=('The number of seconds to pause after each sync. Defaults to 60; ' + 'can also be set with envvar GIT_SYNC_WAIT.')) +@click.option('--run-once', '-1', envvar='GIT_SYNC_RUN_ONCE', is_flag=True, + help=("Run only once (don't loop). Defaults to off; can also be set with envvar " + "GIT_SYNC_RUN_ONCE=true.")) +@click.option('--rev', envvar='GIT_SYNC_REV', default=None, + help=('The revision to sync. Defaults to HEAD; can also be set with envvar ' + 'GIT_SYNC_REV.')) +@click.option('--debug', envvar='GIT_SYNC_DEBUG', is_flag=True, + help='Print tracebacks on error.') +def git_sync(repo, dest, branch, rev, wait, run_once, debug): + """ + Periodically syncs a remote git repository to a local directory. The sync + is one-way; any local changes will be lost. + """ + + if not debug: + sys.excepthook = ( + lambda etype, e, tb: print("{}: {}".format(etype.__name__, e))) + + # infer repo/branch + if not repo and not branch: + repo, branch = get_repo_at(dest) + elif not repo: + repo, _ = get_repo_at(dest) + elif not branch: + branch = 'master' + + setup_repo(repo, dest, branch) + while True: + sync_repo(repo, dest, branch, rev) + if run_once: + break + click.echo('Waiting {wait} seconds...'.format(**locals())) + time.sleep(wait) + + +if __name__ == '__main__': + git_sync()