diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index bb397bc8574829..d0efae8b479844 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -111,9 +111,24 @@ jobs: echo "::set-output name=matrix::$keys" echo "::set-output name=test_map::$test_map" + run_check_runners: + name: Check Runners + needs: setup + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + run_tests_single_gpu: name: Model tests - needs: setup + needs: [setup, run_check_runners] # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true strategy: @@ -198,7 +213,7 @@ jobs: run_tests_multi_gpu: name: Model tests - needs: setup + needs: [setup, run_check_runners] # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true strategy: @@ -285,7 +300,7 @@ jobs: run_tests_torch_cuda_extensions_single_gpu: name: Torch CUDA extension tests - needs: setup + needs: [setup, run_check_runners] if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: fail-fast: false @@ -364,7 +379,7 @@ jobs: run_tests_torch_cuda_extensions_multi_gpu: name: Torch CUDA extension tests - needs: setup + needs: [setup, run_check_runners] if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: fail-fast: false @@ -447,12 +462,20 @@ jobs: if: always() needs: [ setup, + run_check_runners, run_tests_single_gpu, run_tests_multi_gpu, run_tests_torch_cuda_extensions_single_gpu, run_tests_torch_cuda_extensions_multi_gpu ] steps: + - name: 
Preliminary job status + shell: bash + # Print the results of the `setup` and `run_check_runners` jobs so they are visible in this job's log + run: | + echo "Setup status: ${{ needs.setup.result }}" + echo "Runner status: ${{ needs.run_check_runners.result }}" + # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) - name: Prepare custom environment variables @@ -498,6 +521,9 @@ jobs: CI_TITLE_PUSH: ${{ github.event.head_commit.message }} CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} CI_SHA: ${{ env.CI_SHA }} + SETUP_STATUS: ${{ needs.setup.result }} + RUNNER_STATUS: ${{ needs.run_check_runners.result }} + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | diff --git a/utils/notification_service.py b/utils/notification_service.py index 4918b4a459ac38..9ed97236d46270 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -387,28 +387,52 @@ def payload(self) -> str: return json.dumps(blocks) @staticmethod - def error_out(): - payload = [ - { - "type": "section", - "text": { - "type": "plain_text", - "text": "There was an issue running the tests.", - }, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, - "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}", - }, - } - ] + def error_out(title, ci_title="", setup_failed=False, runner_failed=False): + + blocks = [] + title_block = {"type": "header", "text": {"type": "plain_text", "text": title}} + blocks.append(title_block) + + if ci_title: + ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}} + blocks.append(ci_title_block) + + if setup_failed: + text = "💔 Setup job failed. 
Tests are not run. 😭" + elif runner_failed: + text = "💔 CI runners have problems! Tests are not run. 😭" + else: + text = "💔 There was an issue running the tests. 😭" + + error_block_1 = { + "type": "header", + "text": { + "type": "plain_text", + "text": text, + }, + } + error_block_2 = { + "type": "section", + "text": { + "type": "plain_text", + "text": "🙏 Let's fix it ASAP! 🙏", + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, + "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}", + }, + } + blocks.extend([error_block_1, error_block_2]) + + payload = json.dumps(blocks) print("Sending the following payload") - print(json.dumps({"blocks": json.loads(payload)})) + print(json.dumps({"blocks": blocks})) client.chat_postMessage( channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], - text="There was an issue running the tests.", + text=text, blocks=payload, ) @@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": + setup_status = os.environ.get("SETUP_STATUS") + runner_status = os.environ.get("RUNNER_STATUS") + setup_failed = True if setup_status is not None and setup_status != "success" else False + runner_failed = True if runner_status is not None and runner_status != "success" else False + org = "huggingface" repo = "transformers" repository_full_name = f"{org}/{repo}" @@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True): else: ci_title = "" + if setup_failed or runner_failed: + Message.error_out(title, ci_title, setup_failed, runner_failed) + exit(0) + arguments = sys.argv[1:][0] try: models = ast.literal_eval(arguments)