Skip to content

Commit

Permalink
Add checks for some workflow jobs (huggingface#18583)
Browse files Browse the repository at this point in the history
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
  • Loading branch information
2 people authored and oneraghavan committed Sep 26, 2022
1 parent dfe9f00 commit 4615cd3
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 21 deletions.
34 changes: 30 additions & 4 deletions .github/workflows/self-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,24 @@ jobs:
echo "::set-output name=matrix::$keys"
echo "::set-output name=test_map::$test_map"
# Pre-flight job: runs `nvidia-smi` inside the test container on one
# single-gpu and one multi-gpu self-hosted runner before any tests start.
run_check_runners:
  name: Check Runners
  needs: setup
  strategy:
    matrix:
      # One job instance per runner class.
      machine_type: [single-gpu, multi-gpu]
  # The matrix value is used as the third runner label.
  runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
  container:
    image: huggingface/transformers-all-latest-gpu
    # NOTE(review): `--gpus 0` is applied to both matrix entries, including
    # multi-gpu -- confirm this is intended for the multi-gpu check.
    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: NVIDIA-SMI
      # The script body must be indented deeper than `run:` to belong to the
      # literal block scalar.
      run: |
        nvidia-smi
run_tests_single_gpu:
name: Model tests
needs: setup
needs: [setup, run_check_runners]
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
Expand Down Expand Up @@ -198,7 +213,7 @@ jobs:

run_tests_multi_gpu:
name: Model tests
needs: setup
needs: [setup, run_check_runners]
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
Expand Down Expand Up @@ -285,7 +300,7 @@ jobs:

run_tests_torch_cuda_extensions_single_gpu:
name: Torch CUDA extension tests
needs: setup
needs: [setup, run_check_runners]
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
Expand Down Expand Up @@ -364,7 +379,7 @@ jobs:

run_tests_torch_cuda_extensions_multi_gpu:
name: Torch CUDA extension tests
needs: setup
needs: [setup, run_check_runners]
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
Expand Down Expand Up @@ -447,12 +462,20 @@ jobs:
if: always()
needs: [
setup,
run_check_runners,
run_tests_single_gpu,
run_tests_multi_gpu,
run_tests_torch_cuda_extensions_single_gpu,
run_tests_torch_cuda_extensions_multi_gpu
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Setup status: ${{ needs.setup.result }}"
echo "Runner status: ${{ needs.run_check_runners.result }}"
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
Expand Down Expand Up @@ -498,6 +521,9 @@ jobs:
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}

# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
Expand Down
67 changes: 50 additions & 17 deletions utils/notification_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,28 +387,52 @@ def payload(self) -> str:
return json.dumps(blocks)

@staticmethod
def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
    """Post a failure notice to the CI Slack report channel.

    The message consists of a header with ``title``, an optional markdown
    section with ``ci_title``, a header describing what went wrong, and a
    section carrying a button that links to the current GitHub Actions run.

    Args:
        title: Text of the leading header block.
        ci_title: Optional markdown text shown below the title; the section
            is omitted when this is empty.
        setup_failed: When true, the message reports that the setup job
            failed. Takes precedence over ``runner_failed``.
        runner_failed: When true (and ``setup_failed`` is false), the message
            reports that the CI runners have problems.

    Raises:
        KeyError: If ``GITHUB_RUN_ID`` or ``CI_SLACK_REPORT_CHANNEL_ID`` is
            missing from the environment.

    Note:
        ``client`` is not defined in this block; it is expected to be a
        Slack client object provided elsewhere in the file.
    """
    # Pick the failure description; the same string is also sent as the
    # message's `text` argument.
    if setup_failed:
        text = "💔 Setup job failed. Tests are not run. 😭"
    elif runner_failed:
        text = "💔 CI runners have problems! Tests are not run. 😭"
    else:
        text = "💔 There was an issue running the tests. 😭"

    blocks = [{"type": "header", "text": {"type": "plain_text", "text": title}}]

    if ci_title:
        blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": ci_title}})

    error_block_1 = {
        "type": "header",
        "text": {
            "type": "plain_text",
            "text": text,
        },
    }
    error_block_2 = {
        "type": "section",
        "text": {
            "type": "plain_text",
            "text": "🙏 Let's fix it ASAP! 🙏",
        },
        "accessory": {
            "type": "button",
            "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
            "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
        },
    }
    blocks.extend([error_block_1, error_block_2])

    # The blocks are handed to the client as a JSON string.
    payload = json.dumps(blocks)

    print("Sending the following payload")
    print(json.dumps({"blocks": blocks}))

    client.chat_postMessage(
        channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
        text=text,
        blocks=payload,
    )

Expand Down Expand Up @@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True):

if __name__ == "__main__":

setup_status = os.environ.get("SETUP_STATUS")
runner_status = os.environ.get("RUNNER_STATUS")
setup_failed = True if setup_status is not None and setup_status != "success" else False
runner_failed = True if runner_status is not None and runner_status != "success" else False

org = "huggingface"
repo = "transformers"
repository_full_name = f"{org}/{repo}"
Expand Down Expand Up @@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
else:
ci_title = ""

if setup_failed or runner_failed:
Message.error_out(title, ci_title, setup_failed, runner_failed)
exit(0)

arguments = sys.argv[1:][0]
try:
models = ast.literal_eval(arguments)
Expand Down

0 comments on commit 4615cd3

Please sign in to comment.