Add checks for some workflow jobs #18583

Merged · 3 commits · Aug 16, 2022
34 changes: 30 additions & 4 deletions .github/workflows/self-push.yml
@@ -111,9 +111,24 @@ jobs:
           echo "::set-output name=matrix::$keys"
           echo "::set-output name=test_map::$test_map"

+  run_check_runners:
+    name: Check Runners
+    needs: setup
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
   run_tests_single_gpu:
     name: Model tests
-    needs: setup
+    needs: [setup, run_check_runners]
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -198,7 +213,7 @@ jobs:

   run_tests_multi_gpu:
     name: Model tests
-    needs: setup
+    needs: [setup, run_check_runners]
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -285,7 +300,7 @@ jobs:

   run_tests_torch_cuda_extensions_single_gpu:
     name: Torch CUDA extension tests
-    needs: setup
+    needs: [setup, run_check_runners]
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -364,7 +379,7 @@ jobs:

   run_tests_torch_cuda_extensions_multi_gpu:
     name: Torch CUDA extension tests
-    needs: setup
+    needs: [setup, run_check_runners]
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -447,12 +462,20 @@ jobs:
     if: always()
     needs: [
       setup,
+      run_check_runners,
       run_tests_single_gpu,
       run_tests_multi_gpu,
       run_tests_torch_cuda_extensions_single_gpu,
       run_tests_torch_cuda_extensions_multi_gpu
     ]
     steps:
+      - name: Preliminary job status
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          echo "Setup status: ${{ needs.setup.result }}"
+          echo "Runner status: ${{ needs.run_check_runners.result }}"
+
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
       - name: Prepare custom environment variables
@@ -498,6 +521,9 @@ jobs:
           CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
           CI_SHA: ${{ env.CI_SHA }}
+          SETUP_STATUS: ${{ needs.setup.result }}
+          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
+
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
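The workflow comment above notes that `notification_service.py` must turn matrix keys such as `models/bert` into `models_bert`, since artifact names use `_` instead of `/`. A minimal Python sketch of that normalization (illustrative variable names, not code from this PR):

matrix_keys = ["models/bert", "models/gpt2", "examples"]
# Replace "/" with "_" so the keys line up with artifact names, as the comment describes.
artifact_style_keys = [key.replace("/", "_") for key in matrix_keys]
print(artifact_style_keys)  # ['models_bert', 'models_gpt2', 'examples']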
67 changes: 50 additions & 17 deletions utils/notification_service.py
@@ -387,28 +387,52 @@ def payload(self) -> str:
         return json.dumps(blocks)

     @staticmethod
-    def error_out():
-        payload = [
-            {
-                "type": "section",
-                "text": {
-                    "type": "plain_text",
-                    "text": "There was an issue running the tests.",
-                },
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                    "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-                },
-            }
-        ]
+    def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
+
+        blocks = []
+        title_block = {"type": "header", "text": {"type": "plain_text", "text": title}}
+        blocks.append(title_block)
+
+        if ci_title:
+            ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
+            blocks.append(ci_title_block)
+
+        if setup_failed:
+            text = "💔 Setup job failed. Tests are not run. 😭"
+        elif runner_failed:
+            text = "💔 CI runners have problems! Tests are not run. 😭"
+        else:
+            text = "💔 There was an issue running the tests. 😭"
+
+        error_block_1 = {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": text,
+            },
+        }
+        error_block_2 = {
+            "type": "section",
+            "text": {
+                "type": "plain_text",
+                "text": "🙏 Let's fix it ASAP! 🙏",
+            },
+            "accessory": {
+                "type": "button",
+                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+            },
+        }
+        blocks.extend([error_block_1, error_block_2])
+
+        payload = json.dumps(blocks)

         print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(payload)}))
+        print(json.dumps({"blocks": blocks}))

         client.chat_postMessage(
             channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
-            text="There was an issue running the tests.",
+            text=text,
             blocks=payload,
         )
@@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True):

 if __name__ == "__main__":

+    setup_status = os.environ.get("SETUP_STATUS")
+    runner_status = os.environ.get("RUNNER_STATUS")
+    setup_failed = True if setup_status is not None and setup_status != "success" else False
+    runner_failed = True if runner_status is not None and runner_status != "success" else False
+
     org = "huggingface"
     repo = "transformers"
     repository_full_name = f"{org}/{repo}"
@@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True):
     else:
         ci_title = ""

+    if setup_failed or runner_failed:
+        Message.error_out(title, ci_title, setup_failed, runner_failed)
+        exit(0)
+
     arguments = sys.argv[1:][0]
     try:
         models = ast.literal_eval(arguments)
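To make the new gating concrete, here is a small, self-contained Python sketch (not code from this PR) of how the `SETUP_STATUS` and `RUNNER_STATUS` values exported by the workflow map to the Slack error text chosen above; the helper name `pick_error_text` is invented for illustration:

import os
from typing import Optional


def pick_error_text(setup_status: Optional[str], runner_status: Optional[str]) -> str:
    # Mirror the script's rule: any result other than "success" counts as a failure.
    setup_failed = setup_status is not None and setup_status != "success"
    runner_failed = runner_status is not None and runner_status != "success"
    if setup_failed:
        return "💔 Setup job failed. Tests are not run. 😭"
    if runner_failed:
        return "💔 CI runners have problems! Tests are not run. 😭"
    return "💔 There was an issue running the tests. 😭"


if __name__ == "__main__":
    # The workflow passes these values through the job's `env:` block.
    print(pick_error_text(os.environ.get("SETUP_STATUS"), os.environ.get("RUNNER_STATUS")))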