From 8c2689877fb48bfb4a6a133b020cdb5ec7c9b066 Mon Sep 17 00:00:00 2001
From: Scott McKay
Date: Sun, 30 Jun 2024 05:19:51 +1000
Subject: [PATCH] CoreML: Disable 1D ML Program matmul due to bug in coreml (#21186)

### Description
Disable using CoreML ML Program for a matmul where one of the inputs is 1D, as the CoreML implementation appears
to be broken. See https://github.com/apple/coremltools/issues/2263

Add some debugging notes.

### Motivation and Context
Fix failing test on macos-14.
---
 .github/workflows/mac.yml                    |  3 +-
 .../core/providers/coreml/DebugMLProgram.md  | 85 +++++++++++++++++++
 .../coreml/builders/impl/gemm_op_builder.cc  | 33 ++++---
 .../coreml/builders/model_builder.cc         |  1 +
 .../providers/coreml/dump_mlprogram_model.py |  9 +-
 5 files changed, 114 insertions(+), 17 deletions(-)
 create mode 100644 onnxruntime/core/providers/coreml/DebugMLProgram.md

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 8aaec8adef97..3d94d30947c7 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -54,11 +54,10 @@ jobs:
           --test \
           --build_shared_lib \
           --build_objc \
+          --use_coreml \
           --use_xnnpack \
           --use_binskim_compliant_compile_flags

-      # TODO add --use_coreml once unit test failures are addressed
-
   Objective-C-StaticAnalysis:
     runs-on: macos-14

diff --git a/onnxruntime/core/providers/coreml/DebugMLProgram.md b/onnxruntime/core/providers/coreml/DebugMLProgram.md
new file mode 100644
index 000000000000..e41a51559430
--- /dev/null
+++ b/onnxruntime/core/providers/coreml/DebugMLProgram.md
@@ -0,0 +1,85 @@
+# Steps to debug an ML Program operator implementation
+
+Basic debugging of everything excluding model execution (e.g. partitioning, checking whether an operator is
+supported, adding the CoreML operator inputs/outputs) can be done anywhere, as the code is set up to build and
+create the protobuf-based CoreML model on all platforms.
+
+To debug model execution issues you will need a macOS machine.
+
+## Debugging invalid output
+
+If there is a crash during execution, or unexpected output, the best approach is to see what running the same
+model through coremltools directly produces.
+
+NOTE: this doesn't guarantee coremltools is correct, as there could be a bug in its implementation. It does,
+however, provide a data point on whether we are generating the same CoreML model as the coremltools Python API.
+
+### Comparing to coremltools output
+
+Create a small test script that replicates the inputs/outputs of the operator you are debugging.
+The script should use the coremltools library to run the operator and print the output, which can then be
+compared with the CoreML EP's output.
+
+https://apple.github.io/coremltools/docs-guides/source/model-intermediate-language.html#create-a-mil-program
+
+Usage is reasonably intuitive. The example below defines a model with two inputs and a matmul operator.
+The model is printed, then run with randomly generated inputs, and the resulting output is printed.
+
+```python
+import numpy as np
+import coremltools as ct
+from coremltools.converters.mil import Builder as mb
+
+target = ct.target.iOS15
+
+x_shape = (1, 4)
+y_shape = (10, 4, 3)
+
+@mb.program(input_specs=[mb.TensorSpec(shape=x_shape), mb.TensorSpec(shape=y_shape)],
+            opset_version=target)
+def prog(x, y):
+    # For reference, a constant can be added using `mb.const` by specifying the data in the `val` parameter.
+    # c_shape = (3, )
+    # c_data = np.random.random_sample(c_shape)
+    # c = mb.const(val=c_data)
+
+    # Call the operator you are debugging with the inputs/constants.
+    # See the spec for the operator names, inputs/outputs and supported data types.
+    # https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html
+    z = mb.matmul(x=x, y=y)
+
+    # You can have additional function calls here if there are multiple operators involved.
+    # Contrived example that uses a constant and the output from a previous operator:
+    # z = mb.add(x=z, y=c)
+
+    return z
+
+# Prints the MIL program in a reasonably concise manner.
+print(prog)
+
+# Convert to an ML Program model.
+m = ct.convert(prog, minimum_deployment_target=target)
+
+# If you want to dump the full protobuf of the model, uncomment this.
+# You can compare the values to what is being set by the ORT CoreML EP code if you suspect any issues there.
+# spec = m.get_spec()
+# print(spec)
+
+# Run the model to generate output for comparison with the CoreML EP output.
+x = np.random.rand(*x_shape)
+y = np.random.rand(*y_shape)
+
+print(m.predict({'x': x, 'y': y}))
+```
+
+## Dumping the ORT generated mlmodel
+
+You can also dump the mlmodel generated by the ORT CoreML EP, which can be handy with larger models.
+
+In a debug build, set the ORT_COREML_EP_MODEL_DIR environment variable to a directory where you want the ML Package
+containing the mlmodel to be saved. The model will remain after the CoreML EP exits, unlike the default behavior
+where it is written to a temporary directory that is automatically removed on application exit.
+
+Script to dump: [dump_mlprogram_model.py](dump_mlprogram_model.py)
+
+See [here](https://github.com/microsoft/onnxruntime/blob/3c0b407709fd3c71755ed046edd688b30a786d94/onnxruntime/core/providers/coreml/model/host_utils.h#L70-L75)
+for the environment variable setup and [usage](https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kOverrideModelOutputDirectoryEnvVar%20&type=code).

diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
index 8daf64dc4a45..7338fc18fe77 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -109,19 +109,11 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
   ORT_IGNORE_RETURN_VALUE(GetShape(b, b_shape, logger));

   int64_t b0 = -1, b1 = -1;

-  // ML Program MatMul supports N-D input
   if (model_builder.CreateMLProgram() && is_matmul) {
-    if (b_shape.size() == 1) {
-      // B is treated as {b_shape[0], 1} according to the numpy rules.
-      b0 = b_shape[0];
-      b1 = 1;
-    } else {
-      // last 2 dims are used
-      b0 = b_shape[b_shape.size() - 2];
-      b1 = b_shape[b_shape.size() - 1];
-    }
+    // ML Program MatMul supports N-D input, however we don't use the 'K' or 'N' values calculated below for it,
+    // so we don't need to update b0 or b1.
   } else {
-    // we only support 2D input
+    // we only support 2D input for all other combinations
     b0 = b_shape[0];
     b1 = b_shape[1];
   }
@@ -182,7 +174,6 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
       model_builder.AddOperation(std::move(gemm_op));
     } else {
       // CoreML implementation is the same as ONNX MatMul.
-      // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.matmul
       auto matmul_op = model_builder.CreateOperation(node, "matmul");
       AddOperationInput(*matmul_op, "x", a.Name());
       AddOperationInput(*matmul_op, "y", b.Name());
@@ -268,14 +259,28 @@ bool GemmOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
   }

   if (is_matmul) {
+    const auto a_rank = a_shape.size();
+    const auto b_rank = b_shape.size();
+
     if (input_params.create_mlprogram) {
-      // ML Program matmul op has numpy semantics the same as the ONNX spec so we can use directly
+      // ML Program matmul op has numpy semantics the same as the ONNX spec, so we can use it directly.
+      // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.matmul
+      //
+      // There does appear to be a bug in handling one of the inputs being 1D, so for now skip these cases.
+      // See https://github.com/apple/coremltools/issues/2263
+      //
+      // If required for perf we could manually do the shape alterations the spec documents (convert the 1D input
+      // to 2D, and remove the extra dimension from the output), as 2D input is correctly handled by CoreML matmul.
+      if ((a_rank == 1 && b_rank > 1) || (a_rank > 1 && b_rank == 1)) {
+        LOGS(logger, VERBOSE) << "Skipping due to bug in CoreML ML Program when one of the inputs is 1D.";
+        return false;
+      }
     } else {
       // we could potentially support 1D and 3D if required. beyond 3D the dims that merge diverge.
       // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/onnx/_operators.py#L1607
       // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/backend/nn/op_mapping.py#L1374
       // https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#innerproductlayerparams
-      if (a_shape.size() != 2 || b_shape.size() != 2) {
+      if (a_rank != 2 || b_rank != 2) {
         LOGS(logger, VERBOSE) << "a and b inputs must be 2D. ";
         return false;
       }

diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 88b518ab2289..eec0fcce51db 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -906,6 +906,7 @@ Status ModelBuilder::SaveModel() {

 #if defined(COREML_ENABLE_MLPROGRAM)
   if (create_ml_program_) {
+    // we need to jump through some hoops to get the model path that the ML Program load expects.
     std::string tmp_model_path = model_output_path_ + "/tmp/model.mlmodel";
     CreateEmptyFile(tmp_model_path);

diff --git a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py
index a3ceee70684d..dce98e5138d9 100644
--- a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py
+++ b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py
@@ -5,6 +5,11 @@
 if len(sys.argv) < 2:
     print(f"Usage: {sys.argv[0]} <path to model.mlmodel>")
     print("If generated by onnxruntime this will be <ML Package dir>/Data/com.microsoft.onnxruntime/model.mlmodel")
+    print(
+        "The ML Package created by the CoreML EP can be saved to a specific directory in a debug build of onnxruntime "
+        "by setting the environment variable ORT_COREML_EP_MODEL_DIR to the desired directory."
+    )
+
     sys.exit(-1)

 model_path = sys.argv[1]
@@ -13,7 +18,9 @@
 spec = m.get_spec()
 print(spec)

-# Example code if you want to filter output or do more advanced things
+# Example code if you want to filter the output or do more advanced things.
+# In the example below we print the value of an attribute of one specific node from a larger model.
+#
 # main = spec.mlProgram.functions["main"]
 # block = main.block_specializations[main.opset]
 # print(f"{len(block.operations)} operators")
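---

The following is not part of the patch. To confirm the coremltools behavior that motivated the new 1D check, you can exercise both the suspect case and the manual shape-alteration workaround mentioned in the gemm_op_builder.cc comment with a script in the style of the DebugMLProgram.md example above. This is a minimal sketch: the program names are illustrative, it assumes coremltools is installed on a macOS machine, and it assumes the iOS15 `matmul`, `expand_dims` and `squeeze` MIL ops behave as their spec documents.

```python
import numpy as np
import coremltools as ct
from coremltools.converters.mil import Builder as mb

target = ct.target.iOS15

x_shape = (4,)        # 1D input that appears to trigger https://github.com/apple/coremltools/issues/2263
y_shape = (10, 4, 3)

@mb.program(input_specs=[mb.TensorSpec(shape=x_shape), mb.TensorSpec(shape=y_shape)],
            opset_version=target)
def prog_1d(x, y):
    # numpy semantics: x is treated as (1, 4) and the prepended dim is removed from the output
    return mb.matmul(x=x, y=y)

@mb.program(input_specs=[mb.TensorSpec(shape=x_shape), mb.TensorSpec(shape=y_shape)],
            opset_version=target)
def prog_workaround(x, y):
    # Do the shape alterations manually so matmul only ever sees inputs of rank >= 2.
    x_2d = mb.expand_dims(x=x, axes=[0])  # (4,) -> (1, 4)
    z = mb.matmul(x=x_2d, y=y)            # (10, 1, 3)
    return mb.squeeze(x=z, axes=[1])      # (10, 1, 3) -> (10, 3)

x = np.random.rand(*x_shape)
y = np.random.rand(*y_shape)

print("numpy reference:", np.matmul(x, y))  # expected result, shape (10, 3)

for name, prog in (("1d matmul", prog_1d), ("workaround", prog_workaround)):
    m = ct.convert(prog, minimum_deployment_target=target)
    print(f"{name}:", m.predict({"x": x, "y": y}))
```

If the `1d matmul` output differs from the numpy reference while the `workaround` output matches it, that is consistent with the upstream issue and with the EP skipping only the 1D case.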