From 0d4f0f8ad2aeb534e3dbcb1acc3d02da1e60f87d Mon Sep 17 00:00:00 2001
From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com>
Date: Tue, 19 Jul 2022 09:11:51 -0700
Subject: [PATCH] Cherry for release 1.12.0 final (#12218)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* support optimizer opt for deepspeed 0.5.9
* resolve comments
* resolve comments
* FP16_Optimizer Support for more Deepspeed Versions (#12046)
* fp16_optimizer for more ds versions
* change ds version
* bugfix
* fix bug
* Fix unused function warning for decodeMIDR(). (#12069)
Changed from static function defined in header to function declared in header and defined in separate .cc file.
* pin protobuf version to be compatible with onnx (#12132)
Co-authored-by: Ashwini Khade
* RoiAlign CPU EP add warning for max mode with samples != 1 (#12136)
* RoiAlign add warning about incorrect max summation when sample size is not 1
* include coreml_provider_factory.h in macos build instead of coreml_ex… (#12138)
include coreml_provider_factory.h in macos build instead of coreml_execution_provider.h
* List 3.10 as supported python version and remove 3.6 (#12141)
list 3.10 as supported python version and remove 3.6
Co-authored-by: Randy Shuai
* Use updated symbolic_helper.check_training_mode (#11900)
Co-authored-by: Jingyan Wang, Baiju Meswani
* Fix GH issue 12151 by using inverse perms for updating DQ axis attribute (#12158)
* Fix GH issue 12151. Need to use inverse perms for updating that axis to what is used for transposing the input. This only applies if the DQ node is doing per-axis dequantization.
* fixing positions for beam search gpt2 (#12156)
* fixing positions for beam search gpt2
Co-authored-by: Tianlei Wu
* remove wrongly placed libs (#12201)
* Add file mapping for windows platform. (#12183)
* Add file mapping for windows platform.
* Add unit test for file mapping for windows. Also add an error message for mis-aligned offset
* Add unit test for file mapping for windows. Also add an error message for mis-aligned offset
* Update data type to avoid warnings
* Compatible data type to avoid warnings. Update CreateFileMapping2 condition for winml compiling.
* Add type conversion to avoid warnings for X86 release build.
Co-authored-by: Ting Cao
* Fix bug where onnxruntime_USE_NCCL flag would default to ON (#12195)
Fix bug where onnxruntime_USE_NCCL flag would default to ON, causing ORT to not build properly. New functionality: flag is ON when training is enabled and NCCL is not disabled.
Flag is OFF otherwise

Co-authored-by: zhijxu
Co-authored-by: zhijxu
Co-authored-by: Vincent Wang
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Ashwini Khade
Co-authored-by: Ashwini Khade
Co-authored-by: Dwayne Robinson
Co-authored-by: Carson Swope
Co-authored-by: Randy Shuai
Co-authored-by: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com>
Co-authored-by: Scott McKay
Co-authored-by: Viswanath Boga <44417868+viboga@users.noreply.github.com>
Co-authored-by: leqiao-1 <61653207+leqiao-1@users.noreply.github.com>
Co-authored-by: caoting-dotcom <71617901+caoting-dotcom@users.noreply.github.com>
Co-authored-by: Ting Cao
Co-authored-by: Sean Murray <59740888+seanmurr1@users.noreply.github.com>
---
 .../transformers/beam_search_device_helper.cc |  77 ++--
 .../transformers/beam_search_device_helper.h  |   9 +-
 .../cpu/transformers/beam_search_impl_gpt.h   |  25 +-
 .../transformers/beam_search_device_helper.cc |  39 +-
 .../transformers/beam_search_device_helper.h  |   4 +-
 .../cuda/transformers/beam_search_impl.cu     |   8 +-
 .../core/common/cpuid_arch_definition.h       |  14 +
 onnxruntime/core/common/cpuid_info.h          |  13 +-
 onnxruntime/core/common/cpuid_uarch.cc        | 369 ++++++++++++++++++
 onnxruntime/core/common/cpuid_uarch.h         | 365 +----------------
 .../transpose_optimizer.cc                    |   4 +-
 onnxruntime/core/platform/windows/env.cc      | 104 +++++
 .../providers/cpu/object_detection/roialign.h |   8 +-
 .../optimizer/transpose_optimizer_test.cc     |  39 +-
 onnxruntime/test/platform/file_io_test.cc     |  59 +++
 .../test/testdata/ort_github_issue_12151.onnx | Bin 0 -> 380 bytes
 .../python/training/optim/_ds_modifier.py     |  19 +-
 .../python/training/optim/_modifier.py        |   5 +
 .../training/optim/_modifier_registry.py      |  11 +-
 .../python/training/optim/fp16_optimizer.py   |   3 +
 .../_custom_autograd_function_exporter.py     |  17 +-
 requirements-training.txt                     |   2 +-
 setup.py                                      |   2 +-
 tools/ci_build/build.py                       |   2 +-
 .../github/linux/copy_strip_binary.sh         |   2 +-
 .../ci_build/github/windows/jar_packaging.ps1 |   2 +
 26 files changed, 728 insertions(+), 474 deletions(-)
 create mode 100644 onnxruntime/core/common/cpuid_arch_definition.h
 create mode 100644 onnxruntime/core/common/cpuid_uarch.cc
 create mode 100644 onnxruntime/test/testdata/ort_github_issue_12151.onnx

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc index 7b163dd923a3..68b1aab919c3 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc @@ -197,7 +197,6 @@ void InitBeamState(transformers::IBeamSearchState* beam_state, // T5 does not need position, so next_positions is empty for T5. if (!beam_state->next_positions.empty()) { - memset(beam_state->next_positions.data(), 0, beam_state->next_positions.size_bytes()); gsl::copy(sequence_lengths, beam_state->next_positions); } @@ -274,13 +273,13 @@ Status ProcessLogits(const OrtValue& logits, // // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; ORT_RETURN_IF_ERROR( - SoftmaxCPU( - batch_beam_size, // rows - vocab_size, // elements per row - (input_length == 1 && logits_batch_size == batch_beam_size) ?
logits_data : next_token_logits.data(), - next_token_scores.data(), - true, - thread_pool)); + SoftmaxCPU( + batch_beam_size, // rows + vocab_size, // elements per row + (input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(), + next_token_scores.data(), + true, + thread_pool)); #ifdef DEBUG_BEAM_SEARCH dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); @@ -428,12 +427,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // last_outputs: logits, present_0, present_1, ... // next_inputs: input_ids, position_id, attention_mask, past_0, past_1 ORT_UNUSED_PARAMETER(stream); @@ -454,10 +453,12 @@ Status UpdateGptFeeds( } next_inputs[0] = input_ids; - // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); - for (int i = 0; i < batch_beam_size; i++) { - position_data[i]++; + if (increase_position) { + // Update position IDs + int32_t* position_data = position_ids.GetMutable()->MutableData(); + for (int i = 0; i < batch_beam_size; i++) { + position_data[i]++; + } } next_inputs[1] = position_ids; @@ -477,14 +478,6 @@ Status UpdateGptFeeds( } next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { // feed present_* output to past_* inputs one by one @@ -725,12 +718,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, @@ -751,28 +744,28 @@ template Status UpdateDecoderFeeds( template void ExpandInputs(const OrtValue& input, int num_beams, AllocatorPtr allocator, OrtValue& expanded); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); } // namespace BeamSearchCpuDeviceHelper } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h 
b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h index ab18eec25cde..36ab8d8e93a9 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h @@ -96,12 +96,12 @@ using UpdateGptFeedsFunc = std::function& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper)>; + int gpt_subgraph_first_present_output_idx)>; // Create encoder inputs (for encoder-decoder model like T5). using CreateEncoderInputsFunc = std::function; } // namespace BeamSearchDeviceHelper - // These are CPU specific device helper implementations namespace BeamSearchCpuDeviceHelper { Status TopK( @@ -208,12 +207,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // --------------------------------------------------------------- // Functions for encoder-decoder model like T5 diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h index 9cf5daeba929..7674c2a78105 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h @@ -56,6 +56,7 @@ class BeamSearchGpt : public BeamSearchBase { std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices); @@ -93,6 +94,7 @@ Status BeamSearchGpt::UpdateFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices) { return update_feeds_func_(this->temp_space_allocator_, @@ -101,12 +103,12 @@ Status BeamSearchGpt::UpdateFeeds( next_inputs, current_length, position_ids, + increase_position, beam_next_tokens, beam_indices, this->parameters_->num_beams, gpt_subgraph_.GetFirstPastInputIndex(), - gpt_subgraph_.GetFirstPresentOutputIndex(), - this->GetConsoleDumper()); + gpt_subgraph_.GetFirstPresentOutputIndex()); } template @@ -186,11 +188,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage #ifdef DEBUG_BEAM_SEARCH const IConsoleDumper* dumper = this->GetConsoleDumper(); - dumper->Print("input_ids", feeds[0]); - dumper->Print("position_ids", feeds[1]); - dumper->Print("attention_mask", feeds[2]); #endif - // Position ids for all iterations except the first. It uses memory buffer owned by next_positions. 
OrtValue position_ids; int64_t dims[] = {parameters->BatchBeamSize(), 1}; @@ -205,9 +203,19 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage int iteration_counter = 0; while (current_length < parameters->max_length) { iteration_counter++; + #ifdef DEBUG_BEAM_SEARCH auto cur_len = std::to_string(current_length); dumper->Print("***CurrentLength", cur_len, true); + dumper->Print("iteration", iteration_counter, true); + + dumper->Print("input_ids", feeds[0]); + dumper->Print("position_ids", feeds[1]); + dumper->Print("attention_mask", feeds[2]); + for (size_t i = 3; i < feeds.size(); i++) { + dumper->Print("past", static_cast(i) - 3, true); + dumper->Print("", feeds[i]); + } #endif status = utils::ExecuteSubgraph(this->decoder_session_state_, @@ -241,8 +249,11 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage // Prepare inputs for next round of subgraph call. if (current_length < parameters->max_length) { + // For the first iteration, position_ids is initialized as sequence lengths. We can add it to feeds directly. + // For the remaining iterations, we need increase position_ids first, then add it to feeds. + bool increase_position = (iteration_counter > 1); ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length, - position_ids, + position_ids, increase_position, beam_next_tokens.as_span(), beam_indices.as_span())); } diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc index b712908259da..780e98909c60 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc @@ -258,9 +258,8 @@ Status ProcessLogits(const OrtValue& logits, // // The output will be float for consideration of precision and easy integration with remaining parts. float* Y_data = next_token_scores.data(); - const CudaT* X_data = (input_length == 1 && logits_batch_size == batch_beam_size) ? - logits_data : - reinterpret_cast(next_token_logits.data()); + bool is_single_token = (input_length == 1 && logits_batch_size == batch_beam_size); + const CudaT* X_data = is_single_token ? logits_data : reinterpret_cast(next_token_logits.data()); dispatch_blockwise_softmax_forward( cuda_stream, Y_data, X_data, vocab_size, vocab_size, batch_size * num_beams); @@ -500,12 +499,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // Update input_ids with next tokens. int batch_beam_size = static_cast(beam_next_tokens.length()); int64_t dims[] = {batch_beam_size, 1}; @@ -519,7 +518,7 @@ Status UpdateGptFeeds( next_inputs[0] = input_ids; // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); + int32_t* position_data = increase_position ? 
position_ids.GetMutable()->MutableData() : nullptr; next_inputs[1] = position_ids; // Update attention mask @@ -538,14 +537,6 @@ Status UpdateGptFeeds( next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { const int k = gpt_subgraph_first_past_input_idx - gpt_subgraph_first_present_output_idx; @@ -662,12 +653,12 @@ Status ExpandBuffer(void* stream, for (int i = 0; i < batch_size; i++) { for (int j = 0; j < num_beams; j++) { CUDA_RETURN_IF_ERROR( - cudaMemcpyAsync( - target, - input_data + i * chunk_size, - sizeof(T) * chunk_size, - cudaMemcpyDeviceToDevice, - cuda_stream)); + cudaMemcpyAsync( + target, + input_data + i * chunk_size, + sizeof(T) * chunk_size, + cudaMemcpyDeviceToDevice, + cuda_stream)); target += chunk_size; } } @@ -714,12 +705,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // Float16 template void InitBeamState(transformers::IBeamSearchState* beam_state, @@ -748,12 +739,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h index 14f64e923e78..4424fee6d5cb 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h @@ -68,12 +68,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // --------------------------------------------------------------- // Functions for encoder-decoder model like T5 diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu index 4f93b1dded93..6bc52758c7cc 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu @@ -248,9 +248,11 @@ __global__ void UpdateGptInputsKernel(const T* old_mask_data, int j = index % current_length; mask_data[index] = (j < current_length - 1) ? old_mask_data[i * (current_length - 1) + j] : static_cast(1); - // Update sequence length (or next positions). - if (index < batch_beam_size) { - next_positions[index]++; + if (next_positions != nullptr) { + // Update sequence length (or next positions). 
+ if (index < batch_beam_size) { + next_positions[index]++; + } } } } diff --git a/onnxruntime/core/common/cpuid_arch_definition.h b/onnxruntime/core/common/cpuid_arch_definition.h new file mode 100644 index 000000000000..a541eb66d8ba --- /dev/null +++ b/onnxruntime/core/common/cpuid_arch_definition.h @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This file defines the CPUIDINFO_ARCH_* symbols. + +#pragma once + +#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__) +#define CPUIDINFO_ARCH_X86 +#endif + +#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__) +#define CPUIDINFO_ARCH_ARM +#endif // ARM or ARM64 diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index f76f0b0a1527..ff535d889386 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -4,14 +4,7 @@ #pragma once #include "core/common/common.h" - -#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__) -#define CPUIDINFO_ARCH_X86 -#endif - -#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__) -#define CPUIDINFO_ARCH_ARM -#endif // ARM or ARM64 +#include "core/common/cpuid_arch_definition.h" namespace onnxruntime { @@ -31,7 +24,7 @@ class CPUIDInfo { bool HasSSE4_1() const { return has_sse4_1_; } bool IsHybrid() const { return is_hybrid_; } - // ARM + // ARM bool HasArmNeonDot() const { return has_arm_neon_dot_; } uint32_t GetCurrentCoreIdx() const; @@ -72,7 +65,7 @@ class CPUIDInfo { } return is_armv8_narrow_ld_[coreId]; } - + /** * @brief Some ARMv8 power efficient core has narrower 64b load/store * that needs specialized optimiztion in kernels diff --git a/onnxruntime/core/common/cpuid_uarch.cc b/onnxruntime/core/common/cpuid_uarch.cc new file mode 100644 index 000000000000..e9d8de9732b1 --- /dev/null +++ b/onnxruntime/core/common/cpuid_uarch.cc @@ -0,0 +1,369 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/common/cpuid_uarch.h" + +#include "core/common/logging/logging.h" + +namespace onnxruntime { + +#if defined(CPUIDINFO_ARCH_ARM) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) +#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) +#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) +#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) +#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 +#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 +#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 +#define CPUINFO_ARM_MIDR_PART_OFFSET 4 +#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 + +inline static uint32_t midr_get_implementer(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; +} + +inline static uint32_t midr_get_part(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; +} + +inline static uint32_t midr_get_variant(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; +} + +void decodeMIDR( + uint32_t midr, + uint32_t uarch[1]) { + switch (midr_get_implementer(midr)) { + case 'A': + switch (midr_get_part(midr)) { + //#if defined(_M_ARM) || defined(__arm__) + case 0xC05: + *uarch = cpuinfo_uarch_cortex_a5; + break; + case 0xC07: + *uarch = cpuinfo_uarch_cortex_a7; + break; + case 0xC08: + *uarch = cpuinfo_uarch_cortex_a8; + break; + case 0xC09: + *uarch = cpuinfo_uarch_cortex_a9; + break; + case 0xC0C: + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0E: + *uarch = cpuinfo_uarch_cortex_a17; + break; + case 0xC0D: + /* + * Rockchip RK3288 only. + * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. + * Assume it is Cortex-A12. + */ + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0F: + *uarch = cpuinfo_uarch_cortex_a15; + break; + //#endif /* ARM */ + case 0xD01: + *uarch = cpuinfo_uarch_cortex_a32; + break; + case 0xD03: + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0xD04: + *uarch = cpuinfo_uarch_cortex_a35; + break; + case 0xD05: + // Note: use Variant, not Revision, field + *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; + break; + case 0xD06: + *uarch = cpuinfo_uarch_cortex_a65; + break; + case 0xD07: + *uarch = cpuinfo_uarch_cortex_a57; + break; + case 0xD08: + *uarch = cpuinfo_uarch_cortex_a72; + break; + case 0xD09: + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0xD0A: + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0xD0B: + *uarch = cpuinfo_uarch_cortex_a76; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD0C: + *uarch = cpuinfo_uarch_neoverse_n1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + case 0xD0D: + *uarch = cpuinfo_uarch_cortex_a77; + break; + case 0xD0E: /* Cortex-A76AE */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0xD41: /* Cortex-A78 */ + *uarch = cpuinfo_uarch_cortex_a78; + break; + case 0xD44: /* Cortex-X1 */ + *uarch = cpuinfo_uarch_cortex_x1; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD4A: + *uarch = cpuinfo_uarch_neoverse_e1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + switch (midr_get_part(midr) >> 8) { + //#if defined(_M_ARM) || defined(__arm__) + case 7: + *uarch = cpuinfo_uarch_arm7; + break; + case 9: + *uarch = cpuinfo_uarch_arm9; + break; + case 11: + *uarch = cpuinfo_uarch_arm11; + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + } + break; + case 'B': + switch (midr_get_part(midr)) { + case 0x00F: + *uarch = cpuinfo_uarch_brahma_b15; + break; + case 0x100: + *uarch = cpuinfo_uarch_brahma_b53; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0x516: + /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ + *uarch = cpuinfo_uarch_thunderx2; + break; + //#endif + default: + LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 'C': + switch (midr_get_part(midr)) { + case 0x0A0: /* ThunderX */ + case 0x0A1: /* ThunderX 88XX */ + case 0x0A2: /* ThunderX 81XX */ + case 0x0A3: /* ThunderX 83XX */ + *uarch = cpuinfo_uarch_thunderx; + break; + case 0x0AF: /* ThunderX2 99XX */ + *uarch = cpuinfo_uarch_thunderx2; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif + case 'H': + switch (midr_get_part(midr)) { + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD01: /* Kunpeng 920 series */ + *uarch = cpuinfo_uarch_taishan_v110; + break; + //#endif + case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'i': + switch (midr_get_part(midr) >> 8) { + case 2: /* PXA 210/25X/26X */ + case 4: /* PXA 27X */ + case 6: /* PXA 3XX */ + *uarch = cpuinfo_uarch_xscale; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + case 'N': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_denver; + break; + case 0x003: + *uarch = cpuinfo_uarch_denver2; + break; + case 0x004: + 
*uarch = cpuinfo_uarch_carmel; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#if !defined(__ANDROID__) + case 'P': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_xgene; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#endif + case 'Q': + switch (midr_get_part(midr)) { + // #if defined(_M_ARM) || defined(__arm__) + case 0x00F: + /* Mostly Scorpions, but some Cortex A5 may report this value as well */ + //if (has_vfpv4) { + // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ + // *vendor = cpuinfo_vendor_arm; + // *uarch = cpuinfo_uarch_cortex_a5; + //} else { + *uarch = cpuinfo_uarch_scorpion; + // } + break; + case 0x02D: /* Dual-core Scorpions */ + *uarch = cpuinfo_uarch_scorpion; + break; + case 0x04D: + /* + * Dual-core Krait: + * - r1p0 -> Krait 200 + * - r1p4 -> Krait 200 + * - r2p0 -> Krait 300 + */ + case 0x06F: + /* + * Quad-core Krait: + * - r0p1 -> Krait 200 + * - r0p2 -> Krait 200 + * - r1p0 -> Krait 300 + * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) + * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) + * - r3p1 -> Krait 450 + */ + *uarch = cpuinfo_uarch_krait; + break; + //#endif /* ARM */ + case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ + case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ + case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ + *uarch = cpuinfo_uarch_kryo; + break; + case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ + *uarch = cpuinfo_uarch_cortex_a55r0; + break; + case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ + *uarch = cpuinfo_uarch_cortex_a55; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xC00: + *uarch = cpuinfo_uarch_falkor; + break; + case 0xC01: + *uarch = cpuinfo_uarch_saphira; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + case 'S': + switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case 0x00100010: + /* + * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: + * - CPU variant 0x1 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m1; + break; + case 0x00400010: + /* + * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: + * - CPU variant 0x4 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m2; + break; + case 0x00100020: + /* + * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: + * - CPU variant 0x1 + * - CPU part 0x002 + */ + *uarch = cpuinfo_uarch_exynos_m3; + break; + case 0x00100030: + /* + * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: + * - CPU variant 0x1 + * - CPU part 0x003 + */ + *uarch = cpuinfo_uarch_exynos_m4; + break; + case 0x00100040: + /* + * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has: + * - CPU variant 0x1 + * - CPU part 0x004 + */ + *uarch = cpuinfo_uarch_exynos_m5; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x" + << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'V': + switch (midr_get_part(midr)) { + case 0x581: /* PJ4 / PJ4B */ + case 0x584: /* PJ4B-MP / PJ4C */ + *uarch = cpuinfo_uarch_pj4; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr; + } +} + +#endif // arm or arm64 + +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_uarch.h b/onnxruntime/core/common/cpuid_uarch.h index 73a05c4b91a3..c78156129220 100644 --- a/onnxruntime/core/common/cpuid_uarch.h +++ b/onnxruntime/core/common/cpuid_uarch.h @@ -22,6 +22,14 @@ Module Name: --*/ +#pragma once + +#include + +#include "core/common/cpuid_arch_definition.h" + +namespace onnxruntime { + enum CPUIDINFOuarch { /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ cpuinfo_uarch_unknown = 0, @@ -175,359 +183,8 @@ enum CPUIDINFOuarch { #if defined(CPUIDINFO_ARCH_ARM) -#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) -#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) -#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) -#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) -#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) - -#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 -#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 -#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 -#define CPUINFO_ARM_MIDR_PART_OFFSET 4 -#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 - -inline static uint32_t midr_get_implementer(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; -} - -inline static uint32_t midr_get_part(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; -} - -inline static uint32_t midr_get_variant(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; -} - -static void decodeMIDR( - uint32_t midr, - uint32_t uarch[1]) { - switch (midr_get_implementer(midr)) { - case 'A': - switch (midr_get_part(midr)) { - //#if defined(_M_ARM) || defined(__arm__) - case 0xC05: - *uarch = cpuinfo_uarch_cortex_a5; - break; - case 0xC07: - *uarch = cpuinfo_uarch_cortex_a7; - break; - case 0xC08: - *uarch = cpuinfo_uarch_cortex_a8; - break; - case 0xC09: - *uarch = cpuinfo_uarch_cortex_a9; - break; - case 0xC0C: - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0E: - *uarch = cpuinfo_uarch_cortex_a17; - break; - case 0xC0D: - /* - * Rockchip RK3288 only. - * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. - * Assume it is Cortex-A12. - */ - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0F: - *uarch = cpuinfo_uarch_cortex_a15; - break; - //#endif /* ARM */ - case 0xD01: - *uarch = cpuinfo_uarch_cortex_a32; - break; - case 0xD03: - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0xD04: - *uarch = cpuinfo_uarch_cortex_a35; - break; - case 0xD05: - // Note: use Variant, not Revision, field - *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; - break; - case 0xD06: - *uarch = cpuinfo_uarch_cortex_a65; - break; - case 0xD07: - *uarch = cpuinfo_uarch_cortex_a57; - break; - case 0xD08: - *uarch = cpuinfo_uarch_cortex_a72; - break; - case 0xD09: - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0xD0A: - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0xD0B: - *uarch = cpuinfo_uarch_cortex_a76; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD0C: - *uarch = cpuinfo_uarch_neoverse_n1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - case 0xD0D: - *uarch = cpuinfo_uarch_cortex_a77; - break; - case 0xD0E: /* Cortex-A76AE */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0xD41: /* Cortex-A78 */ - *uarch = cpuinfo_uarch_cortex_a78; - break; - case 0xD44: /* Cortex-X1 */ - *uarch = cpuinfo_uarch_cortex_x1; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD4A: - *uarch = cpuinfo_uarch_neoverse_e1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - switch (midr_get_part(midr) >> 8) { - //#if defined(_M_ARM) || defined(__arm__) - case 7: - *uarch = cpuinfo_uarch_arm7; - break; - case 9: - *uarch = cpuinfo_uarch_arm9; - break; - case 11: - *uarch = cpuinfo_uarch_arm11; - break; - //#endif /* ARM */ - default: - LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - } - break; - case 'B': - switch (midr_get_part(midr)) { - case 0x00F: - *uarch = cpuinfo_uarch_brahma_b15; - break; - case 0x100: - *uarch = cpuinfo_uarch_brahma_b53; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0x516: - /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ - *uarch = cpuinfo_uarch_thunderx2; - break; - //#endif - default: - LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 'C': - switch (midr_get_part(midr)) { - case 0x0A0: /* ThunderX */ - case 0x0A1: /* ThunderX 88XX */ - case 0x0A2: /* ThunderX 81XX */ - case 0x0A3: /* ThunderX 83XX */ - *uarch = cpuinfo_uarch_thunderx; - break; - case 0x0AF: /* ThunderX2 99XX */ - *uarch = cpuinfo_uarch_thunderx2; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif - case 'H': - switch (midr_get_part(midr)) { - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD01: /* Kunpeng 920 series */ - *uarch = cpuinfo_uarch_taishan_v110; - break; - //#endif - case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if defined(_M_ARM) || defined(__arm__) - case 'i': - switch (midr_get_part(midr) >> 8) { - case 2: /* PXA 210/25X/26X */ - case 4: /* PXA 27X */ - case 6: /* PXA 3XX */ - *uarch = cpuinfo_uarch_xscale; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif /* ARM */ - case 'N': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_denver; - break; - case 0x003: - *uarch = cpuinfo_uarch_denver2; - break; - case 0x004: - 
*uarch = cpuinfo_uarch_carmel; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#if !defined(__ANDROID__) - case 'P': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_xgene; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#endif - case 'Q': - switch (midr_get_part(midr)) { - // #if defined(_M_ARM) || defined(__arm__) - case 0x00F: - /* Mostly Scorpions, but some Cortex A5 may report this value as well */ - //if (has_vfpv4) { - // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ - // *vendor = cpuinfo_vendor_arm; - // *uarch = cpuinfo_uarch_cortex_a5; - //} else { - *uarch = cpuinfo_uarch_scorpion; - // } - break; - case 0x02D: /* Dual-core Scorpions */ - *uarch = cpuinfo_uarch_scorpion; - break; - case 0x04D: - /* - * Dual-core Krait: - * - r1p0 -> Krait 200 - * - r1p4 -> Krait 200 - * - r2p0 -> Krait 300 - */ - case 0x06F: - /* - * Quad-core Krait: - * - r0p1 -> Krait 200 - * - r0p2 -> Krait 200 - * - r1p0 -> Krait 300 - * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) - * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) - * - r3p1 -> Krait 450 - */ - *uarch = cpuinfo_uarch_krait; - break; - //#endif /* ARM */ - case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ - case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ - case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ - *uarch = cpuinfo_uarch_kryo; - break; - case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ - *uarch = cpuinfo_uarch_cortex_a55r0; - break; - case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ - *uarch = cpuinfo_uarch_cortex_a55; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xC00: - *uarch = cpuinfo_uarch_falkor; - break; - case 0xC01: - *uarch = cpuinfo_uarch_saphira; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - case 'S': - switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { - case 0x00100010: - /* - * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: - * - CPU variant 0x1 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m1; - break; - case 0x00400010: - /* - * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: - * - CPU variant 0x4 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m2; - break; - case 0x00100020: - /* - * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: - * - CPU variant 0x1 - * - CPU part 0x002 - */ - *uarch = cpuinfo_uarch_exynos_m3; - break; - case 0x00100030: - /* - * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: - * - CPU variant 0x1 - * - CPU part 0x003 - */ - *uarch = cpuinfo_uarch_exynos_m4; - break; - case 0x00100040: - /* - * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has: - * - CPU variant 0x1 - * - CPU part 0x004 - */ - *uarch = cpuinfo_uarch_exynos_m5; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x" - << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if defined(_M_ARM) || defined(__arm__) - case 'V': - switch (midr_get_part(midr)) { - case 0x581: /* PJ4 / PJ4B */ - case 0x584: /* PJ4B-MP / PJ4C */ - *uarch = cpuinfo_uarch_pj4; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif /* ARM */ - default: - LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr; - } -} +void decodeMIDR(uint32_t midr, uint32_t uarch[1]); #endif // arm or arm64 + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc index 6cdb63319a5b..0e996ec98a73 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc @@ -2001,7 +2001,9 @@ OptimizeResult OptimizeImpl(OptimizerCtx& ctx) { continue; } - if (!HandleQuantizeDequantizeScale(ctx.graph, *perm, *dq_node, ctx.opset)) { + // we're moving the Transpose to before the DQ, so we need to use the inverse permutations to update the axis + // attribute correctly when doing per-axis dequantization + if (!HandleQuantizeDequantizeScale(ctx.graph, InvertPerm(*perm), *dq_node, ctx.opset)) { continue; } diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 4358ddc7c3e2..c18075cc7b4b 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -40,6 +40,23 @@ namespace onnxruntime { namespace { +class UnmapFileParam { + public: + void* addr; + size_t len; +}; + +static void UnmapFile(void* param) noexcept { + UnmapFileParam* p = reinterpret_cast(param); + bool ret = UnmapViewOfFile(p->addr); + if (!ret) { + const auto error_code = GetLastError(); + LOGS_DEFAULT(ERROR) << "unmap view of file failed. 
error code: " << error_code + << " error msg: " << std::system_category().message(error_code); + } + delete p; +} + std::wstring Basename(const std::wstring& path) { auto basename_index = path.find_last_of(L"/\\") + 1; // results in 0 if no separator is found return path.substr(basename_index); @@ -320,8 +337,95 @@ class WindowsEnv : public Env { return Status::OK(); } + /** Status MapFileIntoMemory(_In_z_ const ORTCHAR_T*, FileOffsetType, size_t, MappedMemoryPtr&) const override { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "MapFileIntoMemory is not implemented on Windows."); + }*/ + + Status MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, + FileOffsetType offset, + size_t length, + MappedMemoryPtr& mapped_memory) const override { + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); + + if (length == 0) { + mapped_memory = MappedMemoryPtr{}; + return Status::OK(); + } + +#if WINVER >= _WIN32_WINNT_WIN8 + wil::unique_hfile file_handle{ + CreateFile2(file_path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL)}; +#else + wil::unique_hfile file_handle{ + CreateFileW(file_path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)}; +#endif + if (file_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + +#if NTDDI_VERSION >= NTDDI_WIN10_RS5 + wil::unique_hfile file_mapping_handle{ + CreateFileMapping2(file_handle.get(), + nullptr, + FILE_MAP_READ, + PAGE_READONLY, + SEC_COMMIT, + 0, + nullptr, + nullptr, + 0)}; +#else + wil::unique_hfile file_mapping_handle{ + CreateFileMappingW(file_handle.get(), + nullptr, + PAGE_READONLY, + 0, + 0, + nullptr)}; +#endif + if (file_mapping_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file mapping ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + + static const DWORD page_size = sysinfo.dwPageSize; + static const DWORD allocation_granularity = sysinfo.dwAllocationGranularity; + const FileOffsetType offset_to_page = offset % static_cast(page_size); + const size_t mapped_length = length + static_cast(offset_to_page); + const FileOffsetType mapped_offset = offset - offset_to_page; + if (mapped_offset % allocation_granularity != 0) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "mapped offset must be a multiple of the allocation granularity", + " , mapped_offset = ", mapped_offset, + " , allocation_granularity = ", allocation_granularity, + " , errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), + FILE_MAP_READ, + 0, + static_cast(mapped_offset), + mapped_length); + + mapped_memory = + MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_page, + OrtCallbackInvoker{OrtCallback{UnmapFile, new UnmapFileParam{mapped_base, mapped_length}}}}; + + return Status::OK(); } bool FolderExists(const std::wstring& path) const override { diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.h b/onnxruntime/core/providers/cpu/object_detection/roialign.h index 9ba7f89caf4d..1bb8bd34c5cb 
100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.h +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.h @@ -29,7 +29,6 @@ class RoiAlignBase { } else { ORT_THROW("Invalid mode of value ", mode, " specified. It should be either avg or max"); } - mode_ = mode == "avg" ? RoiAlignMode::avg : RoiAlignMode::max; } // output_height @@ -64,6 +63,13 @@ class RoiAlignBase { else half_pixel_ = false; } + + if (mode_ == RoiAlignMode::max && sampling_ratio_ != 1) { + // TODO(fdwr): Issue #6146. ORT 1.13 will correct the incorrect summation of max mode with PR #7354. + LOGS_DEFAULT(WARNING) << "The existing summation for max mode and sampling ratios besides 1 is incorrect " + << "and will be fixed in the next ORT 1.13 release. Thus the results of RoiAlign " + << "will be different."; + } } protected: diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index 68e2be1b34cf..cfeb9a220277 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -5,6 +5,8 @@ #include #include "gtest/gtest.h" +#include "gmock/gmock.h" + #include "graph_transform_test_builder.h" #include "core/graph/graph.h" @@ -3620,7 +3622,6 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearTransposePropagation) { EXPECT_EQ(op_types_in_order, expected_op_types_in_order); }; - TransformerTester(build_test_case_1, check_graph, TransformerLevel::Default, @@ -4047,5 +4048,41 @@ TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue10305) { ASSERT_STATUS_OK(session_object.Load(model_uri)); ASSERT_STATUS_OK(session_object.Initialize()); // optimizers run during initialization } + +// regression test for a model with DQ node with per-axis dequantization followed by a Transpose. +// the second phase can swap those around, but needs to use the correct perms for updating the 'axis' +// attribute in the DQ node. +// see https://github.com/microsoft/onnxruntime/issues/12151 for more details. 
+TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue12151) { + Status status; + auto model_uri = ORT_TSTR("testdata/ort_github_issue_12151.onnx"); + + NameMLValMap feeds; // no inputs for this model + std::vector output_names{"Z"}; + std::vector fetches_orig; + std::vector fetches; + + SessionOptions so; + so.session_logid = "TransposeOptimizerTests.RegressionTest_GitHubIssue12151"; + + { + so.graph_optimization_level = TransformerLevel::Default; // off + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches_orig)); + } + + { + so.graph_optimization_level = TransformerLevel::Level1; // enable transpose optimizer + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches)); + } + + ASSERT_THAT(fetches_orig[0].Get().DataAsSpan(), + testing::ContainerEq(fetches[0].Get().DataAsSpan())); +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index e4670583f11b..9dec13767ef9 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -10,6 +10,8 @@ #ifndef _WIN32 #include // for sysconf() and _SC_PAGESIZE +#else +#include #endif #include "gsl/gsl" @@ -61,7 +63,11 @@ std::vector GenerateData(size_t length, uint32_t seed = 0) { } void WriteDataToFile(gsl::span data, const PathString& path) { +#ifndef _WIN32 std::ofstream out{path, std::ios_base::out | std::ios_base::trunc}; +#else + std::ofstream out{path, std::ios_base::out | std::ios_base::trunc | std::ios_base::binary}; +#endif out.write(data.data(), data.size()); } @@ -144,6 +150,59 @@ TEST(FileIoTest, MapFileIntoMemory) { ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); } } +#else +TEST(FileIoTest, MapFileIntoMemory) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + static const auto page_size = sysinfo.dwPageSize; + static const auto allocation_granularity = sysinfo.dwAllocationGranularity; + ASSERT_GT(page_size, static_cast(0)); + + TempFilePath tmp(ORT_TSTR("map_file_test_")); + const auto expected_data = GenerateData(page_size * 3 / 2); + WriteDataToFile(gsl::make_span(expected_data), tmp.path); + + const auto offsets_and_lengths = GenerateValidOffsetLengthPairs( + 0, expected_data.size(), page_size / 10); + + for (const auto& offset_and_length : offsets_and_lengths) { + const auto offset = offset_and_length.first; + const auto length = offset_and_length.second; + + // The offset must be a multiple of the allocation granularity + if (offset % allocation_granularity != 0) { + continue; + } + + Env::MappedMemoryPtr mapped_memory{}; + auto status = Env::Default().MapFileIntoMemory( + tmp.path.c_str(), offset, length, mapped_memory); + ASSERT_TRUE(status.IsOK()) + << "MapFileIntoMemory failed for offset " << offset << " and length " << length + << " with error: " << status.ErrorMessage(); + + auto mapped_span = gsl::make_span(mapped_memory.get(), length); + + auto expected_data_span = gsl::make_span(expected_data.data() + offset, length); + + ASSERT_EQ(mapped_span, expected_data_span); + } + + { + Env::MappedMemoryPtr mapped_memory{}; + + // invalid - offset is not a multiple of the allocation granularity + 
ASSERT_FALSE(Env::Default().MapFileIntoMemory( + tmp.path.c_str(), allocation_granularity * 3 / 2, page_size / 10, mapped_memory).IsOK()); + } + + { + Env::MappedMemoryPtr mapped_memory{}; + + // invalid - negative offset + ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); + } +} #endif } // namespace test diff --git a/onnxruntime/test/testdata/ort_github_issue_12151.onnx b/onnxruntime/test/testdata/ort_github_issue_12151.onnx new file mode 100644 index 0000000000000000000000000000000000000000..f796b46f1bdc26302f4a23513f16cc8ebf77d15b GIT binary patch literal 380 zcmYk2UrWMJ0L6Fj=DfR8n-?v$5>X_W*l0wANY1tJ-%O$9zlU_$9tO49wm^L9t*_As zs?X8aXcYZ<_`!k0;T#T%66`w8x*Ax1;MYDN)-aghh`%)NEsQso=gwEI?F0l&bNJ4B zd@`ND-dIIJ`_;PbSf(fPm@(J3p8>A`VhMvO0ka$zc&6j8T;D2k#*Y+}m|0oggF-1# zp_MQYCF!N>_(Aor;cJoj0uV$rVo*eYDwuTF@6(E1uEE9Wje&0jS@{bYm2yG$`u1@&2ZtiRsXS>c6wP cG|kAjyBN&KHclZx1Pl=jVL~DyvIVE_OC literal 0 HcmV?d00001 diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py index d9515041f5df..6ae6ccee5118 100644 --- a/orttraining/orttraining/python/training/optim/_ds_modifier.py +++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py @@ -10,10 +10,11 @@ # - has_overflow_partitioned_grads_serial : https://github.com/microsoft/DeepSpeed/blob/d8e9ef6f99e27bb95e10bd146d145b3372b4cfda/deepspeed/runtime/zero/stage2.py#L1799 # -------------------------------------------------------------------------- -import torch import types import warnings from distutils.version import LooseVersion + +import torch from numpy import inf from ._modifier import FP16OptimizerModifier, check_overflow, check_overflow_for_grads @@ -27,14 +28,11 @@ def __init__(self, optimizer, **kwargs) -> None: super().__init__(optimizer) def can_be_modified(self): - try: - import deepspeed - - v = LooseVersion(deepspeed.__version__) - if v > LooseVersion("0.5.4") or v < LooseVersion("0.4.0"): - warnings.warn("Unsupported DeepSpeed version to override, skipped.", UserWarning) - return False - except Exception as _: + import deepspeed + + ds_version = LooseVersion(deepspeed.__version__) + if ds_version > LooseVersion("0.6.5") or ds_version < LooseVersion("0.4.0"): + warnings.warn("Skip modifying optimizer because of unsupported DeepSpeed version.", UserWarning) return False return self.check_requirements( @@ -141,7 +139,8 @@ def has_overflow_partitioned_grads_serial(target): #### END OF THE ORIGINAL IMPLEMENTATION #### #### THIS IS THE FASTER IMPLEMENTATION #### - for i in range(len(target.fp16_groups)): + groups = target.fp16_groups if hasattr(target, "fp16_groups") else target.bit16_groups + for i in range(len(groups)): grad_data = [grad.data for grad in target.averaged_gradients[i] if grad is not None] if check_overflow_for_grads(grad_data): return True diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py index 9897ed41210e..b3ad73110d34 100644 --- a/orttraining/orttraining/python/training/optim/_modifier.py +++ b/orttraining/orttraining/python/training/optim/_modifier.py @@ -9,6 +9,7 @@ # -------------------------------------------------------------------------- import torch +import warnings from numpy import inf from ._multi_tensor_apply import MultiTensorApply @@ -32,12 +33,16 @@ def check_requirements(self, required_funcs, require_apex=False, require_torch_n if require_torch_non_finite_check is True: _ = 
torch._amp_foreach_non_finite_check_and_unscale_ except Exception as _: + warnings.warn("Skip modifying optimizer because of Apex or torch_non_finite_check not found.", UserWarning) return False if required_funcs: for func_name in required_funcs: func = getattr(self._optimizer, func_name, None) if not func or not callable(func): + warnings.warn( + "Skip modifying optimizer because of specific function not found in optimizer.", UserWarning + ) return False return True diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py index 142999f3f72c..4291b792a460 100644 --- a/orttraining/orttraining/python/training/optim/_modifier_registry.py +++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py @@ -7,12 +7,9 @@ from ._megatron_modifier import LegacyMegatronLMModifier from ._apex_amp_modifier import ApexAMPModifier -LEAGCY_MEGATRON_LM_OPTIMIZER_NAME = "megatron.fp16.fp16.FP16_Optimizer" -DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME = "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer" -APEX_AMP_OPTIMIZER_NAME = "apex.amp.optimizer.unique_name_as_id" - OptimizerModifierTypeRegistry = { - LEAGCY_MEGATRON_LM_OPTIMIZER_NAME: LegacyMegatronLMModifier, - DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME: DeepSpeedZeROModifier, - APEX_AMP_OPTIMIZER_NAME: ApexAMPModifier, + "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier, + "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, + "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, + "apex.amp.optimizer.unique_name_as_id": ApexAMPModifier, } diff --git a/orttraining/orttraining/python/training/optim/fp16_optimizer.py b/orttraining/orttraining/python/training/optim/fp16_optimizer.py index c4c353249f1e..c3864ea711f2 100644 --- a/orttraining/orttraining/python/training/optim/fp16_optimizer.py +++ b/orttraining/orttraining/python/training/optim/fp16_optimizer.py @@ -3,6 +3,8 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +import warnings + from ._modifier_registry import OptimizerModifierTypeRegistry @@ -90,6 +92,7 @@ def get_full_qualified_type_name(o): optimizer_full_qualified_name = get_full_qualified_type_name(optimizer) if optimizer_full_qualified_name not in OptimizerModifierTypeRegistry: + warnings.warn("Skip modifying optimizer because of optimizer name not found in registry.", UserWarning) return optimizer modifier = OptimizerModifierTypeRegistry[optimizer_full_qualified_name](optimizer, **kwargs) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py index 63af43ce48eb..1459d3b86dcd 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py @@ -4,14 +4,17 @@ # -------------------------------------------------------------------------- import sys +import warnings + import torch import torch.utils.checkpoint -import warnings +from packaging import version from torch.onnx import symbolic_helper from onnxruntime.capi._pybind_state import register_torch_autograd_function -from ._fallback import _FallbackManager, ORTModuleONNXModelException, ORTModuleTorchModelException, wrap_exception + from . 
import _logger +from ._fallback import ORTModuleONNXModelException, ORTModuleTorchModelException, _FallbackManager, wrap_exception # Some autograd.Function's shouldn't be exported as PythonOp. # If CheckpointFunction is exported as PythonOp, the checkpointed computation @@ -37,7 +40,15 @@ def _export_pt_1_10(g, n, *args, **kwargs): "wrap exportable sub-nn.Module's as ORTModule." ) inplace = kwargs["inplace"] - training_mode = symbolic_helper._training_mode + # TODO move to public API once exporter team exposes that + training_mode = None + runtime_pytorch_version = version.parse(torch.__version__.split("+")[0]) + if runtime_pytorch_version > version.parse("1.11"): + from torch.onnx import _globals + + training_mode = _globals.GLOBALS.training_mode + else: + training_mode = symbolic_helper._training_mode cconv = n.cconv() input_tensor_types = [] input_requires_grads = [] diff --git a/requirements-training.txt b/requirements-training.txt index 4b1be6cef9b7..82f0331314da 100644 --- a/requirements-training.txt +++ b/requirements-training.txt @@ -4,6 +4,6 @@ h5py numpy >= 1.16.6 onnx packaging -protobuf +protobuf >= 3.12.2, <= 3.20.1 sympy setuptools>=41.4.0 diff --git a/setup.py b/setup.py index db1331917444..0805a593723c 100644 --- a/setup.py +++ b/setup.py @@ -463,10 +463,10 @@ def finalize_options(self): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ] if not enable_training: diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index f7567fbee819..08cccff5e303 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -875,7 +875,7 @@ def generate_build_tree( "-Donnxruntime_ENABLE_TRAINING_TORCH_INTEROP=" + ("ON" if args.enable_training_torch_interop else "OFF"), # Enable advanced computations such as AVX for some traininig related ops. 
"-Donnxruntime_ENABLE_CPU_FP16_OPS=" + ("ON" if args.enable_training else "OFF"), - "-Donnxruntime_USE_NCCL=" + ("OFF" if args.disable_nccl else "ON"), + "-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_training and not args.disable_nccl else "OFF"), "-Donnxruntime_BUILD_BENCHMARKS=" + ("ON" if args.build_micro_benchmarks else "OFF"), "-Donnxruntime_USE_ROCM=" + ("ON" if args.use_rocm else "OFF"), "-DOnnxruntime_GCOV_COVERAGE=" + ("ON" if args.code_coverage else "OFF"), diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index e9e8ef4b3481..da86205b52f6 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -36,7 +36,7 @@ then strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib # copy the CoreML EP header for macOS build (libs with .dylib ext) - cp $SOURCE_DIR/onnxruntime/core/providers/coreml/coreml_execution_provider.h $BINARY_DIR/$ARTIFACT_NAME/include + cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include elif [[ $LIB_NAME == *.so.* ]] then ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so diff --git a/tools/ci_build/github/windows/jar_packaging.ps1 b/tools/ci_build/github/windows/jar_packaging.ps1 index 679e27b459ef..a132ba6b26e2 100644 --- a/tools/ci_build/github/windows/jar_packaging.ps1 +++ b/tools/ci_build/github/windows/jar_packaging.ps1 @@ -16,8 +16,10 @@ Remove-Item -Path libcustom_op_library.dylib 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd pushd onnxruntime-java-linux-aarch64 +Remove-Item -Path libcustom_op_library.so 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd pushd onnxruntime-java-osx-arm64 +Remove-Item -Path libcustom_op_library.dylib 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd