[js/web/training] Added parameters methods (microsoft#18250)

### Description * Implemented: `getParametersSize`, `getContiguousParameters` (equivalent to copyParametersToBuffer), and `loadParametersBuffer` (equivalent to copyParametersFromBuffer) * as part of these changes, getParametersSize was added to the TrainingSession interface so that users know what size buffer to create for loadParametersBuffer * The parameters methods in the interface were modified to take in a Float32Array instead ### Motivation and Context * part of the work for implementing web bindings for training * enables federated learning in the web * previous PR: microsoft#18006 --------- Co-authored-by: Ashwini Khade <askhade@microsoft.com>
reinhrst · Nov 27, 2023 · dd355e3 · dd355e3
1 parent a2fd8a6
commit dd355e3
Show file tree

Hide file tree

Showing 5 changed files with 198 additions and 40 deletions.
diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts
@@ -49,8 +49,9 @@ export interface TrainingSessionHandler extends SessionHandler {
       feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType,
       options: InferenceSession.RunOptions): Promise<SessionHandler.ReturnType>;
 
+  getParametersSize(trainableOnly: boolean): Promise<number>;
   loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;
-  getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
+  getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
 }
 
 /**

diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts
@@ -176,12 +176,24 @@ export class TrainingSession implements TrainingSessionInterface {
     return this.convertHandlerReturnTypeToMapOfTensors(results);
   }
 
-  async loadParametersBuffer(_array: Uint8Array, _trainableOnly: boolean): Promise<void> {
-    throw new Error('Method not implemented.');
+  async getParametersSize(trainableOnly = true): Promise<number> {
+    return this.handler.getParametersSize(trainableOnly);
   }
 
-  async getContiguousParameters(_trainableOnly: boolean): Promise<Uint8Array> {
-    throw new Error('Method not implemented.');
+  async loadParametersBuffer(array: Uint8Array, trainableOnly = true): Promise<void> {
+    const paramsSize = await this.getParametersSize(trainableOnly);
+    // checking that the size of the Uint8Array is equivalent to the byte length of a Float32Array of the number
+    // of parameters
+    if (array.length !== 4 * paramsSize) {
+      throw new Error(
+          'Size of the buffer passed into loadParametersBuffer must match the number of parameters in ' +
+          'the model. Please use getParametersSize method to check.');
+    }
+    return this.handler.loadParametersBuffer(array, trainableOnly);
+  }
+
+  async getContiguousParameters(trainableOnly = true): Promise<OnnxValue> {
+    return this.handler.getContiguousParameters(trainableOnly);
   }
 
   async release(): Promise<void> {

diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 import {InferenceSession} from './inference-session.js';
+import {OnnxValue} from './onnx-value.js';
 import {TrainingSession as TrainingSessionImpl} from './training-session-impl.js';
 
 /* eslint-disable @typescript-eslint/no-redeclare */
@@ -49,21 +50,33 @@ export interface TrainingSession {
   // #endregion
 
   // #region copy parameters
+
+  /**
+   * Retrieves the size of all parameters for the training state. Calculates the total number of primitive (datatype of
+   * the parameters) elements of all the parameters in the training state.
+   *
+   * @param trainableOnly - When set to true, the size is calculated for trainable params only. Default value is true.
+   */
+  getParametersSize(trainableOnly: boolean): Promise<number>;
+
   /**
-   * Copies from a buffer containing parameters to the TrainingSession parameters.
+   * Copies parameter values from the given array to the training state. Currently, only supporting models with
+   * parameters of type Float32.
    *
-   * @param buffer - buffer containing parameters
-   * @param trainableOnly - True if trainable parameters only to be modified, false otherwise.
+   * @param buffer - Float32 buffer containing parameters converted to a Uint8Array.
+   * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true.
    */
   loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;
 
   /**
-   * Copies from the TrainingSession parameters to a buffer.
+   * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning.
+   * Currently, only supporting models with parameters of type Float32.
    *
-   * @param trainableOnly - True if trainable parameters only to be copied, false othrwise.
-   * @returns A promise that resolves to a buffer of the requested parameters.
+   * @param trainableOnly - When set to true, only trainable parameters are copied. Trainable parameters are parameters
+   * for which requires_grad is set to true. Default value is true.
+   * @returns A promise that resolves to a Float32 OnnxValue of the requested parameters.
    */
-  getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
+  getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
   // #endregion
 
   // #region release()

diff --git a/js/web/lib/wasm/session-handler-training.ts b/js/web/lib/wasm/session-handler-training.ts
@@ -1,20 +1,14 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {env, InferenceSession, SessionHandler, Tensor, TrainingSessionHandler} from 'onnxruntime-common';
+import {env, InferenceSession, OnnxValue, SessionHandler, Tensor, TrainingSessionHandler} from 'onnxruntime-common';
 
 import {SerializableModeldata, TensorMetadata} from './proxy-messages';
 import {decodeTensorMetadata, encodeTensorMetadata} from './session-handler-inference';
 import {createSessionAllocate, initRuntime, isOrtEnvInitialized} from './wasm-core-impl';
-import {createCheckpointHandle, createTrainingSessionHandle, releaseTrainingSessionAndCheckpoint, runTrainStep} from './wasm-training-core-impl';
+import {createCheckpointHandle, createTrainingSessionHandle, getContiguousParameters, getParametersSize, loadParametersBuffer, releaseTrainingSessionAndCheckpoint, runTrainStep} from './wasm-training-core-impl';
 
 export class OnnxruntimeWebAssemblyTrainingSessionHandler implements TrainingSessionHandler {
-  async loadParametersBuffer(_array: Uint8Array, _trainableOnly: boolean): Promise<void> {
-    throw new Error('Method not implemented.');
-  }
-  async getContiguousParameters(_trainableOnly: boolean): Promise<Uint8Array> {
-    throw new Error('Method not implemented.');
-  }
   private sessionId: number;
   private checkpointId: number;
 
@@ -124,6 +118,18 @@ export class OnnxruntimeWebAssemblyTrainingSessionHandler implements TrainingSes
     return this.convertTensorMetadataToReturnType(results, outputArray, outputIndices);
   }
 
+  async getParametersSize(trainableOnly: boolean): Promise<number> {
+    return getParametersSize(this.sessionId, trainableOnly);
+  }
+
+  async loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void> {
+    await loadParametersBuffer(this.sessionId, array, trainableOnly);
+  }
+  async getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue> {
+    const tensorResult = await getContiguousParameters(this.sessionId, trainableOnly);
+    return decodeTensorMetadata(tensorResult);
+  }
+
   async dispose(): Promise<void> {
     return releaseTrainingSessionAndCheckpoint(
         this.checkpointId, this.sessionId, this.inputEncodedNames, this.outputEncodedNames);

diff --git a/js/web/lib/wasm/wasm-training-core-impl.ts b/js/web/lib/wasm/wasm-training-core-impl.ts
@@ -6,7 +6,7 @@ import {InferenceSession, Tensor} from 'onnxruntime-common';
 import {SerializableModeldata, SerializableSessionMetadata, TensorMetadata} from './proxy-messages';
 import {setRunOptions} from './run-options';
 import {setSessionOptions} from './session-options';
-import {tensorDataTypeEnumToString, tensorTypeToTypedArrayConstructor} from './wasm-common';
+import {dataLocationStringToEnum, tensorDataTypeEnumToString, tensorDataTypeStringToEnum, tensorTypeToTypedArrayConstructor} from './wasm-common';
 import {prepareInputOutputTensor} from './wasm-core-impl';
 import {getInstance} from './wasm-factory';
 import {checkLastError} from './wasm-utils';
@@ -16,6 +16,22 @@ const NO_TRAIN_FUNCS_MSG =
     'functionality, and make sure that all the correct artifacts are built & moved to the correct folder if ' +
     'using a custom build. Check https://onnxruntime.ai/docs/build/web.html for more information.';
 
+/**
+ * Runs the checkLastError function which will throw an error, if the provided error code matches the specified
+ * pattern for an error code.
+ * @param errCode number to evaluated for if it's an error
+ * @param message message to pass into checkLastError
+ * @param checkNeqZero when true, treats not equal to zero as an error.
+ *                     When false, treats equal to zero as an error.
+ */
+const ifErrCodeCheckLastError = (errCode: number, message: string, checkNeqZero = true) => {
+  if (checkNeqZero && errCode !== 0) {
+    checkLastError(message);
+  } else if (!checkNeqZero && errCode === 0) {
+    checkLastError(message);
+  }
+};
+
 export const createCheckpointHandle = (checkpointData: SerializableModeldata): number => {
   const wasm = getInstance();
 
@@ -29,9 +45,7 @@ export const createCheckpointHandle = (checkpointData: SerializableModeldata): n
       throw new Error(NO_TRAIN_FUNCS_MSG);
     }
 
-    if (checkpointHandle === 0) {
-      checkLastError('Error occurred when trying to create a CheckpointState.');
-    }
+    ifErrCodeCheckLastError(checkpointHandle, 'Error occurred when trying to create a CheckpointState', false);
     return checkpointHandle;
   } catch (e) {
     if (wasm._OrtTrainingReleaseCheckpoint && checkpointHandle !== 0) {
@@ -52,9 +66,7 @@ const getModelInputOutputCount = (trainingSessionId: number, isEvalModel: boolea
     if (wasm._OrtTrainingGetModelInputOutputCount) {
       const errorCode =
           wasm._OrtTrainingGetModelInputOutputCount(trainingSessionId, dataOffset, dataOffset + 4, isEvalModel);
-      if (errorCode !== 0) {
-        checkLastError('Can\'t get session input/output count.');
-      }
+      ifErrCodeCheckLastError(errorCode, 'Can\'t get session input/output count.');
       return [wasm.HEAP32[dataOffset / 4], wasm.HEAP32[dataOffset / 4 + 1]];
     } else {
       throw new Error(NO_TRAIN_FUNCS_MSG);
@@ -74,9 +86,7 @@ const getModelInputOutputNamesLoop =
       for (let i = 0; i < count; i++) {
         if (wasm._OrtTrainingGetModelInputOutputName) {
           const name = wasm._OrtTrainingGetModelInputOutputName(trainingSessionId, i, isInput, isEvalModel);
-          if (name === 0) {
-            checkLastError('Can\'t get input or output name');
-          }
+          ifErrCodeCheckLastError(name, `Can't get input or output name -- is input: ${isInput}, index ${i}`, false);
 
           namesUTF8Encoded.push(name);
           names.push(wasm.UTF8ToString(name));
@@ -122,9 +132,7 @@ export const createTrainingSessionHandle =
           throw new Error(NO_TRAIN_FUNCS_MSG);
         }
 
-        if (trainingSessionHandle === 0) {
-          checkLastError('Error occurred when trying to create a TrainingSession.');
-        }
+        ifErrCodeCheckLastError(trainingSessionHandle, 'Error occurred when trying to create a TrainingSession', false);
 
         [inputNames, inputNamesUTF8Encoded, outputNames, outputNamesUTF8Encoded] =
             getTrainingModelInputOutputNames(trainingSessionHandle);
@@ -213,9 +221,8 @@ const moveOutputToTensorMetadataArr =
         try {
           const errorCode = wasm._OrtGetTensorData(
               tensor, tensorDataOffset, tensorDataOffset + 4, tensorDataOffset + 8, tensorDataOffset + 12);
-          if (errorCode !== 0) {
-            checkLastError(`Can't access output tensor data on index ${i}.`);
-          }
+          ifErrCodeCheckLastError(errorCode, `Can't access output tensor data on index ${i}.`);
+
           let tensorDataIndex = tensorDataOffset / 4;
           const dataType = wasm.HEAPU32[tensorDataIndex++];
           dataOffset = wasm.HEAPU32[tensorDataIndex++];
@@ -290,10 +297,7 @@ export const runTrainStep = async(
     if (wasm._OrtTrainingRunTrainStep) {
       const errorCode = wasm._OrtTrainingRunTrainStep(
           trainingSessionId, inputValuesOffset, inputCount, outputValuesOffset, outputCount, runOptionsHandle);
-
-      if (errorCode !== 0) {
-        checkLastError('failed to call OrtTrainingRunTrainStep in the WebAssembly layer');
-      }
+      ifErrCodeCheckLastError(errorCode, 'failed to call OrtTrainingRunTrainStep in the WebAssembly layer');
     } else {
       throw new Error(NO_TRAIN_FUNCS_MSG);
     }
@@ -313,6 +317,128 @@ export const runTrainStep = async(
   }
 };
 
+export const getParametersSize = (trainingSessionId: number, trainableOnly: boolean): number => {
+  const wasm = getInstance();
+  const stack = wasm.stackSave();
+
+  try {
+    const sizeOffset = wasm.stackAlloc(4);
+    if (wasm._OrtTrainingGetParametersSize) {
+      const errorCode = wasm._OrtTrainingGetParametersSize(trainingSessionId, sizeOffset, trainableOnly);
+      ifErrCodeCheckLastError(errorCode, 'Can\'t get parameters size');
+
+      return wasm.HEAP32[sizeOffset / 4];
+    } else {
+      throw new Error(NO_TRAIN_FUNCS_MSG);
+    }
+  } finally {
+    wasm.stackRestore(stack);
+  }
+};
+
+export const getContiguousParameters =
+    async(trainingSessionId: number, trainableOnly: boolean): Promise<TensorMetadata> => {
+  const wasm = getInstance();
+  const stack = wasm.stackSave();
+
+  const tensorTypeAsString = 'float32';
+  const locationAsString = 'cpu';
+
+  const parametersSize = getParametersSize(trainingSessionId, trainableOnly);
+  let tensor = 0;
+
+  // allocates a buffer of the correct size on the WASM heap
+  const paramsByteLength = 4 * parametersSize;
+  const paramsOffset = wasm._malloc(paramsByteLength);
+
+  // handles the dimensions-related createTensor parameters
+  const dims = [parametersSize];
+
+  const dimsOffset = wasm.stackAlloc(4);
+  const dimsIndex = dimsOffset / 4;
+  wasm.HEAP32[dimsIndex] = parametersSize;
+
+  try {
+    // wraps allocated array in a tensor
+    tensor = wasm._OrtCreateTensor(
+        tensorDataTypeStringToEnum(tensorTypeAsString), paramsOffset, paramsByteLength, dimsOffset, dims.length,
+        dataLocationStringToEnum(locationAsString));
+    ifErrCodeCheckLastError(
+        tensor, `Can't create tensor for getContiguousParameters. session=${trainingSessionId}.`, false);
+
+    if (wasm._OrtTrainingCopyParametersToBuffer) {
+      const errCode = wasm._OrtTrainingCopyParametersToBuffer(trainingSessionId, tensor, parametersSize, trainableOnly);
+      ifErrCodeCheckLastError(errCode, 'Can\'t get contiguous parameters.');
+
+    } else {
+      throw new Error(NO_TRAIN_FUNCS_MSG);
+    }
+
+    // copies from WASM memory to a JavaScript typed array, which is then put into a TensorMetadata object
+    const typedArrayConstructor = tensorTypeToTypedArrayConstructor(tensorTypeAsString);
+    const data = new typedArrayConstructor(parametersSize);
+    const output: TensorMetadata[] = [];
+    new Uint8Array(data.buffer, data.byteOffset, data.byteLength)
+        .set(wasm.HEAPU8.subarray(paramsOffset, paramsOffset + paramsByteLength));
+    output.push([tensorTypeAsString, dims, data, locationAsString]);
+    if (output.length !== 1) {
+      throw new Error(`something unexpected happened in the getContiguousParameters function. Expected output length of
+     one, got ${output.length}`);
+    } else {
+      return output[0];
+    }
+  } finally {
+    if (tensor !== 0) {
+      wasm._OrtReleaseTensor(tensor);
+    }
+    wasm._free(paramsOffset);
+    wasm._free(dimsOffset);
+    wasm.stackRestore(stack);
+  }
+};
+
+export const loadParametersBuffer =
+    async(trainingSessionId: number, buffer: Uint8Array, trainableOnly: boolean): Promise<void> => {
+  const wasm = getInstance();
+  const stack = wasm.stackSave();
+
+  const tensorTypeAsString = 'float32';
+  const locationAsString = 'cpu';
+
+  // allocates & copies JavaScript buffer to WASM heap
+  const bufferByteLength = buffer.length;
+  const bufferCount = bufferByteLength / 4;
+  const bufferOffset = wasm._malloc(bufferByteLength);
+  wasm.HEAPU8.set(buffer, bufferOffset);
+
+  // allocates and handles moving dimensions information to WASM memory
+  const dimsOffset = wasm.stackAlloc(4);
+  wasm.HEAP32[dimsOffset / 4] = bufferCount;
+  const dimsLength = 1;
+  let tensor = 0;
+
+  try {
+    tensor = wasm._OrtCreateTensor(
+        tensorDataTypeStringToEnum(tensorTypeAsString), bufferOffset, bufferByteLength, dimsOffset, dimsLength,
+        dataLocationStringToEnum(locationAsString));
+    ifErrCodeCheckLastError(tensor, `Can't create tensor for input/output. session=${trainingSessionId}`, false);
+
+    if (wasm._OrtTrainingCopyParametersFromBuffer) {
+      const errCode = wasm._OrtTrainingCopyParametersFromBuffer(trainingSessionId, tensor, bufferCount, trainableOnly);
+      ifErrCodeCheckLastError(errCode, 'Can\'t copy buffer to parameters.');
+    } else {
+      throw new Error(NO_TRAIN_FUNCS_MSG);
+    }
+  } finally {
+    if (tensor !== 0) {
+      wasm._OrtReleaseTensor(tensor);
+    }
+    wasm.stackRestore(stack);
+    wasm._free(bufferOffset);
+    wasm._free(dimsOffset);
+  }
+};
+
 export const releaseTrainingSessionAndCheckpoint =
     (checkpointId: number, sessionId: number, inputNamesUTF8Encoded: number[], outputNamesUTF8Encoded: number[]):
         void => {