[WIP] MLTable conversion (#108)

* Adding requirements
* Starting to put together the Adult dataset in MLTable
* Forgot to update Dockerfile....
* Fix MLTable paths
* Try bumping SDKv1 version
* Fix inputs in the YAML pipeline
* Try tweaking the training script...
* Convert RAI data loading
* Change causal test to use MLTable data
* Need to change conftest too....
* Stupid mistake....
* More on the same typo....
* Switch counterfactual all args classification
* Try to switch some more tests
* Accepting either input
* The internal data will be parquet....
* Fixes for registration script
* I had the wrong call?
* Change Programmer notebook to MLTable
* Need to fix the path to the MLTable datasets
* Too much data
* Why is second replacement not working?
* Need to set 'f' format
* Remove test tabular dataset test for mltable migration.

Signed-off-by: Richard Edgar <riedgar@microsoft.com>
Co-authored-by: kicha0 <kicha@microsoft.com>
riedgar-ms and kicha0 committed Jul 21, 2022
1 parent 0240448 commit 591067b
Showing 24 changed files with 150 additions and 299 deletions.
8 changes: 4 additions & 4 deletions examples/CLI/pipeline_rai_adult.yaml
@@ -5,12 +5,12 @@ type: pipeline
inputs:
target_column_name: income
my_training_data:
type: uri_file
path: azureml:adult_train_pq:1
type: mltable
path: azureml:adult_train:1
mode: download
my_test_data:
type: uri_file
path: azureml:adult_test_pq:1
type: mltable
path: azureml:adult_test:1
mode: download

settings:
6 changes: 6 additions & 0 deletions examples/notebooks/data-programmer-regression/test/MLTable
@@ -0,0 +1,6 @@
$schema: http://azureml/sdk-2-0/MLTable.json
type: mltable
paths:
- file: ./programmers-test.parquet
transformations:
- read_parquet
6 changes: 6 additions & 0 deletions examples/notebooks/data-programmer-regression/train/MLTable
@@ -0,0 +1,6 @@
$schema: http://azureml/sdk-2-0/MLTable.json
type: mltable
paths:
- file: ./programmers-train.parquet
transformations:
- read_parquet
@@ -65,7 +65,7 @@
"source": [
"## Accessing the Data\n",
"\n",
"We supply the synthetic data as a pair of parquet files. We can read them in and take a brief look:"
"We supply the synthetic data as a pair of parquet files, each with an accompanying `MLTable` file. We can read them in and take a brief look:"
]
},
{
@@ -94,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_path = 'data/programmers-train.parquet'"
"train_data_path = 'data-programmer-regression/train/'"
]
},
{
@@ -104,7 +104,7 @@
"metadata": {},
"outputs": [],
"source": [
"test_data_path = 'data/programmers-test.parquet'"
"test_data_path = 'data-programmer-regression/test/'"
]
},
{
@@ -122,7 +122,10 @@
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.read_parquet(train_data_path)\n",
"import mltable\n",
"\n",
"tbl = mltable.load(train_data_path)\n",
"train_df: pd.DataFrame = tbl.to_pandas_dataframe()\n",
"\n",
"display(train_df)"
]
@@ -184,12 +187,12 @@
"from azure.ai.ml.entities import Data\n",
"from azure.ai.ml.constants import AssetTypes\n",
"\n",
"input_train_data = \"Programmers_Train_Data\"\n",
"input_test_data = \"Programmers_Test_Data\"\n",
"input_train_data = \"Programmers_Train_MLTable\"\n",
"input_test_data = \"Programmers_Test_MLTable\"\n",
"\n",
"train_data = Data(\n",
" path=train_data_path,\n",
" type=AssetTypes.URI_FILE,\n",
" type=AssetTypes.MLTABLE,\n",
" description=\"RAI programmers training data\",\n",
" name=input_train_data,\n",
" version=rai_programmer_example_version_string,\n",
@@ -198,7 +201,7 @@
"\n",
"test_data = Data(\n",
" path=test_data_path,\n",
" type=AssetTypes.URI_FILE,\n",
" type=AssetTypes.MLTABLE,\n",
" description=\"RAI programmers test data\",\n",
" name=input_test_data,\n",
" version=rai_programmer_example_version_string,\n",
@@ -258,6 +261,8 @@
"import mlflow\n",
"import mlflow.sklearn\n",
"\n",
"import mltable\n",
"\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.pipeline import Pipeline\n",
@@ -314,7 +319,8 @@
" \n",
" # Read in data\n",
" print(\"Reading data\")\n",
" all_data = pd.read_parquet(args.training_data)\n",
" tbl = mltable.load(args.training_data)\n",
" all_data = tbl.to_pandas_dataframe()\n",
"\n",
" print(\"Extracting X_train, y_train\")\n",
" print(\"all_data cols: {0}\".format(all_data.columns))\n",
@@ -488,11 +494,11 @@
"train_model_component = ml_client.components.get(\n",
" name=\"rai_programmers_training_component\", version=rai_programmer_example_version_string\n",
")\n",
"programmers_train_pq = Input(\n",
" type=\"uri_file\", path=f\"{input_train_data}:{rai_programmer_example_version_string}\", mode=\"download\"\n",
"programmers_train_mltable = Input(\n",
" type=\"mltable\", path=f\"{input_train_data}:{rai_programmer_example_version_string}\", mode=\"download\"\n",
")\n",
"programmers_test_pq = Input(\n",
" type=\"uri_file\", path=f\"{input_test_data}:{rai_programmer_example_version_string}\", mode=\"download\"\n",
"programmers_test_mltable = Input(\n",
" type=\"mltable\", path=f\"{input_test_data}:{rai_programmer_example_version_string}\", mode=\"download\"\n",
")\n",
"\n",
"@dsl.pipeline(\n",
@@ -503,7 +509,7 @@
"def my_training_pipeline(target_column_name, training_data):\n",
" trained_model = train_component_definition(\n",
" target_column_name=target_column_name,\n",
" training_data=programmers_train_pq\n",
" training_data=training_data\n",
" )\n",
" trained_model.set_limits(timeout=120)\n",
"\n",
@@ -515,7 +521,7 @@
"\n",
" return {}\n",
"\n",
"model_registration_pipeline_job = my_training_pipeline(target_column_name, programmers_train_pq)"
"model_registration_pipeline_job = my_training_pipeline(target_column_name, programmers_train_mltable)"
]
},
{
@@ -732,8 +738,8 @@
"\n",
"insights_pipeline_job = rai_programmer_regression_pipeline(\n",
" target_column_name=target_column_name,\n",
" train_data=programmers_train_pq,\n",
" test_data=programmers_test_pq,\n",
" train_data=programmers_train_mltable,\n",
" test_data=programmers_test_mltable,\n",
")\n",
"\n",
"rand_path = str(uuid.uuid4())\n",
@@ -845,12 +851,12 @@
"inputs:\n",
" target_column_name: {target_column_name}\n",
" my_training_data:\n",
" type: uri_file\n",
" type: mltable\n",
" path: azureml:{input_train_data}:{rai_programmer_example_version_string}\n",
" mode: download\n",
" my_test_data:\n",
" type: uri_file\n",
" path: azureml:{input_train_data}:{rai_programmer_example_version_string}\n",
" type: mltable\n",
" path: azureml:{input_test_data}:{rai_programmer_example_version_string}\n",
" mode: download\n",
"\n",
"settings:\n",
@@ -966,6 +972,7 @@
}
],
"metadata": {
"celltoolbar": "Raw Cell Format",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
3 changes: 3 additions & 0 deletions requirements-dev.txt
@@ -1,6 +1,9 @@
azure-common
--extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-cli-v2
azure-ai-ml
mltable
azureml_dataprep
azureml_dataprep_rslex
jupyter
pandas
pyarrow
3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,5 +1,8 @@
azure-common
azure-ai-ml==0.1.0b4
mltable==0.1.0b3
azureml_dataprep
azureml_dataprep_rslex
responsibleai~=0.18.0
raiwidgets~=0.18.0
jupyter
13 changes: 8 additions & 5 deletions src/responsibleai/docker_env/Dockerfile
@@ -16,14 +16,17 @@ RUN pip install 'responsibleai~=0.18.1' \
'raiwidgets~=0.18.1' \
'pyarrow' \
'mlflow' \
'azureml-core==1.41.0.post1' \
'azureml-dataset-runtime==1.41.0' \
'azureml-mlflow==1.41.0' \
'azureml-telemetry==1.41.0' \
'azureml-core~=1.42.0' \
'azureml-dataset-runtime~=1.42.0' \
'azureml-mlflow~=1.42.0' \
'azureml-telemetry~=1.42.0' \
'pdfkit==1.0.0' \
'plotly==5.6.0' \
'kaleido==0.2.1' \
'protobuf<4'
'protobuf<4' \
'mltable' \
'azureml_dataprep' \
'azureml_dataprep_rslex'

RUN pip install --pre azure-ai-ml

25 changes: 24 additions & 1 deletion src/responsibleai/rai_analyse/rai_component_utilities.py
@@ -17,6 +17,8 @@

import mlflow

import mltable

from azureml.core import Model, Run, Workspace

from responsibleai import RAIInsights, __version__ as responsibleai_version
@@ -68,9 +70,30 @@ def load_mlflow_model(workspace: Workspace, model_id: str) -> Any:
return mlflow.pyfunc.load_model(model_uri)._model_impl


def load_dataset(parquet_path: str):
def load_mltable(mltable_path: str) -> pd.DataFrame:
_logger.info("Loading MLTable: {0}".format(mltable_path))
df: pd.DataFrame = None
try:
tbl = mltable.load(mltable_path)
df: pd.DataFrame = tbl.to_pandas_dataframe()
except Exception as e:
_logger.info("Failed to load MLTable")
_logger.info(e)
return df


def load_parquet(parquet_path: str) -> pd.DataFrame:
_logger.info("Loading parquet file: {0}".format(parquet_path))
df = pd.read_parquet(parquet_path)
return df


def load_dataset(dataset_path: str) -> pd.DataFrame:
_logger.info(f"Attempting to load: {dataset_path}")
df = load_mltable(dataset_path)
if df is None:
df = load_parquet(dataset_path)

print(df.dtypes)
print(df.head(10))
return df
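
The new `load_dataset` tries `mltable.load` first and falls back to plain parquet when that yields nothing. The same try-in-order idea can be sketched as a generic helper (a sketch only; the loader names are illustrative):

```python
from typing import Any, Callable, Sequence, Tuple

def load_with_fallback(
    path: str,
    loaders: Sequence[Tuple[str, Callable[[str], Any]]],
) -> Any:
    # Try each (name, loader) in order; the first non-None, non-raising
    # result wins. Mirrors load_dataset above, which attempts MLTable
    # before falling back to pd.read_parquet.
    for name, loader in loaders:
        try:
            result = loader(path)
        except Exception:
            continue
        if result is not None:
            return result
    raise ValueError(f"No loader could read: {path}")
```

This keeps the component accepting either input type during the migration, as the commit message notes.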
5 changes: 4 additions & 1 deletion test/components/src_train_logreg/train.py
@@ -13,6 +13,8 @@
import mlflow
import mlflow.sklearn

import mltable

import pandas as pd
from sklearn.linear_model import LogisticRegression

@@ -47,7 +49,8 @@ def main(args):

# Read in data
print("Reading data")
all_data = pd.read_parquet(args.training_data)
tbl = mltable.load(args.training_data)
all_data = tbl.to_pandas_dataframe()

print("Extracting X_train, y_train")
print("all_data cols: {0}".format(all_data.columns))
6 changes: 3 additions & 3 deletions test/conftest.py
@@ -113,8 +113,8 @@ def registered_adult_model_id(ml_client, component_config):
register_component = ml_client.components.get(
name="register_model", version=version_string
)
adult_train_pq = Input(
type="uri_file", path=f"adult_train_pq:{version_string}", mode="download"
adult_train = Input(
type="mltable", path=f"adult_train:{version_string}", mode="download"
)

@dsl.pipeline(
@@ -137,7 +137,7 @@ def my_training_pipeline(target_column_name, training_data):

return {}

training_pipeline = my_training_pipeline("income", adult_train_pq)
training_pipeline = my_training_pipeline("income", adult_train)

training_pipeline_job = submit_and_wait(ml_client, training_pipeline)
assert training_pipeline_job is not None
6 changes: 3 additions & 3 deletions test/data/adult/data_adult_test.yaml
@@ -1,6 +1,6 @@
$schema: https://azuremlsdk2.blob.core.windows.net/latest/asset.schema.json
name: adult_test_pq
name: adult_test
version: VERSION_REPLACEMENT_STRING
type: uri_file
type: mltable
description: Adult Census dataset, provided via shap. Training sample
path: ./adult_test.parquet
path: ./test
6 changes: 3 additions & 3 deletions test/data/adult/data_adult_train.yaml
@@ -1,6 +1,6 @@
$schema: https://azuremlsdk2.blob.core.windows.net/latest/asset.schema.json
name: adult_train_pq
name: adult_train
version: VERSION_REPLACEMENT_STRING
type: uri_file
type: mltable
description: Adult Census dataset, provided via shap. Training sample
path: ./adult_train.parquet
path: ./train/
4 changes: 2 additions & 2 deletions test/data/adult/fetch_adult.py
@@ -16,5 +16,5 @@

# Don't write out the row indices to the CSV.....
print("Saving to files")
data_train.to_parquet("adult_train.parquet", index=False)
data_test.to_parquet("adult_test.parquet", index=False)
data_train.to_parquet("./train/adult_train.parquet", index=False)
data_test.to_parquet("./test/adult_test.parquet", index=False)
6 changes: 6 additions & 0 deletions test/data/adult/test/MLTable
@@ -0,0 +1,6 @@
$schema: http://azureml/sdk-2-0/MLTable.json
type: mltable
paths:
- file: ./adult_test.parquet
transformations:
- read_parquet
6 changes: 6 additions & 0 deletions test/data/adult/train/MLTable
@@ -0,0 +1,6 @@
$schema: http://azureml/sdk-2-0/MLTable.json
type: mltable
paths:
- file: ./adult_train.parquet
transformations:
- read_parquet
15 changes: 8 additions & 7 deletions test/notebooks/test_notebooks.py
@@ -17,6 +17,7 @@ def update_cells(input_nb_path, output_nb_path, replacement_strings: Dict[str, s
for cell in notebook["cells"]:
for original, update in replacement_strings.items():
if cell["source"] == original:
print(f"Replacing ---{original}--- with ---{update}---")
cell["source"] = update

nbf.write(notebook, output_nb_path)
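
`update_cells` only swaps a cell whose source matches a replacement key verbatim, which is why the replacement strings below must track the notebook text exactly. The core matching step can be sketched with plain dicts, without `nbformat` (a sketch; the function name is illustrative):

```python
def replace_matching_cells(notebook: dict, replacements: dict) -> int:
    # Exact-match replacement of cell sources, mirroring update_cells above.
    # Returns how many cells were rewritten; 0 signals a key has drifted
    # out of sync with the notebook text.
    hits = 0
    for cell in notebook["cells"]:
        update = replacements.get(cell["source"])
        if update is not None:
            cell["source"] = update
            hits += 1
    return hits
```

A return value of 0 is exactly the "second replacement not working" failure mode mentioned in the commit messages: the key no longer matches the cell verbatim.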
@@ -100,22 +101,22 @@ def test_responsibleaidashboard_programmer_regression_model_debugging(

current_file_directory = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.abspath(
os.path.join(current_file_directory, "../..", "examples/notebooks/data")
os.path.join(current_file_directory, "../..", "examples/notebooks")
)
train_filename = "programmers-train.parquet"
test_filename = "programmers-test.parquet"
train_path = 'data-programmer-regression/train/'
test_path = 'data-programmer-regression/test/'

replacements = dict()
replacements["version_string = '1'"] = f"version_string = '{version_string}'"
replacements[
"rai_programmer_example_version_string = '5'"
] = f"rai_programmer_example_version_string = '{train_version_string}'"
replacements[
"train_data_path = 'data/programmers-train.parquet'"
] = f'train_data_path = r"{os.path.join(data_dir, train_filename)}"'
f"train_data_path = '{train_path}'"
] = f'train_data_path = r"{os.path.join(data_dir, train_path)}"'
replacements[
"test_data_path = 'data/programmers-test.parquet'"
] = f'test_data_path = r"{os.path.join(data_dir, test_filename)}"'
f"test_data_path = '{test_path}'"
] = f'test_data_path = r"{os.path.join(data_dir, test_path)}"'

assay_one_notebook(nb_name, dict(), replacements)

