Fix mlflow#825: add matplotlib to fix missing module error under dock…

…er image (mlflow#827) * add matplotlib to fix missing module error under docker * restored prior version * Initial check-in * adapted to MLProject structure * Split example into platform-specific subdirectories * Add README explaining platform differences * README links * readme link fix * Revert formatting changes to java readme
marcusrehm · Feb 18, 2019 · 96b493c · 96b493c
1 parent 748d1f7
commit 96b493c
Show file tree

Hide file tree

Showing 7 changed files with 174 additions and 1 deletion.
diff --git a/examples/sklearn_elasticnet_diabetes/README.md b/examples/sklearn_elasticnet_diabetes/README.md
@@ -0,0 +1,3 @@
+# Scikit-learn ElasticNet Diabetes Example
+
+This example trains an ElasticNet regression model for predicting diabetes progression. The example uses [matplotlib](https://matplotlib.org/), which requires different Python dependencies for Linux and OSX. The [linux](linux) and [osx](osx) subdirectories include appropriate MLflow projects for each respective platform.
diff --git a/examples/sklearn_elasticnet_diabetes/linux/MLproject b/examples/sklearn_elasticnet_diabetes/linux/MLproject
@@ -0,0 +1,10 @@
+name: tutorial
+
+conda_env: conda.yaml
+
+entry_points:
+  main:
+    parameters:
+      alpha: {type: float, default: 0.01}
+      l1_ratio: {type: float, default: 0.1}
+    command: "python train_diabetes.py {alpha} {l1_ratio}"
diff --git a/examples/sklearn_elasticnet_diabetes/linux/conda.yaml b/examples/sklearn_elasticnet_diabetes/linux/conda.yaml
@@ -0,0 +1,12 @@
+name: tutorial
+channels:
+  - defaults
+dependencies:
+  - cloudpickle=0.6.1
+  - python=3.6
+  - numpy=1.14.3
+  - matplotlib=3.0.2
+  - pandas=0.22.0
+  - scikit-learn=0.19.1
+  - pip:
+    - mlflow
diff --git a/...arn_elasticnet_diabetes/train_diabetes.py → ...asticnet_diabetes/linux/train_diabetes.py b/...arn_elasticnet_diabetes/train_diabetes.py → ...asticnet_diabetes/linux/train_diabetes.py
@@ -122,4 +122,4 @@ def eval_metrics(actual, pred):
     plt.close(fig)
 
     # Log artifacts (output files)
-    mlflow.log_artifact("ElasticNet-paths.png")
+    mlflow.log_artifact("ElasticNet-paths.png")
diff --git a/examples/sklearn_elasticnet_diabetes/osx/MLproject b/examples/sklearn_elasticnet_diabetes/osx/MLproject
@@ -0,0 +1,10 @@
+name: tutorial
+
+conda_env: conda.yaml
+
+entry_points:
+  main:
+    parameters:
+      alpha: {type: float, default: 0.01}
+      l1_ratio: {type: float, default: 0.1}
+    command: "pythonw train_diabetes.py {alpha} {l1_ratio}"
diff --git a/examples/sklearn_elasticnet_diabetes/osx/conda.yaml b/examples/sklearn_elasticnet_diabetes/osx/conda.yaml
@@ -0,0 +1,13 @@
+name: tutorial
+channels:
+  - defaults
+dependencies:
+  - cloudpickle=0.6.1
+  - python=3.6
+  - numpy=1.14.3
+  - matplotlib=3.0.2
+  - pandas=0.22.0
+  - scikit-learn=0.19.1
+  - python.app
+  - pip:
+    - mlflow
diff --git a/examples/sklearn_elasticnet_diabetes/osx/train_diabetes.py b/examples/sklearn_elasticnet_diabetes/osx/train_diabetes.py
@@ -0,0 +1,125 @@
+#
+# train_diabetes.py
+#
+#   MLflow model using ElasticNet (sklearn) and Plots ElasticNet Descent Paths
+#
+#   Uses the sklearn Diabetes dataset to predict diabetes progression using ElasticNet
+#       The predicted "progression" column is a quantitative measure of disease progression one year after baseline
+#       http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html
+#   Combines the above with the Lasso Coordinate Descent Path Plot
+#       http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_coordinate_descent_path.html
+#       Original author: Alexandre Gramfort <alexandre.gramfort@inria.fr>; License: BSD 3 clause
+#
+#  Usage:
+#    pythonw train_diabetes.py 0.01 0.01
+#    pythonw train_diabetes.py 0.01 0.75
+#    pythonw train_diabetes.py 0.01 1.0
+#
+
+import os
+import warnings
+import sys
+
+import pandas as pd
+import numpy as np
+from itertools import cycle
+import matplotlib.pyplot as plt
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import ElasticNet
+from sklearn.linear_model import lasso_path, enet_path
+from sklearn import datasets
+
+# Load the scikit-learn diabetes dataset: X holds the features, y the target values
+diabetes = datasets.load_diabetes()
+X = diabetes.data
+y = diabetes.target
+
+# Build one pandas DataFrame with the feature columns followed by a 'progression' target column
+Y = np.array([y]).transpose()  # wrap y in an outer list and transpose so the targets form a single column
+d = np.concatenate((X, Y), axis=1)  # append that target column to the right of the feature matrix
+cols = diabetes.feature_names + ['progression']  # column labels: dataset feature names, then the target
+data = pd.DataFrame(d, columns=cols)  # module-level frame that the __main__ section splits into train/test
+
+
+# Import mlflow
+import mlflow
+import mlflow.sklearn
+
+
+# Evaluate metrics
+def eval_metrics(actual, pred):  # compare true values `actual` against predictions `pred`
+    rmse = np.sqrt(mean_squared_error(actual, pred))  # root mean squared error
+    mae = mean_absolute_error(actual, pred)  # mean absolute error
+    r2 = r2_score(actual, pred)  # R^2 score as computed by sklearn.metrics.r2_score
+    return rmse, mae, r2  # returned as a tuple, in the order (rmse, mae, r2)
+
+
+
+if __name__ == "__main__":  # script entry point: train, log to MLflow, then plot the ElasticNet path
+    warnings.filterwarnings("ignore")  # silence all warnings for the rest of the run
+    np.random.seed(40)  # seed NumPy's global random number generator
+
+    # Split the data into training and test sets. (0.75, 0.25) split.
+    train, test = train_test_split(data)
+
+    # The predicted column is "progression", a quantitative measure of disease progression one year after baseline
+    train_x = train.drop(["progression"], axis=1)  # features only
+    test_x = test.drop(["progression"], axis=1)
+    train_y = train[["progression"]]  # double brackets keep the target as a one-column DataFrame
+    test_y = test[["progression"]]
+
+    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.05  # first CLI argument; 0.05 when omitted
+    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.05  # second CLI argument; 0.05 when omitted
+
+    # Fit ElasticNet on the training split and score it on the test split
+    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
+    lr.fit(train_x, train_y)
+    predicted_qualities = lr.predict(test_x)  # predictions for the held-out test features
+    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
+
+    # Print out ElasticNet model metrics
+    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
+    print("  RMSE: %s" % rmse)
+    print("  MAE: %s" % mae)
+    print("  R2: %s" % r2)
+
+    # Log the hyperparameters, metrics, and fitted model to MLflow
+    mlflow.log_param("alpha", alpha)
+    mlflow.log_param("l1_ratio", l1_ratio)
+    mlflow.log_metric("rmse", rmse)
+    mlflow.log_metric("r2", r2)
+    mlflow.log_metric("mae", mae)
+    mlflow.sklearn.log_model(lr, "model")  # stored under the artifact path "model"
+
+
+    # Compute the regularization path
+    eps = 5e-3  # the smaller it is, the longer the path
+
+    print("Computing regularization path using the elastic net.")
+    alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False)  # uses the full X, y, not the training split
+
+    # Plot one dashed line per row of coefs_enet against -log10(alpha)
+    fig = plt.figure(1)
+    ax = plt.gca()  # NOTE(review): ax is not referenced again in this script
+
+    colors = cycle(['b', 'r', 'g', 'c', 'k'])  # endless color sequence; zip below stops when coefs_enet is exhausted
+    neg_log_alphas_enet = -np.log10(alphas_enet)
+    for coef_e, c in zip(coefs_enet, colors):
+        l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)  # NOTE(review): l2 is assigned but never read
+
+    plt.xlabel('-Log(alpha)')
+    plt.ylabel('coefficients')
+    title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio)
+    plt.title(title)
+    plt.axis('tight')
+
+
+    # Save the figure to the current working directory
+    fig.savefig("ElasticNet-paths.png")
+
+    # Close plot
+    plt.close(fig)
+
+    # Log the saved plot file as an MLflow artifact
+    mlflow.log_artifact("ElasticNet-paths.png")