Skip to content

Commit

Permalink
Update fetch_project to no longer remove mlruns from the working directory
Browse files Browse the repository at this point in the history
  • Loading branch information
smurching authored and aarondav committed Jul 30, 2018
1 parent fd36126 commit db79c58
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 90 deletions.
5 changes: 0 additions & 5 deletions mlflow/projects/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,6 @@ def _fetch_project(uri, subdirectory, version, dst_dir, git_username, git_passwo
if uri != dst_dir:
dir_util.copy_tree(src=uri, dst=dst_dir)

# Make sure they don't have an outputs or mlruns directory (will need to change if we change
# how we log results locally)
shutil.rmtree(os.path.join(dst_dir, "outputs"), ignore_errors=True)
shutil.rmtree(os.path.join(dst_dir, "mlruns"), ignore_errors=True)

# Make sure there is a MLproject file in the specified working directory.
if not os.path.isfile(os.path.join(dst_dir, subdirectory, "MLproject")):
if subdirectory == '':
Expand Down
167 changes: 82 additions & 85 deletions tests/projects/test_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from mlflow.entities.run_status import RunStatus
from mlflow.projects import ExecutionException
from mlflow.store.file_store import FileStore
from mlflow.utils.file_utils import TempDir
from mlflow.utils import env

from tests.projects.utils import TEST_PROJECT_DIR, GIT_PROJECT_URI, TEST_DIR, validate_exit_status
from tests.projects.utils import tracking_uri_mock # pylint: disable=unused-import


def _assert_dirs_equal(expected, actual):
Expand Down Expand Up @@ -88,6 +88,18 @@ def test_fetch_project(tmpdir):
dst_dir=dst_dir, git_username=None, git_password=None)


def test_dont_remove_mlruns(tmpdir):
    """Fetching a local project must leave a pre-existing "mlruns" folder untouched."""
    project_src = tmpdir.mkdir("mlruns-src-dir")
    project_src.join("MLproject").write("dummy MLproject contents")
    project_src.mkdir("mlruns").join("some-file.txt").write("hi")
    work_dir = tmpdir.join("mlruns-work-dir").strpath
    mlflow.projects._fetch_project(
        uri=project_src.strpath, subdirectory="", version=None,
        dst_dir=work_dir, git_username=None, git_password=None)
    # The destination must be an exact copy of the source, "mlruns" included.
    _assert_dirs_equal(expected=project_src.strpath, actual=work_dir)


def test_parse_subdirectory():
# Make sure the parsing works as intended.
test_uri = "uri#subdirectory"
Expand All @@ -101,26 +113,68 @@ def test_parse_subdirectory():
mlflow.projects._parse_subdirectory(period_fail_uri)


def test_invalid_run_mode():
def test_invalid_run_mode(tracking_uri_mock): # pylint: disable=unused-argument
""" Verify that we raise an exception given an invalid run mode """
with TempDir() as tmp, mock.patch("mlflow.tracking.get_tracking_uri") as get_tracking_uri_mock:
get_tracking_uri_mock.return_value = tmp.path()
with pytest.raises(ExecutionException):
mlflow.projects.run(uri=TEST_PROJECT_DIR, mode="some unsupported mode")
with pytest.raises(ExecutionException):
mlflow.projects.run(uri=TEST_PROJECT_DIR, mode="some unsupported mode")


def test_use_conda():
def test_use_conda(tracking_uri_mock): # pylint: disable=unused-argument
""" Verify that we correctly handle the `use_conda` argument."""
with TempDir() as tmp, mock.patch("mlflow.tracking.get_tracking_uri") as get_tracking_uri_mock:
get_tracking_uri_mock.return_value = tmp.path()
# Verify we throw an exception when conda is unavailable
old_path = os.environ["PATH"]
env.unset_variable("PATH")
try:
with pytest.raises(ExecutionException):
mlflow.projects.run(TEST_PROJECT_DIR, use_conda=True)
finally:
os.environ["PATH"] = old_path
# Verify we throw an exception when conda is unavailable
old_path = os.environ["PATH"]
env.unset_variable("PATH")
try:
with pytest.raises(ExecutionException):
mlflow.projects.run(TEST_PROJECT_DIR, use_conda=True)
finally:
os.environ["PATH"] = old_path


@pytest.mark.skip(reason="flaky running in travis")
@pytest.mark.parametrize("use_start_run", map(str, [0, 1]))
def test_run(tmpdir, tracking_uri_mock, use_start_run):  # pylint: disable=unused-argument
    """Run a project synchronously and verify its status and logged data in the FileStore."""
    active_run = mlflow.projects.run(
        TEST_PROJECT_DIR, entry_point="test_tracking",
        parameters={"use_start_run": use_start_run},
        use_conda=False, experiment_id=0)
    # A blocking run must already be finished by the time run() returns.
    validate_exit_status(active_run.get_status(), RunStatus.FINISHED)
    # wait() on an already-finished synchronous run should be a no-op
    # that leaves the status FINISHED.
    active_run.wait()
    validate_exit_status(active_run.get_status(), RunStatus.FINISHED)
    # The FileStore rooted at the tracking dir should contain exactly this run.
    store = FileStore(tmpdir.strpath)
    infos = store.list_run_infos(experiment_id=0)
    assert len(infos) == 1
    assert active_run.run_id == infos[0].run_uuid
    run = store.get_run(active_run.run_id)
    assert run.info.status == RunStatus.FINISHED
    # Params logged by the entry point must match exactly what we passed in.
    expected_params = {"use_start_run": use_start_run}
    assert len(run.data.params) == len(expected_params)
    for logged_param in run.data.params:
        assert logged_param.value == expected_params[logged_param.key]
    # Metrics logged by the entry point.
    expected_metrics = {"some_key": 3}
    for logged_metric in run.data.metrics:
        assert logged_metric.value == expected_metrics[logged_metric.key]


@pytest.mark.skip(reason="flaky running in travis")
def test_run_async(tracking_uri_mock):  # pylint: disable=unused-argument
    """Non-blocking runs report RUNNING at launch and settle to a terminal status on wait()."""
    # A well-formed async run starts out RUNNING and finishes successfully.
    good_run = mlflow.projects.run(
        TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": 2},
        use_conda=False, experiment_id=0, block=False)
    validate_exit_status(good_run.get_status(), RunStatus.RUNNING)
    good_run.wait()
    validate_exit_status(good_run.get_status(), RunStatus.FINISHED)
    # A run launched with bad parameters should end up FAILED.
    bad_run = mlflow.projects.run(
        TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": -1, "invalid-param": 30},
        use_conda=False, experiment_id=0, block=False)
    bad_run.wait()
    validate_exit_status(bad_run.get_status(), RunStatus.FAILED)


@pytest.mark.parametrize(
Expand All @@ -134,71 +188,15 @@ def test_conda_path(mock_env, expected):


@pytest.mark.skip(reason="flaky running in travis")
def test_run():
    # NOTE(review): legacy pre-fixture variant of test_run — loops over both
    # `use_start_run` values inside one test body and points the tracking URI at a
    # temp dir via mock.patch rather than a pytest fixture.
    for use_start_run in map(str, [0, 1]):
        with TempDir() as tmp, mock.patch("mlflow.tracking.get_tracking_uri")\
                as get_tracking_uri_mock:
            tmp_dir = tmp.path()
            get_tracking_uri_mock.return_value = tmp_dir
            # Launch the project's "test_tracking" entry point as a blocking run.
            submitted_run = mlflow.projects.run(
                TEST_PROJECT_DIR, entry_point="test_tracking",
                parameters={"use_start_run": use_start_run},
                use_conda=False, experiment_id=0)
            # Blocking runs should be finished when they return
            validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
            # Test that we can call wait() on a synchronous run & that the run has the correct
            # status after calling wait().
            submitted_run.wait()
            validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
            # Validate run contents in the FileStore
            run_uuid = submitted_run.run_id
            store = FileStore(tmp_dir)
            run_infos = store.list_run_infos(experiment_id=0)
            assert len(run_infos) == 1
            store_run_uuid = run_infos[0].run_uuid
            assert run_uuid == store_run_uuid
            run = store.get_run(run_uuid)
            # Params logged by the entry point must match what we passed in.
            expected_params = {"use_start_run": use_start_run}
            assert run.info.status == RunStatus.FINISHED
            assert len(run.data.params) == len(expected_params)
            for param in run.data.params:
                assert param.value == expected_params[param.key]
            # Metrics logged by the entry point.
            expected_metrics = {"some_key": 3}
            for metric in run.data.metrics:
                assert metric.value == expected_metrics[metric.key]


@pytest.mark.skip(reason="flaky running in travis")
def test_run_async():
    # NOTE(review): legacy pre-fixture variant — mocks the tracking URI by hand
    # instead of using the tracking_uri_mock pytest fixture.
    with TempDir() as tmp, mock.patch("mlflow.tracking.get_tracking_uri") as get_tracking_uri_mock:
        tmp_dir = tmp.path()
        get_tracking_uri_mock.return_value = tmp_dir
        # A valid async run reports RUNNING at launch and FINISHED after wait().
        submitted_run0 = mlflow.projects.run(
            TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": 2},
            use_conda=False, experiment_id=0, block=False)
        validate_exit_status(submitted_run0.get_status(), RunStatus.RUNNING)
        submitted_run0.wait()
        validate_exit_status(submitted_run0.get_status(), RunStatus.FINISHED)
        # A run launched with invalid parameters should end up FAILED.
        submitted_run1 = mlflow.projects.run(
            TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": -1, "invalid-param": 30},
            use_conda=False, experiment_id=0, block=False)
        submitted_run1.wait()
        validate_exit_status(submitted_run1.get_status(), RunStatus.FAILED)


@pytest.mark.skip(reason="flaky running in travis")
def test_cancel_run():
    # NOTE(review): legacy pre-fixture variant — mocks the tracking URI by hand.
    with TempDir() as tmp, mock.patch("mlflow.tracking.get_tracking_uri") as get_tracking_uri_mock:
        tmp_dir = tmp.path()
        get_tracking_uri_mock.return_value = tmp_dir
        # Launch two identical non-blocking runs, then cancel only the first.
        submitted_run0, submitted_run1 = [mlflow.projects.run(
            TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": 2},
            use_conda=False, experiment_id=0, block=False) for _ in range(2)]
        submitted_run0.cancel()
        # A cancelled run reports FAILED.
        validate_exit_status(submitted_run0.get_status(), RunStatus.FAILED)
        # Sanity check: cancelling one run has no effect on the other
        submitted_run1.wait()
        validate_exit_status(submitted_run1.get_status(), RunStatus.FINISHED)
def test_cancel_run(tracking_uri_mock):  # pylint: disable=unused-argument
    """Cancelling one async run fails it without disturbing a concurrently-launched run."""
    run_to_cancel, other_run = [mlflow.projects.run(
        TEST_PROJECT_DIR, entry_point="sleep", parameters={"duration": 2},
        use_conda=False, experiment_id=0, block=False) for _ in range(2)]
    run_to_cancel.cancel()
    # A cancelled run reports FAILED.
    validate_exit_status(run_to_cancel.get_status(), RunStatus.FAILED)
    # Sanity check: cancelling one run has no effect on the other.
    other_run.wait()
    validate_exit_status(other_run.get_status(), RunStatus.FINISHED)


def test_get_dest_dir():
Expand All @@ -212,11 +210,10 @@ def test_get_dest_dir():
os.path.abspath(TEST_PROJECT_DIR)


def test_storage_dir():
def test_storage_dir(tmpdir):
"""
Test that we correctly handle the `storage_dir` argument, which specifies where to download
distributed artifacts passed to arguments of type `path`.
"""
with TempDir() as tmp_dir:
assert os.path.dirname(mlflow.projects._get_storage_dir(tmp_dir.path())) == tmp_dir.path()
assert os.path.dirname(mlflow.projects._get_storage_dir(tmpdir.strpath)) == tmpdir.strpath
assert os.path.dirname(mlflow.projects._get_storage_dir(None)) == tempfile.gettempdir()

0 comments on commit db79c58

Please sign in to comment.