Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pipeline run tests for DVC #867

Merged
merged 3 commits into from
Jan 13, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add DVC pipeline run tests
  • Loading branch information
andycui97 committed Jan 9, 2023
commit af6f772e4bae08a92e9612ae5e917f89f493668a
38 changes: 38 additions & 0 deletions .github/workflows/dvc.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# This workflow installs Python dependencies and runs the DVC-marked pytest tests with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: DVC

# Run only when files that affect the DVC pipeline writer, its templates,
# the DVC run test, or the DVC test requirements change.
on:
  push:
    branches: [main]
    paths:
      - "lineapy/plugins/jinja_templates/*"
      - "lineapy/plugins/dvc_pipeline_writer.py"
      - "lineapy/plugins/base_pipeline_writer.py"
      - "tests/test_pipeline_run_dvc.py"
      # Kept in sync with the pull_request filter below.
      - "dvc-requirements.txt"
  pull_request:
    paths:
      - "lineapy/plugins/jinja_templates/*"
      - "lineapy/plugins/dvc_pipeline_writer.py"
      - "lineapy/plugins/base_pipeline_writer.py"
      - "tests/test_pipeline_run_dvc.py"
      - "dvc-requirements.txt"

jobs:
  dvc:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          lfs: true
          submodules: "recursive"
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML treats the version as a string, not a float.
          python-version: "3.9"
      - name: Install dependencies
        run: |
          pip install -r requirements.txt && python setup.py install && rm -rf build
      - name: Test with pytest
        # Select only the tests carrying the `dvc` marker.
        run: |
          pytest -vv -m 'dvc'
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ build-docs:

test:
make deps
docker-compose run --rm ${service_name} pytest ${args} --snapshot-update --no-cov -m "not slow and not airflow and not ray and not integration" tests/
docker-compose run --rm ${service_name} pytest ${args} --snapshot-update --no-cov -m "not slow and not airflow and not ray and not dvc and not integration" tests/
andycui97 marked this conversation as resolved.
Show resolved Hide resolved

test-github-action:
docker-compose run --rm ${service_name} pytest ${args}
Expand All @@ -64,7 +64,7 @@ test-github-action:
# Additionally, the package pg and psycopg2 should be installed in the main service.
test-parallel:
make deps
docker-compose run --rm ${service_name} pytest ${args} -n 3 --dist=loadscope --snapshot-update --no-cov -m "not (slow or airflow or ray)" tests/
docker-compose run --rm ${service_name} pytest ${args} -n 3 --dist=loadscope --snapshot-update --no-cov -m "not (slow or airflow or ray or dvc)" tests/

test-airflow:
docker-compose run --rm ${service_name}-airflow pytest ${args} --snapshot-update --no-cov -m "airflow" tests/
Expand Down
4 changes: 4 additions & 0 deletions dvc-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dvc==2.38.1
pandas
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we install the additional packages (pandas, scikit-learn and SQLAlchemy) using the pipeline-generated requirements file, as an additional virtualenv.run() command after 'git init' and 'dvc init'?

scikit-learn==1.0.2
SQLAlchemy==1.3.24
4 changes: 1 addition & 3 deletions lineapy/plugins/dvc_pipeline_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def docker_template_name(self) -> str:
return "dvc_dockerfile.jinja"

def _write_dag(self) -> None:
dag_flavor = self.dag_config.get(
"dag_flavor", "SingleStageAllSessions"
)
dag_flavor = self.dag_config.get("dag_flavor", "StagePerArtifact")
andycui97 marked this conversation as resolved.
Show resolved Hide resolved

# Check if the given DAG flavor is a supported/valid one
if dag_flavor not in DVCDagFlavor.__members__:
Expand Down
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ markers =
integration: marks tests as integration tests (deselect with '-m "not integration"')
airflow: marks tests as running airflow (deselect with '-m "not airflow"')
ray: marks tests as running ray (deselect with '-m "not ray"')
dvc: marks tests as running dvc (deselect with '-m "not dvc"')
norecursedirs =
__snapshots__
.ipynb_checkpoints
Expand Down Expand Up @@ -42,7 +43,7 @@ addopts =
--ignore-glob "tests/unit/plugins/expected/*"
--ignore-glob "tests/pipeline_*.py"
--ignore-glob ".colab/*"
-m "not airflow and not ray and not integration"
-m "not airflow and not ray and not dvc and not integration"
--nbval
--doctest-modules
xfail_strict=true
115 changes: 115 additions & 0 deletions tests/test_pipeline_run_dvc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import shutil
import subprocess
from pathlib import Path

import pytest

from lineapy.api.models.linea_artifact import get_lineaartifactdef
from lineapy.data.types import PipelineType
from lineapy.graph_reader.artifact_collection import ArtifactCollection
from lineapy.plugins.pipeline_writer_factory import PipelineWriterFactory


@pytest.mark.skipif(
    # Look git up on PATH instead of executing it: subprocess.check_call
    # raises when the binary is missing or fails, so it can never yield the
    # non-zero value a skip condition would need.
    shutil.which("git") is None,
    reason="dvc requires git to be installed",
)
@pytest.mark.dvc
@pytest.mark.slow
@pytest.mark.parametrize(
    "input_script1, input_script2, artifact_list, pipeline_name, dependencies, dag_config, input_parameters",
    [
        pytest.param(
            "housing",
            "",
            ["y", "p value"],
            "dvc_pipeline_housing_artifacts_w_dependencies",
            {"p value": {"y"}},
            {},
            [],
            id="dvc_pipeline_housing_artifacts_w_dependencies",
        ),
        pytest.param(
            "housing",
            "",
            ["y", "p value"],
            "dvc_pipeline_housing_session_w_dependencies",
            {"p value": {"y"}},
            {},
            [],
            id="dvc_pipeline_housing_session_w_dependencies",
        ),
        pytest.param(
            "simple",
            "complex",
            ["a0", "b0"],
            "script_pipeline_a0_b0_dependencies",
            {"a0": {"b0"}},
            {},
            [],
            id="dvc_two_session_w_dependencies",
        ),
    ],
)
def test_run_dvc_dag(
    virtualenv,
    tmp_path,
    linea_db,
    execute,
    input_script1,
    input_script2,
    artifact_list,
    pipeline_name,
    dependencies,
    dag_config,
    input_parameters,
):
    """
    Verifies that the dvc flavored pipeline APIs produce a working dvc DAG
    by running the DAG locally.

    One or two input scripts are executed to populate the database, the DVC
    pipeline files for ``artifact_list`` are written to ``tmp_path``, and
    ``dvc repro`` is then run there from a separate virtualenv.

    An empty ``input_script2`` means only one script is executed. Script
    paths and ``dvc-requirements.txt`` are resolved relative to the current
    working directory.
    """

    code1 = Path(
        "tests", "unit", "graph_reader", "inputs", input_script1
    ).read_text()
    execute(code1, snapshot=False)

    if input_script2 != "":
        code2 = Path(
            "tests", "unit", "graph_reader", "inputs", input_script2
        ).read_text()
        execute(code2, snapshot=False)

    # Write out pipeline files
    artifact_def_list = [get_lineaartifactdef(art) for art in artifact_list]
    artifact_collection = ArtifactCollection(
        linea_db,
        artifact_def_list,
        input_parameters=input_parameters,
        dependencies=dependencies,
    )

    # Construct pipeline writer
    pipeline_writer = PipelineWriterFactory.get(
        pipeline_type=PipelineType.DVC,
        artifact_collection=artifact_collection,
        pipeline_name=pipeline_name,
        output_dir=tmp_path,
        dag_config=dag_config,
    )
    pipeline_writer.write_pipeline_files()

    # Run dvc in new virtual env so we don't end up with version conflicts
    # with lineapy deps
    # https://github.com/man-group/pytest-plugins/tree/master/pytest-virtualenv#installing-packages
    virtualenv.run(
        "pip install -r dvc-requirements.txt", capture=False, cd="."
    )

    # dvc needs to be initialised inside a git repository.
    virtualenv.run("git init", capture=False, cd=tmp_path)
    virtualenv.run("dvc init", capture=False, cd=tmp_path)

    # This run command will error if the dag is not runnable by dvc
    out = virtualenv.run("dvc repro", capture=True, cd=tmp_path)
    print(out)