Skip to content

Commit

Permalink
✨ ✅ Validate output data and add tests (#20) (#66)
Browse files Browse the repository at this point in the history
* also validate output data and add tests

* remove convert parameters in test

* add check if dataset is already validated

---------

Co-authored-by: Manuel Spierenburg <manuel.spierenburg@sbb.ch>
  • Loading branch information
mjspier and Manuel Spierenburg authored May 27, 2024
1 parent 56d9cd1 commit dc12425
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 87 deletions.
30 changes: 26 additions & 4 deletions kedro_pandera/framework/hooks/pandera_hook.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import logging
from typing import Any, Dict, Set

from kedro.framework.context import KedroContext
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline.node import Node
from pandera.errors import SchemaError

from kedro_pandera.framework.config.resolvers import (
Expand All @@ -22,6 +25,9 @@


class PanderaHook:
def __init__(self) -> None:
    """Create the hook with an empty record of already-validated datasets."""
    # Names of datasets whose pandera schema validation has already succeeded.
    # The validation helper checks this set so that a dataset is not validated
    # a second time (e.g. once as a node output and again as a later input).
    # NOTE(review): nothing in the code visible here clears this set, and the
    # hook is instantiated once at module level, so entries may persist across
    # several runs in the same process — confirm this is intended.
    already_validated: Set[str] = set()
    self._validated_datasets = already_validated

@property
def _logger(self) -> logging.Logger:
    """Return the logger named after this module (``__name__``)."""
    module_logger = logging.getLogger(__name__)
    return module_logger
Expand All @@ -45,17 +51,18 @@ def after_context_created(
}
)

@hook_impl
def before_node_run( # noqa : PLR0913
self, node, catalog, inputs, is_async, session_id
def _validate_datasets(
self, node: Node, catalog: DataCatalog, datasets: Dict[str, Any]
):
for name, data in inputs.items():
for name, data in datasets.items():
if (
catalog._datasets[name].metadata is not None
and "pandera" in catalog._datasets[name].metadata
and name not in self._validated_datasets
):
try:
catalog._datasets[name].metadata["pandera"]["schema"].validate(data)
self._validated_datasets.add(name)
except SchemaError as err:
self._logger.error(
f"Dataset '{name}' pandera validation failed before running '{node.name}', see details in the error message. "
Expand All @@ -71,5 +78,20 @@ def before_node_run( # noqa : PLR0913
f"(kedro-pandera) Dataset '{name}' was successfully validated with pandera"
)

@hook_impl
def before_node_run(  # noqa : PLR0913
    self,
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async,
    session_id,
):
    """Validate a node's input datasets with their pandera schemas.

    Kedro ``before_node_run`` hook implementation. All the work is delegated
    to ``_validate_datasets``.

    Args:
        node: The node the inputs belong to; its name is used in log output.
        catalog: Catalog whose dataset metadata may carry a ``"pandera"`` entry.
        inputs: Mapping of dataset name to the loaded data.
        is_async: Part of the hook signature; not used here.
        session_id: Part of the hook signature; not used here.
    """
    self._validate_datasets(node=node, catalog=catalog, datasets=inputs)

@hook_impl
def after_node_run(self, node: Node, catalog: DataCatalog, outputs: Dict[str, Any]):
    """Validate a node's output datasets with their pandera schemas.

    Kedro ``after_node_run`` hook implementation. All the work is delegated
    to ``_validate_datasets``, the same helper used for node inputs.

    Args:
        node: The node that produced the outputs; its name is used in log output.
        catalog: Catalog whose dataset metadata may carry a ``"pandera"`` entry.
        outputs: Mapping of dataset name to the data the node returned.
    """
    # NOTE(review): the shared helper's error message says validation failed
    # "before running" the node, which reads oddly when it is reached from
    # this after-run hook — consider making that wording direction-aware.
    self._validate_datasets(node=node, catalog=catalog, datasets=outputs)


# Module-level PanderaHook instance: one shared object, so its record of
# validated datasets is shared by every user of this name.
# NOTE(review): presumably this is the object registered with Kedro as the
# plugin hook — confirm against the package's entry-point configuration.
pandera_hook = PanderaHook()
157 changes: 78 additions & 79 deletions tests/data/iris_schema.yml
Original file line number Diff line number Diff line change
@@ -1,85 +1,84 @@
_example_iris_data_schema:
schema_type: dataframe
version: 0.18.3
columns:
sepal_length:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 4.3
less_than_or_equal_to: 7.9
unique: false
coerce: false
required: true
regex: false
sepal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 2.0
less_than_or_equal_to: 4.4
unique: false
coerce: false
required: true
regex: false
petal_length:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 1.0
less_than_or_equal_to: 6.9
unique: false
coerce: false
required: true
regex: false
petal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 0.1
less_than_or_equal_to: 2.5
unique: false
coerce: false
required: true
regex: false
species:
title: null
description: null
dtype: object
nullable: false
checks: null
unique: false
coerce: false
required: true
regex: false
checks: null
index:
- title: null
schema_type: dataframe
version: 0.18.3
columns:
sepal_length:
title: null
description: null
dtype: int64
dtype: float32
nullable: false
checks:
greater_than_or_equal_to: 0.0
less_than_or_equal_to: 149.0
name: null
greater_than_or_equal_to: 4.3
less_than_or_equal_to: 7.9
unique: false
coerce: false
dtype: null
coerce: true
strict: false
name: null
ordered: false
unique: null
report_duplicates: all
unique_column_names: false
add_missing_columns: false
title: null
required: true
regex: false
sepal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 2.0
less_than_or_equal_to: 4.4
unique: false
coerce: false
required: true
regex: false
petal_length:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 1.0
less_than_or_equal_to: 6.9
unique: false
coerce: false
required: true
regex: false
petal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 0.1
less_than_or_equal_to: 2.5
unique: false
coerce: false
required: true
regex: false
species:
title: null
description: null
dtype: string
nullable: false
checks: null
unique: false
coerce: false
required: true
regex: false
checks: null
index:
- title: null
description: null
dtype: int64
nullable: false
checks:
greater_than_or_equal_to: 0.0
less_than_or_equal_to: 149.0
name: null
unique: false
coerce: false
dtype: null
coerce: true
strict: false
name: null
ordered: false
unique: null
report_duplicates: all
unique_column_names: false
add_missing_columns: false
title: null
description: null
85 changes: 85 additions & 0 deletions tests/data/iris_schema_fail.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Serialized dataframe schema for the iris columns (sepal/petal measurements
# plus species) with an int64 index constrained to 0..149.
# NOTE(review): judging by the file name, this fixture is presumably meant to
# make validation FAIL in tests — confirm against the test that loads it.
schema_type: dataframe
version: 0.18.3
columns:
sepal_length:
title: null
description: null
dtype: float32
nullable: false
checks:
greater_than_or_equal_to: 4.3
less_than_or_equal_to: 7.9
unique: false
coerce: false
required: true
regex: false
sepal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 2.0
less_than_or_equal_to: 4.4
unique: false
coerce: false
required: true
regex: false
petal_length:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 1.0
less_than_or_equal_to: 6.9
unique: false
coerce: false
required: true
regex: false
petal_width:
title: null
description: null
dtype: float64
nullable: false
checks:
greater_than_or_equal_to: 0.1
less_than_or_equal_to: 2.5
unique: false
coerce: false
required: true
regex: false
species:
title: null
description: null
dtype: string
nullable: false
checks:
# NOTE(review): "satosa" looks like a deliberate misspelling of "setosa" so
# that the isin check rejects real iris rows — do not "fix" without confirming.
isin: ["satosa", "versicolor", "virginica"]
unique: false
coerce: false
required: true
regex: false
checks: null
index:
- title: null
description: null
dtype: int64
nullable: false
checks:
greater_than_or_equal_to: 0.0
less_than_or_equal_to: 149.0
name: null
unique: false
coerce: false
dtype: null
coerce: true
strict: false
name: null
ordered: false
unique: null
report_duplicates: all
unique_column_names: false
add_missing_columns: false
title: null
description: null
Loading

0 comments on commit dc12425

Please sign in to comment.