diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000000..c9346bb1f6 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,23 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.8" + nodejs: "19" + +sphinx: + builder: html + configuration: docs/source/conf.py + fail_on_warning: true + +python: + install: + - method: pip + path: package/ + extra_requirements: + - docs diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..d0c3cbf102 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..747ffb7b30 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/_templates/autosummary/base.rst b/docs/source/_templates/autosummary/base.rst new file mode 100644 index 0000000000..b7556ebf7b --- /dev/null +++ b/docs/source/_templates/autosummary/base.rst @@ -0,0 +1,5 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. auto{{ objtype }}:: {{ objname }} diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst new file mode 100644 index 0000000000..10c8ff8bec --- /dev/null +++ b/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,32 @@ +{{ fullname | escape | underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :undoc-members: + :inherited-members: + + {% block attributes %} + {% if attributes %} + .. rubric:: Attributes + + .. autosummary:: + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block methods %} + {% if methods %} + .. rubric:: Methods + + .. autosummary:: + {% for item in all_methods %} + {%- if not item.startswith('_') %} + ~{{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/docs/source/_templates/autosummary/module.rst b/docs/source/_templates/autosummary/module.rst new file mode 100644 index 0000000000..a496ca3f5f --- /dev/null +++ b/docs/source/_templates/autosummary/module.rst @@ -0,0 +1,56 @@ +{{ fullname | escape | underline }} + +.. rubric:: Description + +.. automodule:: {{ fullname }} + + {% block functions %} + {% if functions %} + .. rubric:: Functions + + .. 
autosummary:: + :toctree: + {% for item in functions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block classes %} + {% if classes %} + .. rubric:: Classes + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + {% for item in classes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block exceptions %} + {% if exceptions %} + .. rubric:: Exceptions + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + {% for item in exceptions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + +{% block modules %} +{% if modules %} +.. rubric:: Modules + +.. autosummary:: + :toctree: + :recursive: +{% for item in modules %} + {{ item }} +{%- endfor %} +{% endif %} +{% endblock %} diff --git a/docs/source/_templates/breadcrumbs.html b/docs/source/_templates/breadcrumbs.html new file mode 100644 index 0000000000..49fa4779f4 --- /dev/null +++ b/docs/source/_templates/breadcrumbs.html @@ -0,0 +1,94 @@ +{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #} + +{% if page_source_suffix %} +{% set suffix = page_source_suffix %} +{% else %} +{% set suffix = source_suffix %} +{% endif %} + +{# modification to enable custom github_url #} + +{% if meta is not defined or meta is none %} + {% set meta = {} %} +{% endif %} + +{% if github_url is defined %} + {% set _dummy = meta.update({'github_url': github_url}) %} +{% endif %} + +{# // modification to enable custom github_url #} + +{% if meta is defined and meta is not none %} +{% set check_meta = True %} +{% else %} +{% set check_meta = False %} +{% endif %} + +{% if check_meta and 'github_url' in meta %} +{% set display_github = True %} +{% endif %} + +{% if check_meta and 'bitbucket_url' in meta %} +{% set display_bitbucket = True %} +{% endif %} + +{% if check_meta and 'gitlab_url' in meta %} +{% set display_gitlab = True %} +{% endif %} + +
+ + + + {% if (theme_prev_next_buttons_location == 'top' or theme_prev_next_buttons_location == 'both') and (next or prev) %} + + {% endif %} +
+
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html new file mode 100644 index 0000000000..f4cf55a471 --- /dev/null +++ b/docs/source/_templates/layout.html @@ -0,0 +1,45 @@ +{% extends "!layout.html" %} {%- block extrabody %} + +
+ + +
+ Kedro + Kedro-Viz + Kedro-Datasets +
+ + {%- include "searchbox.html" %} +
+ +{% endblock %} + +{%- block extrahead %} + + + +{% endblock %} diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000000..79b74a3721 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,87 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + + +from __future__ import annotations + +import importlib +import os +import re +import sys +from inspect import getmembers, isclass, isfunction +from pathlib import Path + +from click import secho, style + +from kedro_viz import __version__ as release + +# -- Project information ----------------------------------------------------- + +project = "kedro-viz" +author = "kedro-viz" + +# The short X.Y version. +version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.intersphinx", + "sphinx_copybutton", + "myst_parser", + "notfound.extension", + "sphinxcontrib.jquery", +] + + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"] +html_static_path = ["_static"] + +exclude_patterns = [] +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +myst_heading_anchors = 2 + +intersphinx_mapping = { + "kedro-datasets": ("https://docs.kedro.org/projects/kedro-datasets/en/kedro-datasets-1.7.1/", None), +} + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + + +# Theme options are theme-specific and customise the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = {"collapse_navigation": False, "style_external_links": True} + +# Removes, from all docs, the copyright footer. +html_show_copyright = False + +html_context = { + "display_github": True, + "github_url": "https://github.com/kedro-org/kedro/tree/main/docs/source", +} + + +def setup(app): + app.add_css_file("css/qb1-sphinx-rtd.css") + # fix a bug with table wraps in Read the Docs Sphinx theme: + # https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html + app.add_css_file("css/theme-overrides.css") diff --git a/docs/source/experiment_tracking.md b/docs/source/experiment_tracking.md new file mode 100644 index 0000000000..fe4825f342 --- /dev/null +++ b/docs/source/experiment_tracking.md @@ -0,0 +1,354 @@ +# Experiment tracking in Kedro-Viz + + +Experiment tracking is the process of saving all the metadata related to an experiment each time you run it. It enables you to compare different runs of a machine-learning model as part of the experimentation process. 
+ +The metadata you store may include: + +* Scripts used for running the experiment +* Environment configuration files +* Versions of the data used for training and evaluation +* Evaluation metrics +* Model weights +* Plots and other visualisations + +You can use Kedro-Viz experiment tracking to store and access results, and to share them with others for comparison. Storage can be local or remote, such as cloud storage on AWS S3. + +Kedro's [experiment tracking demo](https://demo.kedro.org/experiment-tracking) enables you to explore the experiment tracking capabilities of Kedro-Viz. + +![](./images/experiment-tracking_demo.gif) + +## Kedro versions supporting experiment tracking +Kedro has always supported parameter versioning (as part of your codebase with a version control system like `git`) and Kedro’s dataset versioning capabilities enabled you to [snapshot models, datasets and plots](https://docs.kedro.org/en/stable/data/data_catalog.html#dataset-versioning). + +Kedro-Viz version 4.1.1 introduced metadata capture, visualisation, discovery and comparison, enabling you to access, edit and [compare your experiments](#access-run-data-and-compare-runs) and additionally [track how your metrics change over time](#view-and-compare-metrics-data). + +Kedro-Viz version 5.0 also supports the [display and comparison of plots, such as Plotly and Matplotlib](./visualise_charts_with_plotly.md). Support for metric plots (timeseries and parallel coords) was added to Kedro-Viz version 5.2.1. + +Kedro-Viz version 6.2 includes support for collaborative experiment tracking using a cloud storage solution. This means that multiple users can store their experiment data in a centralized remote storage, such as AWS S3, and access it through Kedro-Viz. + +## When should I use experiment tracking in Kedro? 
+ +The choice of experiment tracking tool depends on your use case and choice of complementary tools, such as MLflow and Neptune: + +- **Kedro** - If you need experiment tracking, are looking for improved metrics visualisation and want a lightweight tool to work alongside existing functionality in Kedro. Kedro does not support a model registry. +- **MLflow** - You can combine MLflow with Kedro by using [`kedro-mlflow`](https://kedro-mlflow.readthedocs.io/en/stable/) if you require experiment tracking, model registry and/or model serving capabilities or have access to Managed MLflow within the Databricks ecosystem. +- **Neptune** - If you require experiment tracking and model registry functionality, improved visualisation of metrics and support for collaborative data science, you may consider [`kedro-neptune`](https://docs.neptune.ai/integrations/kedro/) for your workflow. + +[We support a growing list of integrations](https://docs.kedro.org/en/stable/extend_kedro/plugins.html). + +## Set up a project + +This section describes the steps necessary to set up experiment tracking and access logged metrics, using the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) with a version of Kedro equal to or higher than 0.18.4, and a version of Kedro-Viz equal to or higher than 5.2. + +There are three steps to enable experiment tracking features with Kedro-Viz. We illustrate how to: + +- [Set up a session store to capture experiment metadata](#set-up-the-session-store) +- [Set up experiment tracking datasets to list the metrics to track](#set-up-experiment-tracking-datasets) +- [Modify your nodes and pipelines to output those metrics](#modify-your-nodes-and-pipelines-to-log-metrics) + +### Install Kedro and Kedro-Viz +To use this tutorial code, you must already have [installed Kedro](https://docs.kedro.org/en/stable/get_started/install.html) and [Kedro-Viz](./kedro-viz_visualisation.md). 
You can confirm the versions you have installed by running `kedro info` + +```{note} +The example code uses a version of Kedro-Viz `>6.2.0`. +``` + +Create a new project using the spaceflights starter. From the terminal run: + +```bash +kedro new --starter=spaceflights +``` + +Feel free to name your project as you like, but this guide assumes the project is named `Spaceflights`. + +### Install the dependencies for the project + +Once you have created the project, to run project-specific Kedro commands, you must navigate to the directory in which it has been created: + +```bash +cd spaceflights +``` +Install the project's dependencies: + +```bash +pip install -r src/requirements.txt +``` + +## Set up the session store + +In the domain of experiment tracking, each pipeline run is considered a session. A session store records all related metadata for each pipeline run, from logged metrics to other run-related data such as timestamp, `git` username and branch. The session store is a [SQLite](https://www.sqlite.org/index.html) database that is generated during your first pipeline run after it has been set up in your project. + +### Local storage +To set up the session store locally, go to the `src/spaceflights/settings.py` file and add the following: + +```python +from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore +from pathlib import Path + +SESSION_STORE_CLASS = SQLiteStore +SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")} +``` + +This specifies the creation of the `SQLiteStore` under the `data` subfolder, using the `SQLiteStore` setup from your installed Kedro-Viz plugin + +This step is crucial to enable experiment tracking features on Kedro-Viz, as it is the database used to serve all run data to the Kedro-Viz front-end. 
Once this step is complete, you can either proceed to [set up the tracking datasets](#set-up-experiment-tracking-datasets) or [set up your nodes and pipelines to log metrics](#modify-your-nodes-and-pipelines-to-log-metrics); these two activities are interchangeable, but both should be completed to get a working experiment tracking setup. + + +## Collaborative experiment tracking + +```{note} +To use collaborative experiment tracking, ensure that your installed version of Kedro-Viz is `>=6.2.0`. +``` + +For collaborative experiment tracking, Kedro-Viz saves your experiments as SQLite database files on a central cloud storage. To ensure that all users have a unique filename, set up your `KEDRO_SQLITE_STORE_USERNAME` in the environment variables. By default, Kedro-Viz will take your computer user name if this is not specified. + +> Note: In Kedro-Viz version 6.2, the only way to set up credentials for accessing your cloud storage is through environment variables. + +```bash +export KEDRO_SQLITE_STORE_USERNAME="your_unique__username" + +``` + +Now specify a remote path in the `SESSION_STORE_ARGS` variable, which links to your cloud storage. + + +```python +from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore +from pathlib import Path + +SESSION_STORE_CLASS = SQLiteStore +SESSION_STORE_ARGS = { + "path": str(Path(__file__).parents[2] / "data"), + "remote_path": "s3://my-bucket-name/path/to/experiments", +} +``` + +Finally, ensure you have the necessary credentials set up as shown below: + +```bash +export AWS_ACCESS_KEY_ID="your_access_key_id" +export AWS_SECRET_ACCESS_KEY="your_secret_access_key" +export AWS_REGION="your_aws_region" + +``` + +## Set up experiment tracking datasets + +There are two types of tracking datasets: {py:class}`tracking.MetricsDataset ` and {py:class}`tracking.JSONDataset `. 
The `tracking.MetricsDataset` should be used for tracking numerical metrics, and the `tracking.JSONDataset` can be used for tracking any other JSON-compatible data like boolean or text-based data. + +Set up two datasets to log the columns used in the companies dataset (`companies_columns`) and experiment metrics for the data science pipeline (`metrics`) like the coefficient of determination (`r2 score`), max error (`me`) and mean absolute error (`mae`) by adding the following in the `conf/base/catalog.yml` file: + +```yaml +metrics: + type: tracking.MetricsDataSet + filepath: data/09_tracking/metrics.json + +companies_columns: + type: tracking.JSONDataSet + filepath: data/09_tracking/companies_columns.json +``` + +## Modify your nodes and pipelines to log metrics + +Now that you have set up the tracking datasets to log experiment tracking data, next ensure that the data is returned from your nodes. + +Set up the data to be logged for the metrics dataset - under `nodes.py` of your `data_science` pipeline (`src/spaceflights/pipelines/data_science/nodes.py`), add three different metrics to your `evaluate_model` function to log `r2_score`, `mae` and `me` and return these 3 metrics as key-value pairs. + +The new `evaluate_model` function should look like this: + +```python +from sklearn.metrics import mean_absolute_error, max_error + + +def evaluate_model( + regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series +) -> Dict[str, float]: + """Calculates and logs the coefficient of determination. + + Args: + regressor: Trained model. + X_test: Testing data of independent features. + y_test: Testing data for price. 
+ """ + y_pred = regressor.predict(X_test) + score = r2_score(y_test, y_pred) + mae = mean_absolute_error(y_test, y_pred) + me = max_error(y_test, y_pred) + logger = logging.getLogger(__name__) + logger.info("Model has a coefficient R^2 of %.3f on test data.", score) + return {"r2_score": score, "mae": mae, "max_error": me} +``` + +Next, ensure that the dataset is also specified as an output of your `evaluate_model` node. In the `src/spaceflights/pipelines/data_science/pipeline.py` file, specify the `output` of your `evaluate_model` to be the `metrics` dataset. Note that the output dataset must exactly match the name of the tracking dataset specified in the catalog file. + +The node of the `evaluate_model` on the pipeline should look like this: + +```python +node( + func=evaluate_model, + inputs=["regressor", "X_test", "y_test"], + name="evaluate_model_node", + outputs="metrics", +) +``` + +Repeat the same steps to set up the `companies_column` dataset. For this dataset, log the column that contains the list of companies as outlined in the `companies.csv` file under the `data/01_raw` directory. Modify the `preprocess_companies` node under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/nodes.py`) to return the data under a key-value pair, as shown below: + +```python +from typing import Tuple, Dict + + +def preprocess_companies(companies: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]: + """Preprocesses the data for companies. + + Args: + companies: Raw data. + Returns: + Preprocessed data, with `company_rating` converted to a float and + `iata_approved` converted to boolean. 
+ """ + companies["iata_approved"] = _is_true(companies["iata_approved"]) + companies["company_rating"] = _parse_percentage(companies["company_rating"]) + return companies, {"columns": companies.columns.tolist(), "data_type": "companies"} +``` + +Again, you must ensure that the dataset is also specified as an output on the `pipeline.py` file under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`), as follows: + +```python +node( + func=preprocess_companies, + inputs="companies", + outputs=["preprocessed_companies", "companies_columns"], + name="preprocess_companies_node", +) +``` + +Having set up both datasets, you can now generate your first set of experiment tracking data! + +## Generate the run data + +The beauty of native experiment tracking in Kedro is that all tracked data is generated and stored each time you do a Kedro run. Hence, to generate the data, you need only execute: + +```bash +kedro run +``` + +After the run completes, under `data/09_tracking`, you can now see two folders, `companies_columns.json` and `metrics.json`. On performing a pipeline run after setting up the tracking datasets, Kedro generates a folder with the dataset name for each tracked dataset. Each folder of the tracked dataset contains folders named by the timestamp of each pipeline run to store the saved metrics of the dataset, and each future pipeline run generates a new timestamp folder with the JSON file of the saved metrics under the folder of its subsequent tracked dataset. + +You can also see the `session_store.db` generated from your first pipeline run after enabling experiment tracking, which is used to store all the generated run metadata, alongside the tracking dataset, to be used for exposing experiment tracking to Kedro-Viz. + +![](./images/experiment-tracking-folder.png) + +Execute `kedro run` a few times in a row to generate a larger set of experiment data. 
You can also play around with setting up different tracking datasets, and check the logged data via the generated JSON data files. + +## Access run data and compare runs + +Here comes the fun part of accessing your run data on Kedro-Viz. Having generated some run data, execute the following command: + +```bash +kedro viz +``` + +When you open the Kedro-Viz web app, you see an experiment tracking icon on the left-hand side of the screen. + +![](./images/experiment-tracking-icon.png) + +Click the icon to go to the experiment tracking page (you can also access the page from your browser at `http://127.0.0.1:4141/experiment-tracking`), where you can see the sets of experiment data generated from all previous runs: + +![](./images/experiment-tracking-runs-list.png) + +You can now access, compare and pin your runs by toggling the `Compare runs` button: + +![](./images/experiment-tracking-compare-runs.png) + +## View and compare plots + +In this section, we illustrate how to compare Matplotlib plots across experimental runs (functionality available since Kedro-Viz version 5.0). 
+ +### Update the dependencies + +Update the `src/requirements.txt` file in your Kedro project by adding the following dataset to enable Matplotlib for your project: + +```text +kedro-datasets[matplotlib.MatplotlibWriter]~=1.1 +seaborn~=0.12.1 +``` + +And install the requirements with: + +```bash +pip install -r src/requirements.txt +``` + +### Add a plotting node + +Add a new node to the `data_processing` nodes (`src/spaceflights/pipelines/data_processing/nodes.py`): + +```python +import matplotlib.pyplot as plt +import seaborn as sn + + +def create_confusion_matrix(companies: pd.DataFrame): + actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1] + predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1] + data = {"y_Actual": actuals, "y_Predicted": predicted} + df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"]) + confusion_matrix = pd.crosstab( + df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"] + ) + sn.heatmap(confusion_matrix, annot=True) + return plt +``` + +And now add this node to the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`): + +```python +from .nodes import create_confusion_matrix + +node( + func=create_confusion_matrix, + inputs="companies", + outputs="confusion_matrix", +), +``` + +In the catalog (`conf/base/catalog.yml`) add the `confusion_matrix` data definition, making sure to set the versioned flag to `true` within the project catalog to include the plot in experiment tracking: + +```yaml +confusion_matrix: + type: matplotlib.MatplotlibWriter + filepath: data/09_tracking/confusion_matrix.png + versioned: true +``` + +After running the pipeline with `kedro run`, the plot is saved and you can see it in the experiment tracking panel when you execute `kedro viz`. Clicking on a plot expands it. When in comparison view, expanding a plot shows all the plots in that view for side-by-side comparison. 
+ +![](./images/experiment-tracking-plots-comparison.png) + +![](./images/experiment-tracking-plots-comparison-expanded.png) + +## View and compare metrics data + +From Kedro-Viz `>=5.2.1` experiment tracking also supports the display and comparison of metrics data through two chart types: time series and parallel coordinates. + +Time series displays one metric per graph, showing how the metric value has changed over time. + +Parallel coordinates displays all metrics on a single graph, with each vertical line representing one metric with its own scale. The metric values are positioned along those vertical lines and connected across each axis. + +When in comparison view, comparing runs highlights your selections on the respective chart types, improving readability even in the event there is a multitude of data points. + +```{note} +The following graphic is taken from the [Kedro-Viz experiment tracking demo](https://demo.kedro.org/experiment-tracking) (it is not a visualisation from the example code you created above). +``` + +![](./images/experiment-tracking-metrics-comparison.gif) + +Additionally, you can monitor the changes to metrics over time from the pipeline visualisation tab which you can access by following the icon on the left-hand side of the screen. 
+ +![](./images/pipeline_visualisation_icon.png) + +Clicking on any `MetricsDataset` node opens a side panel displaying how the metric value has changed over time: + +![](./images/pipeline_show_metrics.gif) diff --git a/docs/source/images/autoreload.gif b/docs/source/images/autoreload.gif new file mode 100644 index 0000000000..6f29479ceb Binary files /dev/null and b/docs/source/images/autoreload.gif differ diff --git a/docs/source/images/chart-icon.png b/docs/source/images/chart-icon.png new file mode 100644 index 0000000000..0fd88f028d Binary files /dev/null and b/docs/source/images/chart-icon.png differ diff --git a/docs/source/images/experiment-tracking-compare-runs.png b/docs/source/images/experiment-tracking-compare-runs.png new file mode 100644 index 0000000000..747c832bfb Binary files /dev/null and b/docs/source/images/experiment-tracking-compare-runs.png differ diff --git a/docs/source/images/experiment-tracking-folder.png b/docs/source/images/experiment-tracking-folder.png new file mode 100644 index 0000000000..40c0c5f28c Binary files /dev/null and b/docs/source/images/experiment-tracking-folder.png differ diff --git a/docs/source/images/experiment-tracking-icon.png b/docs/source/images/experiment-tracking-icon.png new file mode 100644 index 0000000000..3990a4782d Binary files /dev/null and b/docs/source/images/experiment-tracking-icon.png differ diff --git a/docs/source/images/experiment-tracking-metrics-comparison.gif b/docs/source/images/experiment-tracking-metrics-comparison.gif new file mode 100644 index 0000000000..db868015a3 Binary files /dev/null and b/docs/source/images/experiment-tracking-metrics-comparison.gif differ diff --git a/docs/source/images/experiment-tracking-plots-comparison-expanded.png b/docs/source/images/experiment-tracking-plots-comparison-expanded.png new file mode 100644 index 0000000000..7635d5e598 Binary files /dev/null and b/docs/source/images/experiment-tracking-plots-comparison-expanded.png differ diff --git 
a/docs/source/images/experiment-tracking-plots-comparison.png b/docs/source/images/experiment-tracking-plots-comparison.png new file mode 100644 index 0000000000..f61e17cca2 Binary files /dev/null and b/docs/source/images/experiment-tracking-plots-comparison.png differ diff --git a/docs/source/images/experiment-tracking-runs-list.png b/docs/source/images/experiment-tracking-runs-list.png new file mode 100644 index 0000000000..33e4600b4e Binary files /dev/null and b/docs/source/images/experiment-tracking-runs-list.png differ diff --git a/docs/source/images/experiment-tracking_demo.gif b/docs/source/images/experiment-tracking_demo.gif new file mode 100644 index 0000000000..2a59c81a6a Binary files /dev/null and b/docs/source/images/experiment-tracking_demo.gif differ diff --git a/docs/source/images/kedro-publish-share.gif b/docs/source/images/kedro-publish-share.gif new file mode 100644 index 0000000000..fc0fb6b03a Binary files /dev/null and b/docs/source/images/kedro-publish-share.gif differ diff --git a/docs/source/images/kedro_viz_autoreload.gif b/docs/source/images/kedro_viz_autoreload.gif new file mode 100644 index 0000000000..52754b73bb Binary files /dev/null and b/docs/source/images/kedro_viz_autoreload.gif differ diff --git a/docs/source/images/pipeline_show_metrics.gif b/docs/source/images/pipeline_show_metrics.gif new file mode 100644 index 0000000000..57be22618b Binary files /dev/null and b/docs/source/images/pipeline_show_metrics.gif differ diff --git a/docs/source/images/pipeline_visualisation.png b/docs/source/images/pipeline_visualisation.png new file mode 100644 index 0000000000..42cddbf12b Binary files /dev/null and b/docs/source/images/pipeline_visualisation.png differ diff --git a/docs/source/images/pipeline_visualisation_icon.png b/docs/source/images/pipeline_visualisation_icon.png new file mode 100644 index 0000000000..c63f489c29 Binary files /dev/null and b/docs/source/images/pipeline_visualisation_icon.png differ diff --git 
a/docs/source/images/pipeline_visualisation_matplotlib.png b/docs/source/images/pipeline_visualisation_matplotlib.png new file mode 100644 index 0000000000..fb78d51dfb Binary files /dev/null and b/docs/source/images/pipeline_visualisation_matplotlib.png differ diff --git a/docs/source/images/pipeline_visualisation_matplotlib_expand.png b/docs/source/images/pipeline_visualisation_matplotlib_expand.png new file mode 100644 index 0000000000..bbc0ce121b Binary files /dev/null and b/docs/source/images/pipeline_visualisation_matplotlib_expand.png differ diff --git a/docs/source/images/pipeline_visualisation_plotly_1.png b/docs/source/images/pipeline_visualisation_plotly_1.png new file mode 100644 index 0000000000..de33027d79 Binary files /dev/null and b/docs/source/images/pipeline_visualisation_plotly_1.png differ diff --git a/docs/source/images/pipeline_visualisation_plotly_expand_1.png b/docs/source/images/pipeline_visualisation_plotly_expand_1.png new file mode 100644 index 0000000000..f9b8fc6a40 Binary files /dev/null and b/docs/source/images/pipeline_visualisation_plotly_expand_1.png differ diff --git a/docs/source/images/pipeline_visualisation_with_layers.png b/docs/source/images/pipeline_visualisation_with_layers.png new file mode 100644 index 0000000000..45ff7438ca Binary files /dev/null and b/docs/source/images/pipeline_visualisation_with_layers.png differ diff --git a/docs/source/images/preview_datasets_expanded.png b/docs/source/images/preview_datasets_expanded.png new file mode 100644 index 0000000000..fdf1f4ed49 Binary files /dev/null and b/docs/source/images/preview_datasets_expanded.png differ diff --git a/docs/source/images/preview_datasets_metadata.png b/docs/source/images/preview_datasets_metadata.png new file mode 100644 index 0000000000..429f0eb6cf Binary files /dev/null and b/docs/source/images/preview_datasets_metadata.png differ diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000000..caaa0b1877 --- /dev/null 
+++ b/docs/source/index.md @@ -0,0 +1,35 @@ +```{image} https://raw.githubusercontent.com/kedro-org/kedro/main/static/img/kedro_banner.png +:alt: Kedro logo +:class: kedro-logo +``` + +# Welcome to Kedro-Viz documentation! + + +Kedro-Viz is an interactive development tool for building data science pipelines with [Kedro](https://github.com/kedro-org/kedro). Kedro-Viz also allows users to view and compare different runs in the Kedro project. + +Kedro-Viz features include: + +✨ Complete visualisation of a Kedro project and its pipelines. +🎨 Support for light & dark themes out of the box. +πŸš€ Scalable to big pipelines with hundreds of nodes. +πŸ”Ž Highly interactive, filterable and searchable. +πŸ”¬ Focus mode for modular pipeline visualisation. +🎨 Rich metadata side panel to display parameters, plots, etc. +πŸ“Š Support for all types of [Plotly charts](https://plotly.com/javascript/). +♻️ Autoreload on code change. +πŸ§ͺ Support for experiment tracking and comparing runs in a Kedro project. + + + +```{toctree} +:caption: Learn about Kedro-Viz +:maxdepth: 2 + +kedro-viz_visualisation +share_kedro_viz +preview_datasets +visualise_charts_with_plotly +visualise_charts_with_matplotlib +experiment_tracking +``` diff --git a/docs/source/kedro-viz_visualisation.md b/docs/source/kedro-viz_visualisation.md new file mode 100644 index 0000000000..b382a96cb6 --- /dev/null +++ b/docs/source/kedro-viz_visualisation.md @@ -0,0 +1,166 @@ +# Visualise the spaceflights project + + +This section assumes you are familiar with the basic Kedro concepts described in the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html). If you have not yet worked through the tutorial, you can still follow this example. + +If you haven't installed Kedro [follow the documentation to get set up](https://docs.kedro.org/en/stable/get_started/install.html). + +Then, in your terminal window, navigate to the folder you want to store the project. 
+ +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project and install the dependencies for the project, which include Kedro-Viz: + +```bash +pip install -r src/requirements.txt +``` + +The next step is optional, but useful to check that all is working. Run the full set of pipelines for the tutorial project: + +```bash +kedro run +``` + +To start Kedro-Viz, type the following into your terminal from the project directory: + +```bash +kedro viz +``` + +The command opens a browser tab to serve the visualisation at `http://127.0.0.1:4141/`. + +You should see the following: + +![](./images/pipeline_visualisation.png) + +If a visualisation panel opens up and a pipeline is not visible, refresh the view, and check that your tutorial project code is complete if you've not generated it from the starter template. If you still don't see the visualisation, the Kedro community can help: + +* use the [#questions channel](https://slack.kedro.org/) on our Slack channel to ask the community for help +* search the [searchable archive of Slack discussions](https://linen-slack.kedro.org/) + +To exit the visualisation, close the browser tab. To regain control of the terminal, enter `^+c` on Mac or `Ctrl+c` on Windows or Linux machines. + +## Automatic visualisation updates + +You can use the `--autoreload` flag to autoreload Kedro-Viz when a `Python` or `YAML` file changes in the project. Add the flag to the command you use to start Kedro-Viz: + +```bash +kedro viz --autoreload +``` + +![](./images/kedro_viz_autoreload.gif) + +The `autoreload` flag reflects changes to the project as they happen. 
For example, commenting out `create_model_input_table_node` in `pipeline.py` will trigger a re-render of the pipeline: + +![](./images/autoreload.gif) + +## Visualise layers + +By convention, a [pipeline can be defined as having different layers](https://docs.kedro.org/en/stable/resources/glossary.html#layers-data-engineering-convention) according to how data is processed, which makes it easier to collaborate. + +For example, the [data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71) labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). + +In Kedro version 0.18.9 we changed the way layers are defined in the Data Catalog. The definition is now included under the `metadata` key for `kedro-viz` (previously it was an attribute specified within a dataset's definition). + +Here's an example of how to use the Kedro-Viz metadata to define layers: + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw +``` + +In earlier versions of Kedro, layers were specified within a dataset's definition in the Data Catalog, but this will **no longer be supported** from Kedro version 0.19.0. From that version onwards, your `catalog.yml` must specify layers as metadata. 
+ +```diff +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv +- layer: raw ++ metadata: ++ kedro-viz: ++ layer: raw +``` + +Open `catalog.yml` for the completed spaceflights tutorial and define layers in the following way: + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw + +reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + metadata: + kedro-viz: + layer: raw + +shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + metadata: + kedro-viz: + layer: raw + +preprocessed_companies: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_companies.pq + metadata: + kedro-viz: + layer: intermediate + +preprocessed_shuttles: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_shuttles.pq + metadata: + kedro-viz: + layer: intermediate + +model_input_table: + type: pandas.ParquetDataSet + filepath: data/03_primary/model_input_table.pq + metadata: + kedro-viz: + layer: primary + +regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor.pickle + versioned: true + metadata: + kedro-viz: + layer: models +``` + +The visualisation now includes the layers: + +![](./images/pipeline_visualisation_with_layers.png) + +## Share a pipeline visualisation + +You can share a the pipeline structure within a Kedro-Viz visualisation as a JSON file from the terminal: + +```bash +kedro viz --save-file=my_shareable_pipeline.json +``` + +This command will save a visualisation of the `__default__` pipeline as a JSON file called `my_shareable_pipeline.json`. It doesn't share data, such as that in the code panel, nor can you share images or charts. 
+ +To visualise the shared file, type the following to load it from the terminal: + +```bash +kedro viz --load-file=my_shareable_pipeline.json +``` diff --git a/docs/source/preview_datasets.md b/docs/source/preview_datasets.md new file mode 100644 index 0000000000..26958703f2 --- /dev/null +++ b/docs/source/preview_datasets.md @@ -0,0 +1,81 @@ +# Preview data in Kedro-Viz + +This page describes how to preview data from different datasets in a Kedro project with Kedro-Viz. Dataset preview was introduced in Kedro-Viz version 6.3.0, which offers preview for `CSVDatasets` and `ExcelDatasets`. + +We use the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) to demonstrate how to add data preview for the `customer`, `shuttle` and `reviews` datasets. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](https://docs.kedro.org/en/stable/get_started/install.html). + +Then, in your terminal window, navigate to the folder you want to store the project. + +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. 
+ +## Configure the Data Catalog + +Kedro-Viz version 6.3.0 currently supports preview of two types of datasets: + +* `pandas.CSVDataset` +* `pandas.ExcelDataset` + + +To enable dataset preview, add the `preview_args` attribute to the kedro-viz configuration under the `metadata` section in the Data Catalog. Within preview_args, specify `nrows` as the number of rows to preview for the dataset. + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 5 + +reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 10 + +shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 15 +``` + + + +## Previewing Data on Kedro-viz + +After you've configured the Data Catalog, you can preview the datasets on Kedro-Viz. Start Kedro-Viz by running the following command in your terminal: + +```bash +kedro viz +``` + +The previews are shown as follows: + +Click on each dataset node to see a small preview in the metadata panel: + + +![](./images/preview_datasets_metadata.png) + + +View the larger preview of the dataset by clicking the `Expand Preview Table` button on the bottom of the metadata panel. + + +![](./images/preview_datasets_expanded.png) diff --git a/docs/source/robots.txt b/docs/source/robots.txt new file mode 100644 index 0000000000..1f53798bb4 --- /dev/null +++ b/docs/source/robots.txt @@ -0,0 +1,2 @@ +User-agent: * +Disallow: / diff --git a/docs/source/share_kedro_viz.md b/docs/source/share_kedro_viz.md new file mode 100644 index 0000000000..75d3657982 --- /dev/null +++ b/docs/source/share_kedro_viz.md @@ -0,0 +1,71 @@ +# Publish and share Kedro-Viz + +```{note} +Kedro-Viz sharing was introduced in version 6.6.0. +``` + +This page describes how to publish Kedro-Viz so you can share it with others. 
It uses the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) as an example. + +If you haven't installed Kedro [follow the documentation to get set up](https://docs.kedro.org/en/stable/get_started/install.html). In your terminal window, navigate to the folder you want to store the project. + +If you have not yet worked through the tutorial, use the [Kedro starter for spaceflights](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate the project with working code in place. Type the following in your terminal: + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. + +## Update and install the dependencies + +Kedro-Viz requires specific minimum versions of `fsspec[s3]`, and `kedro` to publish your project. + +You can ensure you have these correct versions by updating the `requirements.txt` file in the `src` folder of the Kedro project to the following: + +```text +fsspec[s3]>=2023.9.0 +kedro>=0.18.2 +``` + +Install the dependencies from the project root directory by typing the following in your terminal: + +```bash +pip install -r src/requirements.txt +``` + +## Configure your AWS S3 bucket and set credentials + +You can host your Kedro-Viz project on Amazon S3. You must first create an S3 bucket and then enable static website hosting. To do so, follow the [AWS tutorial](https://docs.aws.amazon.com/AmazonS3/latest/userguide/HostingWebsiteOnS3Setup.html) to configure a static website on Amazon S3. 
+ +Once that's completed, you'll need to set your AWS credentials as environment variables in your terminal window, as shown below: + +```bash +export AWS_ACCESS_KEY_ID="your_access_key_id" +export AWS_SECRET_ACCESS_KEY="your_secret_access_key" +``` + +For more information, see the official AWS documentation about [how to work with credentials](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html). + +## Publish and share the project + +You're now ready to publish and share your Kedro-Viz project. Start Kedro-Viz by running the following command in your terminal: + +```bash +kedro viz +``` + +Click the **Publish and share** icon in the lower-left of the application. You will see a modal dialog to select your relevant AWS Bucket Region and enter your Bucket Name. + +Once those two details are complete, click **Publish**. A hosted, shareable URL will be returned to you after the process completes. + +Here's an example of the flow: + +![](./images/kedro-publish-share.gif) + +## Permissions and access control + +All permissions and access control are controlled by AWS. It's up to you, the user, if you want to allow anyone to see your project or limit access to certain IP addresses, users, or groups. + +You can control who can view your visualisation using [bucket and user policies](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-iam-policies.html) or [access control lists](https://docs.aws.amazon.com/AmazonS3/latest/userguide/acls.html). See the official AWS documentation for more information. 
diff --git a/docs/source/visualise_charts_with_matplotlib.md b/docs/source/visualise_charts_with_matplotlib.md new file mode 100644 index 0000000000..7631d0c3b9 --- /dev/null +++ b/docs/source/visualise_charts_with_matplotlib.md @@ -0,0 +1,101 @@ +# Visualise charts in Kedro-Viz with Matplotlib + +This page describes how to output interactive visualisations of a Kedro project with Kedro-Viz, which supports integration with [Matplotlib](https://matplotlib.org/). You can view Matplotlib charts in Kedro-Viz when you use the MatplotLibWriter dataset. + + +```{note} +The MatplotlibWriter dataset converts Matplotlib objects to image files. This means that Matplotlib charts within Kedro-Viz are static and not interactive, unlike the [Plotly charts seen separately](./visualise_charts_with_plotly.md). +``` + +We use the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) and add a reporting pipeline. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](https://docs.kedro.org/en/stable/get_started/install.html). + +Then, in your terminal window, navigate to the folder you want to store the project. + +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. 
+ + +## Update the dependencies + +You must update the `src/requirements.txt` file in the Kedro project by adding the following dataset to enable Matplotlib for the project: + +```bash +kedro-datasets[matplotlib.MatplotlibWriter]~=1.1 +seaborn~=0.12.1 +``` + +## Configure the Data Catalog +You must also specify the output type in the `catalog.yml` file for the Data Catalog: + +```yaml +dummy_confusion_matrix: + type: matplotlib.MatplotlibWriter + filepath: data/08_reporting/dummy_confusion_matrix.png + versioned: true +``` + +## Add another node +Add the following to `src/spaceflights/pipelines/reporting/nodes.py`: + +```python +import matplotlib.pyplot as plt +import seaborn as sn + +... + + +def create_confusion_matrix(companies: pd.DataFrame): + actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1] + predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1] + data = {"y_Actual": actuals, "y_Predicted": predicted} + df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"]) + confusion_matrix = pd.crosstab( + df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"] + ) + sn.heatmap(confusion_matrix, annot=True) + return plt +``` + +## Update the pipeline + +Update `src/spaceflights/pipelines/reporting/pipeline.py` to add the following to `create_pipeline`: + +```python +from .nodes import create_confusion_matrix + +... + + +def create_pipeline(**kwargs) -> Pipeline: + """This is a simple pipeline which generates a plot""" + return pipeline( + [ + node( + func=create_confusion_matrix, + inputs="companies", + outputs="dummy_confusion_matrix", + ), + ] + ) +``` + +## Run the pipeline + +Run the pipelines with `kedro run` and then visualise the result with `kedro viz`. + +Click to see a small preview of the Matplotlib image in the metadata panel. + +![](./images/pipeline_visualisation_matplotlib.png) + +View the larger visualisation of the chart by clicking the 'Expand Matplotlib Image' button on the bottom of the metadata panel. 
+ +![](./images/pipeline_visualisation_matplotlib_expand.png) diff --git a/docs/source/visualise_charts_with_plotly.md b/docs/source/visualise_charts_with_plotly.md new file mode 100644 index 0000000000..e9297aa3c8 --- /dev/null +++ b/docs/source/visualise_charts_with_plotly.md @@ -0,0 +1,165 @@ +# Visualise charts in Kedro-Viz with Plotly + +This page describes how to make interactive visualisations of a Kedro project with Kedro-Viz, which supports integration with [Plotly](https://plotly.com/python/). + +We use the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) and add a reporting pipeline that uses Plotly. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](https://docs.kedro.org/en/stable/get_started/install.html). + +Then, in your terminal window, navigate to the folder you want to store the project. + +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. 
+ +## Update the dependencies + +There are two types of Plotly datasets supported by Kedro: + +* `plotly.PlotlyDataSet` which only supports [Plotly Express](https://plotly.com/python/plotly-express) +* `plotly.JSONDataSet` which supports Plotly Express and [Plotly Graph Objects](https://plotly.com/python/graph-objects/) + +To use the Plotly datasets, you must update the `requirements.txt` file in the `src` folder of the Kedro project to add the following dependencies: + + +```text +kedro-datasets[pandas.CSVDataSet, pandas.ExcelDataSet, pandas.ParquetDataSet]~=1.1 +kedro-datasets[plotly.PlotlyDataSet, plotly.JSONDataSet]~=1.1 +``` + +Navigate to the root directory of the project in your terminal and install the dependencies for the tutorial project: + +```bash +pip install -r src/requirements.txt +``` + +## Configure the Data Catalog + +To use the datasets, add them to the Data Catalog by updating `conf/base/catalog.yml`: + +```yaml +shuttle_passenger_capacity_plot_exp: + type: plotly.PlotlyDataSet + filepath: data/08_reporting/shuttle_passenger_capacity_plot_exp.json + versioned: true + plotly_args: + type: bar + fig: + x: shuttle_type + y: passenger_capacity + orientation: h + layout: + xaxis_title: Shuttles + yaxis_title: Average passenger capacity + title: Shuttle Passenger capacity + +shuttle_passenger_capacity_plot_go: + type: plotly.JSONDataSet + filepath: data/08_reporting/shuttle_passenger_capacity_plot_go.json + versioned: true +``` + + +## Create the template reporting pipeline + +In the terminal, run the following command to generate a template for the reporting pipeline: + +```bash +kedro pipeline create reporting +``` + +### Add the Plotly reporting nodes + +Add the following to `src/spaceflights/pipelines/reporting/nodes.py`: + +```python +import plotly.express as px +import plotly.graph_objs as go +import pandas as pd + +# This function uses plotly.express +def compare_passenger_capacity_exp(preprocessed_shuttles: pd.DataFrame): + return ( + 
preprocessed_shuttles.groupby(["shuttle_type"]) + .mean(numeric_only=True) + .reset_index() + ) + + +# This function uses plotly.graph_objects +def compare_passenger_capacity_go(preprocessed_shuttles: pd.DataFrame): + + data_frame = ( + preprocessed_shuttles.groupby(["shuttle_type"]) + .mean(numeric_only=True) + .reset_index() + ) + fig = go.Figure( + [ + go.Bar( + x=data_frame["shuttle_type"], + y=data_frame["passenger_capacity"], + ) + ] + ) + + return fig +``` + +### Update the reporting pipeline code + +Update `src/spaceflights/pipelines/reporting/pipeline.py` to replace the existing code with the following: + +```python +from kedro.pipeline import Pipeline, node, pipeline +from .nodes import compare_passenger_capacity_exp, compare_passenger_capacity_go + + +def create_pipeline(**kwargs) -> Pipeline: + """This is a simple pipeline which generates a pair of plots""" + return pipeline( + [ + node( + func=compare_passenger_capacity_exp, + inputs="preprocessed_shuttles", + outputs="shuttle_passenger_capacity_plot_exp", + ), + node( + func=compare_passenger_capacity_go, + inputs="preprocessed_shuttles", + outputs="shuttle_passenger_capacity_plot_go", + ), + ] + ) +``` + + +## Run the pipeline + +Now run the pipelines: + +```bash +kedro run +``` + +Then visualise with `kedro viz` + +The generated charts are shown as follows: + +![](./images/chart-icon.png). + +Click on each of see a small preview in the metadata panel: + +![](./images/pipeline_visualisation_plotly_1.png) + +View the larger visualisation of the chart by clicking the 'Expand Plotly Visualisation' button on the bottom of the metadata panel. 
+ +![](./images/pipeline_visualisation_plotly_expand_1.png) + + diff --git a/package/setup.py b/package/setup.py index 5f5c3185ef..dc1154448e 100644 --- a/package/setup.py +++ b/package/setup.py @@ -50,4 +50,13 @@ "kedro-dataset-stats = kedro_viz.integrations.kedro.hooks:dataset_stats_hook" ], }, + extras_require={ + "docs": [ + "sphinx~=5.3.0", + "sphinx_copybutton==0.3.1", + "sphinx-notfound-page", + "sphinx_rtd_theme==1.2.0", + "myst-parser~=1.0.0", + ], + }, )